@@ -20,4 +20,13 @@ auxiliary-submitters: | |||
- Zhenfei Zhang | |||
implementations: | |||
- name: clean | |||
version: 20190920 | |||
version: supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/6f6f4227/falcon | |||
- name: avx2 | |||
version: supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/6f6f4227/falcon | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- avx2 |
@@ -0,0 +1,24 @@ | |||
\ | |||
MIT License | |||
Copyright (c) 2017-2019 Falcon Project | |||
Permission is hereby granted, free of charge, to any person obtaining | |||
a copy of this software and associated documentation files (the | |||
"Software"), to deal in the Software without restriction, including | |||
without limitation the rights to use, copy, modify, merge, publish, | |||
distribute, sublicense, and/or sell copies of the Software, and to | |||
permit persons to whom the Software is furnished to do so, subject to | |||
the following conditions: | |||
The above copyright notice and this permission notice shall be | |||
included in all copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
@@ -0,0 +1,24 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libfalcon1024_avx2.a | |||
SOURCES = codec.c common.c fft.c fpr.c keygen.c pqclean.c rng.c sign.c vrfy.c | |||
OBJECTS = codec.o common.o fft.o fpr.o keygen.o pqclean.o rng.o sign.o vrfy.o | |||
HEADERS = api.h fpr.h inner.h | |||
CFLAGS=-O3 -Wconversion -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
all: $(LIB) | |||
%.o: %.s $(HEADERS) | |||
$(AS) -o $@ $< | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
clean: | |||
$(RM) $(OBJECTS) | |||
$(RM) $(LIB) |
@@ -0,0 +1,80 @@ | |||
#ifndef PQCLEAN_FALCON1024_AVX2_API_H | |||
#define PQCLEAN_FALCON1024_AVX2_API_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES 2305 | |||
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES 1793 | |||
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES 1330 | |||
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_ALGNAME "Falcon-1024" | |||
/* | |||
* Generate a new key pair. Public key goes into pk[], private key in sk[]. | |||
* Key sizes are exact (in bytes): | |||
* public (pk): PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES | |||
* private (sk): PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON1024_AVX2_crypto_sign_keypair( | |||
uint8_t *pk, uint8_t *sk); | |||
/* | |||
* Compute a signature on a provided message (m, mlen), with a given | |||
* private key (sk). Signature is written in sig[], with length written | |||
* into *siglen. Signature length is variable; maximum signature length | |||
* (in bytes) is PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES. | |||
* | |||
* sig[], m[] and sk[] may overlap each other arbitrarily. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON1024_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* | |||
* Verify a signature (sig, siglen) on a message (m, mlen) with a given | |||
* public key (pk). | |||
* | |||
* sig[], m[] and pk[] may overlap each other arbitrarily. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON1024_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk); | |||
/* | |||
* Compute a signature on a message and pack the signature and message | |||
* into a single object, written into sm[]. The length of that output is | |||
* written in *smlen; that length may be larger than the message length | |||
* (mlen) by up to PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES. | |||
* | |||
* sm[] and m[] may overlap each other arbitrarily; however, sm[] shall | |||
* not overlap with sk[]. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON1024_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* | |||
* Open a signed message object (sm, smlen) and verify the signature; | |||
* on success, the message itself is written into m[] and its length | |||
* into *mlen. The message is shorter than the signed message object, | |||
* but the size difference depends on the signature value; the difference | |||
* may range up to PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES. | |||
* | |||
* m[], sm[] and pk[] may overlap each other arbitrarily. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON1024_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk); | |||
#endif |
@@ -0,0 +1,555 @@ | |||
#include "inner.h" | |||
/* | |||
* Encoding/decoding of keys and signatures. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* see inner.h */ | |||
/*
 * Pack 2^logn integers into out[], using 14 bits per integer, most
 * significant bit first. Every input value must be lower than 12289.
 *
 * If out is NULL, nothing is written and the required output length
 * (in bytes) is returned. Otherwise the encoded length is returned.
 * The function returns 0 if a source value is out of range, or if the
 * encoding does not fit in max_out_len bytes.
 */
size_t
PQCLEAN_FALCON1024_AVX2_modq_encode(
    void *out, size_t max_out_len,
    const uint16_t *x, unsigned logn) {
    const size_t count = (size_t)1 << logn;
    const size_t need = ((count * 14) + 7) >> 3;
    uint8_t *dst;
    uint32_t window = 0;
    int pending = 0;
    size_t i;

    /* Range check comes first: it applies even in "length only" mode. */
    for (i = 0; i < count; i++) {
        if (x[i] >= 12289) {
            return 0;
        }
    }
    if (out == NULL) {
        return need;
    }
    if (need > max_out_len) {
        return 0;
    }

    dst = out;
    for (i = 0; i < count; i++) {
        window = (window << 14) | x[i];
        pending += 14;
        /* Emit every complete byte currently held in the window. */
        while (pending >= 8) {
            pending -= 8;
            *dst++ = (uint8_t)(window >> pending);
        }
    }
    /* Left-align leftover bits in a final byte; padding bits are zero. */
    if (pending > 0) {
        *dst = (uint8_t)(window << (8 - pending));
    }
    return need;
}
/* see inner.h */ | |||
/*
 * Unpack 2^logn integers from in[], 14 bits per integer, most
 * significant bit first, into x[].
 *
 * Returns the number of bytes read, or 0 on error. Errors are: fewer
 * than the required number of bytes available (max_in_len), a decoded
 * value of 12289 or more, or nonzero padding bits after the last value.
 */
size_t
PQCLEAN_FALCON1024_AVX2_modq_decode(
    uint16_t *x, unsigned logn,
    const void *in, size_t max_in_len) {
    const size_t count = (size_t)1 << logn;
    const size_t need = ((count * 14) + 7) >> 3;
    const uint8_t *src = in;
    uint32_t window = 0;
    int pending = 0;
    size_t filled = 0;

    if (need > max_in_len) {
        return 0;
    }
    while (filled < count) {
        unsigned value;

        window = (window << 8) | (*src++);
        pending += 8;
        if (pending < 14) {
            /* Not enough bits yet for a full value. */
            continue;
        }
        pending -= 14;
        value = (window >> pending) & 0x3FFF;
        if (value >= 12289) {
            return 0;
        }
        x[filled++] = (uint16_t)value;
    }
    /* Bits left over in the last byte are padding and must be zero. */
    if ((window & (((uint32_t)1 << pending) - 1)) != 0) {
        return 0;
    }
    return need;
}
/* see inner.h */ | |||
/*
 * Pack 2^logn signed 16-bit integers into out[], keeping only the low
 * 'bits' bits of each (two's complement), most significant bit first.
 * Every value must lie in -(2^(bits-1) - 1) .. +(2^(bits-1) - 1).
 *
 * If out is NULL, the required output length is returned and nothing
 * is written. Returns 0 if a value is out of range or if the encoding
 * does not fit in max_out_len bytes; otherwise the encoded length.
 *
 * NOTE(review): 'bits' is assumed to be at least 1 (the code shifts by
 * bits - 1) - confirm against callers.
 */
size_t
PQCLEAN_FALCON1024_AVX2_trim_i16_encode(
    void *out, size_t max_out_len,
    const int16_t *x, unsigned logn, unsigned bits) {
    const size_t count = (size_t)1 << logn;
    const int hi = (1 << (bits - 1)) - 1;
    const int lo = -hi;
    const size_t need = ((count * bits) + 7) >> 3;
    const uint32_t keep = ((uint32_t)1 << bits) - 1;
    uint8_t *dst;
    uint32_t window = 0;
    unsigned pending = 0;
    size_t i;

    for (i = 0; i < count; i++) {
        if (x[i] < lo || x[i] > hi) {
            return 0;
        }
    }
    if (out == NULL) {
        return need;
    }
    if (need > max_out_len) {
        return 0;
    }

    dst = out;
    for (i = 0; i < count; i++) {
        window = (window << bits) | ((uint16_t)x[i] & keep);
        pending += bits;
        while (pending >= 8) {
            pending -= 8;
            *dst++ = (uint8_t)(window >> pending);
        }
    }
    /* Flush the partial last byte, zero-padded on the right. */
    if (pending > 0) {
        *dst++ = (uint8_t)(window << (8 - pending));
    }
    return need;
}
/* see inner.h */ | |||
/*
 * Unpack 2^logn signed integers from in[], 'bits' bits per value
 * (two's complement, most significant bit first), into x[].
 *
 * Returns the number of bytes read, or 0 on error. Errors are: fewer
 * than the required number of bytes available (max_in_len), a value
 * equal to -2^(bits-1) (forbidden), or nonzero padding bits in the
 * last byte.
 *
 * NOTE(review): 'bits' is assumed to be at least 1 (the code shifts by
 * bits - 1) - confirm against callers.
 */
size_t
PQCLEAN_FALCON1024_AVX2_trim_i16_decode(
    int16_t *x, unsigned logn, unsigned bits,
    const void *in, size_t max_in_len) {
    size_t n, in_len;
    const uint8_t *buf;
    size_t u;
    uint32_t acc, mask1, mask2;
    unsigned acc_len;

    n = (size_t)1 << logn;
    in_len = ((n * bits) + 7) >> 3;
    if (in_len > max_in_len) {
        return 0;
    }
    buf = in;
    u = 0;
    acc = 0;
    acc_len = 0;
    mask1 = ((uint32_t)1 << bits) - 1;      /* selects one encoded value */
    mask2 = (uint32_t)1 << (bits - 1);      /* sign bit of an encoded value */
    while (u < n) {
        acc = (acc << 8) | *buf ++;
        acc_len += 8;
        while (acc_len >= bits && u < n) {
            uint32_t w;

            acc_len -= bits;
            w = (acc >> acc_len) & mask1;
            /*
             * Sign-extend to 32 bits: if the sign bit is set, all
             * higher bits get set as well. This is done once; the
             * operation is idempotent, so repeating it is useless.
             */
            w |= -(w & mask2);
            if (w == -mask2) {
                /*
                 * The -2^(bits-1) value is forbidden.
                 */
                return 0;
            }
            x[u ++] = (int16_t) * (int32_t *)&w;
        }
    }
    if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
        /*
         * Extra bits in the last byte must be zero.
         */
        return 0;
    }
    return in_len;
}
/* see inner.h */ | |||
/*
 * Pack 2^logn signed 8-bit integers into out[], keeping only the low
 * 'bits' bits of each (two's complement), most significant bit first.
 * Every value must lie in -(2^(bits-1) - 1) .. +(2^(bits-1) - 1).
 *
 * If out is NULL, the required output length is returned and nothing
 * is written. Returns 0 if a value is out of range or if the encoding
 * does not fit in max_out_len bytes; otherwise the encoded length.
 *
 * NOTE(review): 'bits' is assumed to be at least 1 (the code shifts by
 * bits - 1) - confirm against callers.
 */
size_t
PQCLEAN_FALCON1024_AVX2_trim_i8_encode(
    void *out, size_t max_out_len,
    const int8_t *x, unsigned logn, unsigned bits) {
    const size_t count = (size_t)1 << logn;
    const int hi = (1 << (bits - 1)) - 1;
    const int lo = -hi;
    const size_t need = ((count * bits) + 7) >> 3;
    const uint32_t keep = ((uint32_t)1 << bits) - 1;
    uint8_t *dst;
    uint32_t window = 0;
    unsigned pending = 0;
    size_t i;

    for (i = 0; i < count; i++) {
        if (x[i] < lo || x[i] > hi) {
            return 0;
        }
    }
    if (out == NULL) {
        return need;
    }
    if (need > max_out_len) {
        return 0;
    }

    dst = out;
    for (i = 0; i < count; i++) {
        window = (window << bits) | ((uint8_t)x[i] & keep);
        pending += bits;
        while (pending >= 8) {
            pending -= 8;
            *dst++ = (uint8_t)(window >> pending);
        }
    }
    /* Flush the partial last byte, zero-padded on the right. */
    if (pending > 0) {
        *dst++ = (uint8_t)(window << (8 - pending));
    }
    return need;
}
/* see inner.h */ | |||
/*
 * Unpack 2^logn signed integers from in[], 'bits' bits per value
 * (two's complement, most significant bit first), into the int8_t
 * array x[].
 *
 * Returns the number of bytes read, or 0 on error. Errors are: fewer
 * than the required number of bytes available (max_in_len), a value
 * equal to -2^(bits-1) (forbidden), or nonzero padding bits in the
 * last byte.
 *
 * NOTE(review): 'bits' is assumed to be at least 1 (the code shifts by
 * bits - 1) - confirm against callers.
 */
size_t
PQCLEAN_FALCON1024_AVX2_trim_i8_decode(
    int8_t *x, unsigned logn, unsigned bits,
    const void *in, size_t max_in_len) {
    const size_t count = (size_t)1 << logn;
    const size_t need = ((count * bits) + 7) >> 3;
    const uint32_t value_mask = ((uint32_t)1 << bits) - 1;
    const uint32_t sign_bit = (uint32_t)1 << (bits - 1);
    const uint8_t *src = in;
    uint32_t window = 0;
    unsigned pending = 0;
    size_t filled = 0;

    if (need > max_in_len) {
        return 0;
    }
    while (filled < count) {
        window = (window << 8) | *src++;
        pending += 8;
        while (filled < count && pending >= bits) {
            uint32_t w;

            pending -= bits;
            w = (window >> pending) & value_mask;
            /* Sign-extend the 'bits'-wide value to 32 bits. */
            w |= -(w & sign_bit);
            if (w == -sign_bit) {
                /* The most negative value, -2^(bits-1), is not allowed. */
                return 0;
            }
            x[filled++] = (int8_t) * (int32_t *)&w;
        }
    }
    /* Whatever remains in the last byte is padding and must be zero. */
    if ((window & (((uint32_t)1 << pending) - 1)) != 0) {
        return 0;
    }
    return need;
}
/* see inner.h */ | |||
/*
 * Encode 2^logn signed integers with the variable-length "compressed"
 * format. Each value yields, in order: one sign bit (1 = negative),
 * the low 7 bits of the absolute value, then (|value| >> 7) zero bits
 * followed by a single one bit. Bits are written most significant
 * first; the last byte is padded with zeros.
 *
 * All values must be in -2047..+2047, otherwise 0 is returned.
 * If out is NULL, nothing is written and the encoded length is
 * returned. Otherwise, 0 is returned if the encoding needs more than
 * max_out_len bytes (bytes before the overflow may already have been
 * written); on success the encoded length in bytes is returned.
 */
size_t
PQCLEAN_FALCON1024_AVX2_comp_encode(
    void *out, size_t max_out_len,
    const int16_t *x, unsigned logn) {
    const size_t count = (size_t)1 << logn;
    uint8_t *dst = out;
    uint32_t window = 0;
    unsigned pending = 0;
    size_t written = 0;
    size_t i;

    /* Validate the whole input before producing anything. */
    for (i = 0; i < count; i++) {
        if (x[i] < -2047 || x[i] > +2047) {
            return 0;
        }
    }

    for (i = 0; i < count; i++) {
        int value = x[i];
        uint32_t sign = 0;
        unsigned magnitude, high;

        if (value < 0) {
            value = -value;
            sign = 1;
        }
        magnitude = (unsigned)value;
        high = magnitude >> 7;

        /* Sign bit plus the low 7 bits of the magnitude: 8 bits. */
        window = (window << 8) | (sign << 7) | (magnitude & 127u);
        pending += 8;

        /*
         * 'high' zeros and a terminating one. Since high <= 15, this
         * adds at most 16 bits; with the 8 bits above and at most 7
         * bits carried over, the 32-bit window cannot overflow.
         */
        window = (window << (high + 1)) | 1;
        pending += high + 1;

        /* Drain complete bytes. */
        while (pending >= 8) {
            pending -= 8;
            if (dst != NULL) {
                if (written >= max_out_len) {
                    return 0;
                }
                dst[written] = (uint8_t)(window >> pending);
            }
            written++;
        }
    }

    /* Final partial byte, if any, padded with zero bits. */
    if (pending > 0) {
        if (dst != NULL) {
            if (written >= max_out_len) {
                return 0;
            }
            dst[written] = (uint8_t)(window << (8 - pending));
        }
        written++;
    }
    return written;
}
/* see inner.h */ | |||
/*
 * Decode 2^logn signed integers from the variable-length "compressed"
 * format: for each value, one sign bit (1 = negative), the low 7 bits
 * of the absolute value, then a run of zero bits (each one adds 128 to
 * the absolute value) terminated by a one bit.
 *
 * Returns the number of bytes read from in[], or 0 on error. Errors
 * are: running past max_in_len, an absolute value above 2047, and
 * any non-canonical encoding. Two non-canonical forms are rejected so
 * that a given sequence of values has exactly one valid encoding:
 *   - "-0" (sign bit set with a zero absolute value);
 *   - nonzero unused bits in the last byte.
 * The encoder never produces either form.
 */
size_t
PQCLEAN_FALCON1024_AVX2_comp_decode(
    int16_t *x, unsigned logn,
    const void *in, size_t max_in_len) {
    const uint8_t *buf;
    size_t n, u, v;
    uint32_t acc;
    unsigned acc_len;

    n = (size_t)1 << logn;
    buf = in;
    acc = 0;
    acc_len = 0;
    v = 0;
    for (u = 0; u < n; u ++) {
        unsigned b, s, m;

        /*
         * Get next eight bits: sign and low seven bits of the
         * absolute value.
         */
        if (v >= max_in_len) {
            return 0;
        }
        acc = (acc << 8) | (uint32_t)buf[v ++];
        b = acc >> acc_len;
        s = b & 128;
        m = b & 127;

        /*
         * Get next bits until a 1 is reached.
         */
        for (;;) {
            if (acc_len == 0) {
                if (v >= max_in_len) {
                    return 0;
                }
                acc = (acc << 8) | (uint32_t)buf[v ++];
                acc_len = 8;
            }
            acc_len --;
            if (((acc >> acc_len) & 1) != 0) {
                break;
            }
            m += 128;
            if (m > 2047) {
                return 0;
            }
        }

        /*
         * "-0" is forbidden: zero must be encoded with a clear sign
         * bit, otherwise the same value would have two encodings.
         */
        if (s && m == 0) {
            return 0;
        }

        x[u] = (int16_t) m;
        if (s) {
            x[u] = (int16_t) - x[u];
        }
    }

    /*
     * Unused bits in the last byte must be zero (acc_len is the number
     * of bits of the last byte read that were not consumed).
     */
    if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
        return 0;
    }
    return v;
}
/* | |||
* Key elements and signatures are polynomials with small integer | |||
* coefficients. Here are some statistics gathered over many | |||
* generated key pairs (10000 or more for each degree): | |||
* | |||
* log(n) n max(f,g) std(f,g) max(F,G) std(F,G) | |||
* 1 2 129 56.31 143 60.02 | |||
* 2 4 123 40.93 160 46.52 | |||
* 3 8 97 28.97 159 38.01 | |||
* 4 16 100 21.48 154 32.50 | |||
* 5 32 71 15.41 151 29.36 | |||
* 6 64 59 11.07 138 27.77 | |||
* 7 128 39 7.91 144 27.00 | |||
* 8 256 32 5.63 148 26.61 | |||
* 9 512 22 4.00 137 26.46 | |||
* 10 1024 15 2.84 146 26.41 | |||
* | |||
* We want a compact storage format for private key, and, as part of | |||
* key generation, we are allowed to reject some keys which would | |||
* otherwise be fine (this does not induce any noticeable vulnerability | |||
* as long as we reject only a small proportion of possible keys). | |||
* Hence, we enforce at key generation time maximum values for the | |||
* elements of f, g, F and G, so that their encoding can be expressed | |||
* in fixed-width values. Limits have been chosen so that generated | |||
* keys are almost always within bounds, thus impacting neither | |||
* security nor performance. | |||
* | |||
* IMPORTANT: the code assumes that all coefficients of f, g, F and G | |||
* ultimately fit in the -127..+127 range. Thus, none of the elements | |||
* of max_fg_bits[] and max_FG_bits[] shall be greater than 8. | |||
*/ | |||
/*
 * Encoding width (in bits) for the coefficients of f and g, one entry
 * per log(n) value from 1 to 10 (entry 0 is unused). No entry may
 * exceed 8.
 */
const uint8_t PQCLEAN_FALCON1024_AVX2_max_fg_bits[] = {
    0,              /* unused */
    8, 8, 8, 8, 8,  /* log(n) = 1..5 */
    7, 7,           /* log(n) = 6..7 */
    6, 6,           /* log(n) = 8..9 */
    5               /* log(n) = 10 */
};
/*
 * Encoding width (in bits) for the coefficients of F and G, one entry
 * per log(n) value from 1 to 10 (entry 0 is unused). All degrees use
 * 8 bits.
 */
const uint8_t PQCLEAN_FALCON1024_AVX2_max_FG_bits[] = {
    0,              /* unused */
    8, 8, 8, 8, 8,  /* log(n) = 1..5 */
    8, 8, 8, 8, 8   /* log(n) = 6..10 */
};
/* | |||
* When generating a new key pair, we can always reject keys which | |||
* feature an abnormally large coefficient. This can also be done for | |||
* signatures, albeit with some care: in case the signature process is | |||
* used in a derandomized setup (explicitly seeded with the message and | |||
* private key), we have to follow the specification faithfully, and the | |||
* specification only enforces a limit on the L2 norm of the signature | |||
* vector. The limit on the L2 norm implies that the absolute value of | |||
* a coefficient of the signature cannot be more than the following: | |||
* | |||
* log(n) n max sig coeff (theoretical) | |||
* 1 2 412 | |||
* 2 4 583 | |||
* 3 8 824 | |||
* 4 16 1166 | |||
* 5 32 1649 | |||
* 6 64 2332 | |||
* 7 128 3299 | |||
* 8 256 4665 | |||
* 9 512 6598 | |||
* 10 1024 9331 | |||
* | |||
* However, the largest signature coefficient observed during our | |||
* experiments was 1077 (in absolute value), hence we can assume that, | |||
* with overwhelming probability, signature coefficients will fit | |||
* in -2047..2047, i.e. 12 bits. | |||
*/ | |||
/*
 * Width (in bits) assumed for signature coefficients, one entry per
 * log(n) value from 1 to 10 (entry 0 is unused).
 */
const uint8_t PQCLEAN_FALCON1024_AVX2_max_sig_bits[] = {
    0,                          /* unused */
    10,                         /* log(n) = 1 */
    11, 11,                     /* log(n) = 2..3 */
    12, 12, 12, 12, 12, 12, 12  /* log(n) = 4..10 */
};
@@ -0,0 +1,294 @@ | |||
#include "inner.h" | |||
/* | |||
* Support functions for signatures (hash-to-point, norm). | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON1024_AVX2_hash_to_point_vartime( | |||
inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn) { | |||
/* | |||
* This is the straightforward per-the-spec implementation. It | |||
* is not constant-time, thus it might reveal information on the | |||
* plaintext (at least, enough to check the plaintext against a | |||
* list of potential plaintexts) in a scenario where the | |||
* attacker does not have access to the signature value or to | |||
* the public key, but knows the nonce (without knowledge of the | |||
* nonce, the hashed output cannot be matched against potential | |||
* plaintexts). | |||
*/ | |||
size_t n; | |||
n = (size_t)1 << logn; | |||
while (n > 0) { | |||
uint8_t buf[2]; | |||
uint32_t w; | |||
inner_shake256_extract(sc, (void *)buf, sizeof buf); | |||
w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; | |||
if (w < 61445) { | |||
while (w >= 12289) { | |||
w -= 12289; | |||
} | |||
*x ++ = (uint16_t)w; | |||
n --; | |||
} | |||
} | |||
} | |||
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON1024_AVX2_hash_to_point_ct( | |||
inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn, uint8_t *tmp) { | |||
/* | |||
* Each 16-bit sample is a value in 0..65535. The value is | |||
* kept if it falls in 0..61444 (because 61445 = 5*12289) | |||
* and rejected otherwise; thus, each sample has probability | |||
* about 0.93758 of being selected. | |||
* | |||
* We want to oversample enough to be sure that we will | |||
* have enough values with probability at least 1 - 2^(-256). | |||
* Depending on degree N, this leads to the following | |||
* required oversampling: | |||
* | |||
* logn n oversampling | |||
* 1 2 65 | |||
* 2 4 67 | |||
* 3 8 71 | |||
* 4 16 77 | |||
* 5 32 86 | |||
* 6 64 100 | |||
* 7 128 122 | |||
* 8 256 154 | |||
* 9 512 205 | |||
* 10 1024 287 | |||
* | |||
* If logn >= 7, then the provided temporary buffer is large | |||
* enough. Otherwise, we use a stack buffer of 63 entries | |||
* (i.e. 126 bytes) for the values that do not fit in tmp[]. | |||
*/ | |||
static const uint16_t overtab[] = { | |||
0, /* unused */ | |||
65, | |||
67, | |||
71, | |||
77, | |||
86, | |||
100, | |||
122, | |||
154, | |||
205, | |||
287 | |||
}; | |||
unsigned n, n2, u, m, p, over; | |||
uint16_t *tt1, tt2[63]; | |||
/* | |||
* We first generate m 16-bit value. Values 0..n-1 go to x[]. | |||
* Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. | |||
* We also reduce modulo q the values; rejected values are set | |||
* to 0xFFFF. | |||
*/ | |||
n = 1U << logn; | |||
n2 = n << 1; | |||
over = overtab[logn]; | |||
m = n + over; | |||
tt1 = (uint16_t *)tmp; | |||
for (u = 0; u < m; u ++) { | |||
uint8_t buf[2]; | |||
uint32_t w, wr; | |||
inner_shake256_extract(sc, buf, sizeof buf); | |||
w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; | |||
wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); | |||
wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); | |||
wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); | |||
wr |= ((w - 61445) >> 31) - 1; | |||
if (u < n) { | |||
x[u] = (uint16_t)wr; | |||
} else if (u < n2) { | |||
tt1[u - n] = (uint16_t)wr; | |||
} else { | |||
tt2[u - n2] = (uint16_t)wr; | |||
} | |||
} | |||
/* | |||
* Now we must "squeeze out" the invalid values. We do this in | |||
* a logarithmic sequence of passes; each pass computes where a | |||
* value should go, and moves it down by 'p' slots if necessary, | |||
* where 'p' uses an increasing powers-of-two scale. It can be | |||
* shown that in all cases where the loop decides that a value | |||
* has to be moved down by p slots, the destination slot is | |||
* "free" (i.e. contains an invalid value). | |||
*/ | |||
for (p = 1; p <= over; p <<= 1) { | |||
unsigned v; | |||
/* | |||
* In the loop below: | |||
* | |||
* - v contains the index of the final destination of | |||
* the value; it is recomputed dynamically based on | |||
* whether values are valid or not. | |||
* | |||
* - u is the index of the value we consider ("source"); | |||
* its address is s. | |||
* | |||
* - The loop may swap the value with the one at index | |||
* u-p. The address of the swap destination is d. | |||
*/ | |||
v = 0; | |||
for (u = 0; u < m; u ++) { | |||
uint16_t *s, *d; | |||
unsigned j, sv, dv, mk; | |||
if (u < n) { | |||
s = &x[u]; | |||
} else if (u < n2) { | |||
s = &tt1[u - n]; | |||
} else { | |||
s = &tt2[u - n2]; | |||
} | |||
sv = *s; | |||
/* | |||
* The value in sv should ultimately go to | |||
* address v, i.e. jump back by u-v slots. | |||
*/ | |||
j = u - v; | |||
/* | |||
* We increment v for the next iteration, but | |||
* only if the source value is valid. The mask | |||
* 'mk' is -1 if the value is valid, 0 otherwise, | |||
* so we _subtract_ mk. | |||
*/ | |||
mk = (sv >> 15) - 1U; | |||
v -= mk; | |||
/* | |||
* In this loop we consider jumps by p slots; if | |||
* u < p then there is nothing more to do. | |||
*/ | |||
if (u < p) { | |||
continue; | |||
} | |||
/* | |||
* Destination for the swap: value at address u-p. | |||
*/ | |||
if ((u - p) < n) { | |||
d = &x[u - p]; | |||
} else if ((u - p) < n2) { | |||
d = &tt1[(u - p) - n]; | |||
} else { | |||
d = &tt2[(u - p) - n2]; | |||
} | |||
dv = *d; | |||
/* | |||
* The swap should be performed only if the source | |||
* is valid AND the jump j has its 'p' bit set. | |||
*/ | |||
mk &= -(((j & p) + 0x1FF) >> 9); | |||
*s = (uint16_t)(sv ^ (mk & (sv ^ dv))); | |||
*d = (uint16_t)(dv ^ (mk & (sv ^ dv))); | |||
} | |||
} | |||
} | |||
/* see inner.h */ | |||
/*
 * Tell whether the vector (s1, s2), made of two arrays of 2^logn
 * coefficients, is short enough. Returns 1 if the squared l2-norm is
 * strictly below the acceptance bound, 0 otherwise.
 *
 * Only 32-bit operations are used: the squared norm saturates to
 * 2^32-1 as soon as a partial sum has its top bit set (2^31 or more).
 */
int
PQCLEAN_FALCON1024_AVX2_is_short(
    const int16_t *s1, const int16_t *s2, unsigned logn) {
    const size_t count = (size_t)1 << logn;
    uint32_t sqnorm = 0;
    uint32_t seen = 0;      /* OR of all partial sums */
    size_t i;

    for (i = 0; i < count; i++) {
        int32_t a = s1[i];
        int32_t b = s2[i];

        sqnorm += (uint32_t)(a * a);
        seen |= sqnorm;
        sqnorm += (uint32_t)(b * b);
        seen |= sqnorm;
    }
    /* Saturate if any partial sum reached 2^31. */
    sqnorm |= -(seen >> 31);

    /*
     * Acceptance bound on the l2-norm is:
     *   1.2*1.55*sqrt(q)*sqrt(2*N)
     * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024); the bound is
     * scaled down for smaller degrees.
     */
    return sqnorm < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn));
}
/* see inner.h */ | |||
/*
 * Same test as the full shortness check, but the squared norm of the
 * first half is supplied by the caller in sqn; only the 2^logn
 * coefficients of s2 are accumulated here. Returns 1 if the total
 * squared norm is strictly below the acceptance bound, 0 otherwise.
 *
 * The sum saturates to 2^32-1 if sqn or any partial sum has its top
 * bit set.
 */
int
PQCLEAN_FALCON1024_AVX2_is_short_half(
    uint32_t sqn, const int16_t *s2, unsigned logn) {
    const size_t count = (size_t)1 << logn;
    uint32_t seen = -(sqn >> 31);   /* all ones if sqn is already >= 2^31 */
    size_t i;

    for (i = 0; i < count; i++) {
        int32_t c = s2[i];

        sqn += (uint32_t)(c * c);
        seen |= sqn;
    }
    sqn |= -(seen >> 31);

    /*
     * Acceptance bound on the l2-norm is:
     *   1.2*1.55*sqrt(q)*sqrt(2*N)
     * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024); the bound is
     * scaled down for smaller degrees.
     */
    return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn));
}
@@ -0,0 +1,349 @@ | |||
#ifndef PQCLEAN_FALCON1024_AVX2_FPR_H | |||
#define PQCLEAN_FALCON1024_AVX2_FPR_H | |||
/* | |||
* Floating-point operations. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* ====================================================================== */ | |||
#include <immintrin.h> | |||
#include <math.h> | |||
#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) | |||
#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) | |||
/* | |||
* We wrap the native 'double' type into a structure so that the C compiler | |||
* complains if we inadvertently use raw arithmetic operators on the 'fpr' | |||
* type instead of using the inline functions below. This should have no | |||
* extra runtime cost, since all the functions below are 'inline'. | |||
*/ | |||
/*
 * The native 'double' is boxed into a one-member structure so that
 * the compiler rejects raw arithmetic operators applied to an 'fpr'
 * by mistake; all operations go through the inline helpers.
 */
typedef struct {
    double v;
} fpr;

/* Box a double into an fpr. */
static inline fpr
FPR(double v) {
    const fpr boxed = { v };

    return boxed;
}

/* Convert a signed 64-bit integer to fpr (plain C conversion to double). */
static inline fpr
fpr_of(int64_t i) {
    const double as_double = (double)i;

    return FPR(as_double);
}
/*
 * Precomputed fpr constants. Each one is a double wrapped in the fpr
 * structure; the notes below only describe the literal values.
 */
/* 12289 (the modulus q) and its reciprocal. */
static const fpr fpr_q = { 12289.0 }; | |||
static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; | |||
/*
 * NOTE(review): the four "sigma" values are presumably parameters of the
 * Gaussian sampler (the _9 / _10 suffixes look like logn values); their
 * use is not visible in this part of the file - confirm.
 */
static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; | |||
static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; | |||
static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; | |||
static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; | |||
/* Natural logarithm of 2, and its reciprocal. */
static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; | |||
static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; | |||
/* NOTE(review): presumably an upper bound on a squared norm - confirm. */
static const fpr fpr_bnorm_max = { 16822.4121 }; | |||
/* Small exact values. */
static const fpr fpr_zero = { 0.0 }; | |||
static const fpr fpr_one = { 1.0 }; | |||
static const fpr fpr_two = { 2.0 }; | |||
static const fpr fpr_onehalf = { 0.5 }; | |||
/* 1/sqrt(2) and 1/sqrt(8). */
static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; | |||
static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; | |||
/* 2^31, 2^31 - 1 and -(2^31 - 1). */
static const fpr fpr_ptwo31 = { 2147483648.0 }; | |||
static const fpr fpr_ptwo31m1 = { 2147483647.0 }; | |||
static const fpr fpr_mtwo31m1 = { -2147483647.0 }; | |||
/*
 * Literals for 2^63 - 1, -(2^63 - 1) and 2^63.
 * NOTE(review): 2^63 - 1 needs 63 significant bits; assuming 'double' is
 * IEEE-754 binary64 (53-bit significand), the two "m1" literals round to
 * +/- 2^63 - confirm this is what the users of these constants expect.
 */
static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; | |||
static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; | |||
static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; | |||
static inline int64_t | |||
fpr_rint(fpr x) { | |||
/* | |||
* We do not want to use llrint() since it might be not | |||
* constant-time. | |||
* | |||
* Suppose that x >= 0. If x >= 2^52, then it is already an | |||
* integer. Otherwise, if x < 2^52, then computing x+2^52 will | |||
* yield a value that will be rounded to the nearest integer | |||
* with exactly the right rules (round-to-nearest-even). | |||
* | |||
* In order to have constant-time processing, we must do the | |||
* computation for both x >= 0 and x < 0 cases, and use a | |||
* cast to an integer to access the sign and select the proper | |||
* value. Such casts also allow us to find out if |x| < 2^52. | |||
*/ | |||
int64_t sx, tx, rp, rn, m; | |||
uint32_t ub; | |||
sx = (int64_t)(x.v - 1.0); | |||
tx = (int64_t)x.v; | |||
rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; | |||
rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; | |||
/* | |||
* If tx >= 2^52 or tx < -2^52, then result is tx. | |||
* Otherwise, if sx >= 0, then result is rp. | |||
* Otherwise, result is rn. We use the fact that when x is | |||
* close to 0 (|x| <= 0.25) then both rp and rn are correct; | |||
* and if x is not close to 0, then trunc(x-1.0) yields the | |||
* appropriate sign. | |||
*/ | |||
/* | |||
* Clamp rp to zero if tx < 0. | |||
* Clamp rn to zero if tx >= 0. | |||
*/ | |||
m = sx >> 63; | |||
rn &= m; | |||
rp &= ~m; | |||
/* | |||
* Get the 12 upper bits of tx; if they are not all zeros or | |||
* all ones, then tx >= 2^52 or tx < -2^52, and we clamp both | |||
* rp and rn to zero. Otherwise, we clamp tx to zero. | |||
*/ | |||
ub = (uint32_t)((uint64_t)tx >> 52); | |||
m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); | |||
rp &= m; | |||
rn &= m; | |||
tx &= ~m; | |||
/* | |||
* Only one of tx, rn or rp (at most) can be non-zero at this | |||
* point. | |||
*/ | |||
return tx | rn | rp; | |||
} | |||
static inline int64_t | |||
fpr_floor(fpr x) { | |||
int64_t r; | |||
/* | |||
* The cast performs a trunc() (rounding toward 0) and thus is | |||
* wrong by 1 for most negative values. The correction below is | |||
* constant-time as long as the compiler turns the | |||
* floating-point conversion result into a 0/1 integer without a | |||
* conditional branch or another non-constant-time construction. | |||
* This should hold on all modern architectures with an FPU (and | |||
* if it is false on a given arch, then chances are that the FPU | |||
* itself is not constant-time, making the point moot). | |||
*/ | |||
r = (int64_t)x.v; | |||
return r - (x.v < (double)r); | |||
} | |||
static inline int64_t | |||
fpr_trunc(fpr x) { | |||
return (int64_t)x.v; | |||
} | |||
static inline fpr | |||
fpr_add(fpr x, fpr y) { | |||
return FPR(x.v + y.v); | |||
} | |||
static inline fpr | |||
fpr_sub(fpr x, fpr y) { | |||
return FPR(x.v - y.v); | |||
} | |||
static inline fpr | |||
fpr_neg(fpr x) { | |||
return FPR(-x.v); | |||
} | |||
static inline fpr | |||
fpr_half(fpr x) { | |||
return FPR(x.v * 0.5); | |||
} | |||
static inline fpr | |||
fpr_double(fpr x) { | |||
return FPR(x.v + x.v); | |||
} | |||
static inline fpr | |||
fpr_mul(fpr x, fpr y) { | |||
return FPR(x.v * y.v); | |||
} | |||
static inline fpr | |||
fpr_sqr(fpr x) { | |||
return FPR(x.v * x.v); | |||
} | |||
static inline fpr | |||
fpr_inv(fpr x) { | |||
return FPR(1.0 / x.v); | |||
} | |||
static inline fpr | |||
fpr_div(fpr x, fpr y) { | |||
return FPR(x.v / y.v); | |||
} | |||
/*
 * Replace *t with its square root, in place.
 *
 * The value is broadcast into both lanes of a 128-bit vector, the
 * packed square root is taken, and the low lane is written back.
 */
static inline void
fpr_sqrt_avx2(double *t) {
    __m128d root = _mm_sqrt_pd(_mm_load1_pd(t));

    _mm_storel_pd(t, root);
}
static inline fpr | |||
fpr_sqrt(fpr x) { | |||
/* | |||
* We prefer not to have a dependency on libm when it can be | |||
* avoided. On x86, calling the sqrt() libm function inlines | |||
* the relevant opcode (fsqrt or sqrtsd, depending on whether | |||
* the 387 FPU or SSE2 is used for floating-point operations) | |||
* but then makes an optional call to the library function | |||
* for proper error handling, in case the operand is negative. | |||
* | |||
* To avoid this dependency, we use intrinsics or inline assembly | |||
* on recognized platforms: | |||
* | |||
* - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. | |||
* | |||
* - On GCC/Clang with SSE maths, we use SSE2 intrinsics. | |||
* | |||
* - On GCC/Clang on i386, or MSVC on i386, we use inline assembly | |||
* to call the 387 FPU fsqrt opcode. | |||
* | |||
* - On GCC/Clang/XLC on PowerPC, we use inline assembly to call | |||
* the fsqrt opcode (Clang needs a special hack). | |||
* | |||
* - On GCC/Clang on ARM with hardware floating-point, we use | |||
* inline assembly to call the vqsrt.f64 opcode. Due to a | |||
* complex ecosystem of compilers and assembly syntaxes, we | |||
* have to call it "fsqrt" or "fsqrtd", depending on case. | |||
* | |||
* If the platform is not recognized, a call to the system | |||
* library function sqrt() is performed. On some compilers, this | |||
* may actually inline the relevant opcode, and call the library | |||
* function only when the input is invalid (e.g. negative); | |||
* Falcon never actually calls sqrt() on a negative value, but | |||
* the dependency to libm will still be there. | |||
*/ | |||
fpr_sqrt_avx2(&x.v); | |||
return x; | |||
} | |||
static inline int | |||
fpr_lt(fpr x, fpr y) { | |||
return x.v < y.v; | |||
} | |||
static inline uint64_t | |||
fpr_expm_p63(fpr x, fpr ccs) { | |||
/* | |||
* Polynomial approximation of exp(-x) is taken from FACCT: | |||
* https://eprint.iacr.org/2018/1234 | |||
* Specifically, values are extracted from the implementation | |||
* referenced from the FACCT article, and available at: | |||
* https://github.com/raykzhao/gaussian | |||
* Tests over more than 24 billions of random inputs in the | |||
* 0..log(2) range have never shown a deviation larger than | |||
* 2^(-50) from the true mathematical value. | |||
*/ | |||
/* | |||
* AVX2 implementation uses more operations than Horner's method, | |||
* but with a lower expression tree depth. This helps because | |||
* additions and multiplications have a latency of 4 cycles on | |||
* a Skylake, but the CPU can issue two of them per cycle. | |||
*/ | |||
static const union { | |||
double d[12]; | |||
__m256d v[3]; | |||
} c = { | |||
{ | |||
0.999999999999994892974086724280, | |||
0.500000000000019206858326015208, | |||
0.166666666666984014666397229121, | |||
0.041666666666110491190622155955, | |||
0.008333333327800835146903501993, | |||
0.001388888894063186997887560103, | |||
0.000198412739277311890541063977, | |||
0.000024801566833585381209939524, | |||
0.000002755586350219122514855659, | |||
0.000000275607356160477811864927, | |||
0.000000025299506379442070029551, | |||
0.000000002073772366009083061987 | |||
} | |||
}; | |||
double d1, d2, d4, d8, y; | |||
__m256d d14, d58, d9c; | |||
d1 = -x.v; | |||
d2 = d1 * d1; | |||
d4 = d2 * d2; | |||
d8 = d4 * d4; | |||
d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); | |||
d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); | |||
d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); | |||
d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); | |||
d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); | |||
d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); | |||
d9c = _mm256_hadd_pd(d9c, d9c); | |||
y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) | |||
+ _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); | |||
y *= ccs.v; | |||
/* | |||
* Final conversion goes through int64_t first, because that's what | |||
* the underlying opcode (vcvttsd2si) will do, and we know that the | |||
* result will fit, since x >= 0 and ccs < 1. If we did the | |||
* conversion directly to uint64_t, then the compiler would add some | |||
* extra code to cover the case of a source value of 2^63 or more, | |||
* and though the alternate path would never be exercised, the | |||
* extra comparison would cost us some cycles. | |||
*/ | |||
return (uint64_t)(int64_t)(y * fpr_ptwo63.v); | |||
} | |||
/*
 * Constant tables, defined in another translation unit. The short
 * names used throughout the code are mapped onto prefixed external
 * symbols.
 */
#define fpr_gm_tab   PQCLEAN_FALCON1024_AVX2_fpr_gm_tab
#define fpr_p2_tab   PQCLEAN_FALCON1024_AVX2_fpr_p2_tab

extern const fpr fpr_gm_tab[];
extern const fpr fpr_p2_tab[];
/* ====================================================================== */ | |||
#endif |
@@ -0,0 +1,826 @@ | |||
#ifndef PQCLEAN_FALCON1024_AVX2_INNER_H | |||
#define PQCLEAN_FALCON1024_AVX2_INNER_H | |||
/* | |||
* Internal functions for Falcon. This is not the API intended to be | |||
* used by applications; instead, this internal API provides all the | |||
* primitives on which wrappers build to provide external APIs. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* | |||
* IMPORTANT API RULES | |||
* ------------------- | |||
* | |||
* This API has some non-trivial usage rules: | |||
* | |||
* | |||
* - All public functions (i.e. the non-static ones) must be referenced | |||
* with the PQCLEAN_FALCON1024_AVX2_ macro (e.g. PQCLEAN_FALCON1024_AVX2_verify_raw for the verify_raw() | |||
* function). That macro adds a prefix to the name, which is | |||
* configurable with the FALCON_PREFIX macro. This allows compiling | |||
* the code into a specific "namespace" and potentially including | |||
* several versions of this code into a single application (e.g. to | |||
* have an AVX2 and a non-AVX2 variants and select the one to use at | |||
* runtime based on availability of AVX2 opcodes). | |||
* | |||
* - Functions that need temporary buffers expect them as a final
* tmp[] array of type uint8_t*, with a size which is documented for | |||
* each function. However, most have some alignment requirements, | |||
* because they will use the array to store 16-bit, 32-bit or 64-bit | |||
* values (e.g. uint64_t or double). The caller must ensure proper | |||
* alignment. What happens on unaligned access depends on the | |||
* underlying architecture, ranging from a slight time penalty | |||
* to immediate termination of the process. | |||
* | |||
* - Some functions rely on specific rounding rules and precision for | |||
* floating-point numbers. On some systems (in particular 32-bit x86 | |||
* with the 387 FPU), this requires setting a hardware control
* word. The caller MUST use set_fpu_cw() to ensure proper precision: | |||
* | |||
* oldcw = set_fpu_cw(2); | |||
* PQCLEAN_FALCON1024_AVX2_sign_dyn(...); | |||
* set_fpu_cw(oldcw); | |||
* | |||
* On systems where the native floating-point precision is already | |||
* proper, or integer-based emulation is used, the set_fpu_cw() | |||
* function does nothing, so it can be called systematically. | |||
*/ | |||
#include "fips202.h" | |||
#include "fpr.h" | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
/*
 * Some computations with floating-point elements, in particular
 * rounding to the nearest integer, rely on operations using _exactly_
 * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
 * x86, the 387 FPU may be used (depending on the target OS) and, in
 * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
 * total type length); to prevent miscomputations, an explicit function
 * is defined for modifying the precision in the FPU control word.
 *
 * The contract of set_fpu_cw() is to set the precision to the provided
 * value and to return the previously set precision, which callers are
 * supposed to restore on exit; the correct (52-bit) precision is
 * requested with the value "2".
 *
 * In this implementation the function changes nothing and simply
 * returns its argument.
 */
static inline unsigned
set_fpu_cw(unsigned x) {
    unsigned unchanged = x;

    return unchanged;
}
/* ==================================================================== */ | |||
/* | |||
* SHAKE256 implementation (shake.c). | |||
* | |||
* API is defined to be easily replaced with the fips202.h API defined | |||
* as part of PQClean. | |||
*/ | |||
/*
 * The inner_shake256_*() names map directly onto the incremental
 * SHAKE256 API declared in fips202.h. Note that the 'extract' wrapper
 * takes the context first, while shake256_inc_squeeze() takes it last.
 */
#define inner_shake256_context                 shake256incctx
#define inner_shake256_init(sc)                shake256_inc_init(sc)
#define inner_shake256_inject(sc, in, len)     shake256_inc_absorb(sc, in, len)
#define inner_shake256_flip(sc)                shake256_inc_finalize(sc)
#define inner_shake256_extract(sc, out, len)   shake256_inc_squeeze(out, len, sc)
#define inner_shake256_ctx_release(sc)         shake256_inc_ctx_release(sc)
/* ==================================================================== */ | |||
/* | |||
* Encoding/decoding functions (codec.c). | |||
* | |||
* Encoding functions take as parameters an output buffer (out) with | |||
* a given maximum length (max_out_len); returned value is the actual | |||
* number of bytes which have been written. If the output buffer is | |||
* not large enough, then 0 is returned (some bytes may have been | |||
* written to the buffer). If 'out' is NULL, then 'max_out_len' is | |||
* ignored; instead, the function computes and returns the actual | |||
* required output length (in bytes). | |||
* | |||
* Decoding functions take as parameters an input buffer (in) with | |||
* its maximum length (max_in_len); returned value is the actual number | |||
* of bytes that have been read from the buffer. If the provided length | |||
* is too short, then 0 is returned. | |||
* | |||
* Values to encode or decode are vectors of integers, with N = 2^logn | |||
* elements. | |||
* | |||
* Three encoding formats are defined: | |||
* | |||
* - modq: sequence of values modulo 12289, each encoded over exactly | |||
* 14 bits. The encoder and decoder verify that integers are within | |||
* the valid range (0..12288). Values are arrays of uint16. | |||
* | |||
* - trim: sequence of signed integers, a specified number of bits | |||
* each. The number of bits is provided as parameter and includes | |||
* the sign bit. Each integer x must be such that |x| < 2^(bits-1) | |||
* (which means that the -2^(bits-1) value is forbidden); encode and | |||
* decode functions check that property. Values are arrays of | |||
* int16_t or int8_t, corresponding to names 'trim_i16' and | |||
* 'trim_i8', respectively. | |||
* | |||
* - comp: variable-length encoding for signed integers; each integer | |||
* uses a minimum of 9 bits, possibly more. This is normally used | |||
* only for signatures. | |||
* | |||
*/ | |||
/* Encoders: serialise the 2^logn values of x[] into out[]. */
size_t PQCLEAN_FALCON1024_AVX2_modq_encode(
    void *out, size_t max_out_len,
    const uint16_t *x, unsigned logn);
size_t PQCLEAN_FALCON1024_AVX2_trim_i16_encode(
    void *out, size_t max_out_len,
    const int16_t *x, unsigned logn, unsigned bits);
size_t PQCLEAN_FALCON1024_AVX2_trim_i8_encode(
    void *out, size_t max_out_len,
    const int8_t *x, unsigned logn, unsigned bits);
size_t PQCLEAN_FALCON1024_AVX2_comp_encode(
    void *out, size_t max_out_len,
    const int16_t *x, unsigned logn);

/* Decoders: parse 2^logn values from in[] into x[]. */
size_t PQCLEAN_FALCON1024_AVX2_modq_decode(
    uint16_t *x, unsigned logn,
    const void *in, size_t max_in_len);
size_t PQCLEAN_FALCON1024_AVX2_trim_i16_decode(
    int16_t *x, unsigned logn, unsigned bits,
    const void *in, size_t max_in_len);
size_t PQCLEAN_FALCON1024_AVX2_trim_i8_decode(
    int8_t *x, unsigned logn, unsigned bits,
    const void *in, size_t max_in_len);
size_t PQCLEAN_FALCON1024_AVX2_comp_decode(
    int16_t *x, unsigned logn,
    const void *in, size_t max_in_len);
/*
 * Bit lengths of key elements. Both tables are indexed by logn
 * (1 to 10). No degree needs more than 8 bits, but some degrees may
 * have shorter elements.
 */
extern const uint8_t PQCLEAN_FALCON1024_AVX2_max_fg_bits[];
extern const uint8_t PQCLEAN_FALCON1024_AVX2_max_FG_bits[];

/*
 * Maximum bit length of the elements of a signature, sign bit
 * included. The table is indexed by logn (1 to 10).
 */
extern const uint8_t PQCLEAN_FALCON1024_AVX2_max_sig_bits[];
/* ==================================================================== */ | |||
/* | |||
* Support functions used for both signature generation and signature | |||
* verification (common.c). | |||
*/ | |||
/* | |||
* From a SHAKE256 context (must be already flipped), produce a new | |||
* point. This is the non-constant-time version, which may leak enough | |||
* information to serve as a stop condition on a brute force attack on | |||
* the hashed message (provided that the nonce value is known). | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_hash_to_point_vartime(inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn); | |||
/* | |||
* From a SHAKE256 context (must be already flipped), produce a new | |||
* point. The temporary buffer (tmp) must have room for 2*2^logn bytes. | |||
* This function is constant-time but is typically more expensive than | |||
* PQCLEAN_FALCON1024_AVX2_hash_to_point_vartime(). | |||
* | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_hash_to_point_ct(inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Tell whether a given vector (2N coordinates, in two halves) is | |||
* acceptable as a signature. This compares the appropriate norm of the | |||
* vector with the acceptance bound. Returned value is 1 on success | |||
* (vector is short enough to be acceptable), 0 otherwise. | |||
*/ | |||
int PQCLEAN_FALCON1024_AVX2_is_short(const int16_t *s1, const int16_t *s2, unsigned logn); | |||
/* | |||
* Tell whether a given vector (2N coordinates, in two halves) is | |||
* acceptable as a signature. Instead of the first half s1, this | |||
* function receives the "saturated squared norm" of s1, i.e. the | |||
* sum of the squares of the coordinates of s1 (saturated at 2^32-1 | |||
* if the sum exceeds 2^31-1). | |||
* | |||
* Returned value is 1 on success (vector is short enough to be | |||
* acceptable), 0 otherwise. | |||
*/ | |||
int PQCLEAN_FALCON1024_AVX2_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn); | |||
/* ==================================================================== */ | |||
/* | |||
* Signature verification functions (vrfy.c). | |||
*/ | |||
/*
 * Convert a public key h[] to NTT + Montgomery format, in place.
 */
void PQCLEAN_FALCON1024_AVX2_to_ntt_monty(uint16_t *h, unsigned logn);

/*
 * Internal signature verification.
 *
 *   c0[]   hashed nonce+message
 *   s2[]   decoded signature
 *   h[]    public key, in NTT + Montgomery format
 *   logn   degree log
 *   tmp[]  temporary area of at least 2*2^logn bytes, with 16-bit
 *          alignment
 *
 * Returned value is 1 on success, 0 on error.
 */
int PQCLEAN_FALCON1024_AVX2_verify_raw(
    const uint16_t *c0, const int16_t *s2,
    const uint16_t *h, unsigned logn, uint8_t *tmp);

/*
 * Compute the public key h[] from the private key elements f[] and
 * g[], as h = g/f mod phi mod q, where phi is the polynomial modulus.
 *
 * Returned value is 1 on success, 0 on error (an error is reported if
 * f is not invertible mod phi mod q).
 *
 * The tmp[] array must have room for at least 2*2^logn elements, and
 * must have 16-bit alignment.
 */
int PQCLEAN_FALCON1024_AVX2_compute_public(
    uint16_t *h,
    const int8_t *f, const int8_t *g,
    unsigned logn, uint8_t *tmp);

/*
 * Recompute the fourth private key element. A private key consists of
 * four polynomials with small coefficients f, g, F and G, which are
 * such that fG - gF = q mod phi; furthermore, f is invertible modulo
 * phi and modulo q. This function recomputes G from f, g and F.
 *
 * Returned value is 1 on success, 0 on error (f not invertible).
 *
 * The tmp[] array must have room for at least 4*2^logn bytes, and
 * must have 16-bit alignment.
 */
int PQCLEAN_FALCON1024_AVX2_complete_private(
    int8_t *G,
    const int8_t *f, const int8_t *g, const int8_t *F,
    unsigned logn, uint8_t *tmp);

/*
 * Test whether a polynomial with small integer coefficients is
 * invertible modulo phi and q.
 *
 * tmp[] must have 16-bit alignment.
 */
int PQCLEAN_FALCON1024_AVX2_is_invertible(
    const int16_t *s2, unsigned logn, uint8_t *tmp);

/*
 * Count the elements of value zero in the NTT representation of the
 * given polynomial; this is the number of primitive 2n-th roots of
 * unity (modulo q = 12289) that are roots of the provided polynomial
 * (taken modulo q).
 *
 * tmp[] must have 16-bit alignment.
 */
int PQCLEAN_FALCON1024_AVX2_count_nttzero(
    const int16_t *sig, unsigned logn, uint8_t *tmp);

/*
 * Internal signature verification with public key recovery.
 *
 *   h[]    receives the public key (NOT in NTT/Montgomery format)
 *   c0[]   hashed nonce+message
 *   s1[]   first signature half
 *   s2[]   second signature half
 *   logn   degree log
 *   tmp[]  temporary area of at least 2*2^logn bytes, with 16-bit
 *          alignment
 *
 * Returned value is 1 on success, 0 on error. Success is returned if
 * the signature is a short enough vector; in that case, the public
 * key has been written to h[]. However, the caller must still verify
 * that h[] is the correct value (e.g. with regards to a known hash of
 * the public key).
 *
 * h[] may not overlap with any of the other arrays.
 */
int PQCLEAN_FALCON1024_AVX2_verify_recover(
    uint16_t *h,
    const uint16_t *c0, const int16_t *s1, const int16_t *s2,
    unsigned logn, uint8_t *tmp);
/* ==================================================================== */ | |||
/* | |||
* Implementation of floating-point real numbers (fpr.h, fpr.c). | |||
*/ | |||
/* | |||
* Real numbers are implemented by an extra header file, included below. | |||
* This is meant to support pluggable implementations. The default | |||
* implementation relies on the C type 'double'. | |||
* | |||
* The included file must define the following types, functions and | |||
* constants: | |||
* | |||
* fpr | |||
* type for a real number | |||
* | |||
* fpr fpr_of(int64_t i) | |||
* cast an integer into a real number; source must be in the | |||
* -(2^63-1)..+(2^63-1) range | |||
* | |||
* fpr fpr_scaled(int64_t i, int sc) | |||
* compute i*2^sc as a real number; source 'i' must be in the | |||
* -(2^63-1)..+(2^63-1) range | |||
* | |||
* fpr fpr_ldexp(fpr x, int e) | |||
* compute x*2^e | |||
* | |||
* int64_t fpr_rint(fpr x) | |||
* round x to the nearest integer; x must be in the -(2^63-1) | |||
* to +(2^63-1) range | |||
* | |||
* int64_t fpr_trunc(fpr x) | |||
* round to an integer; this rounds towards zero; value must | |||
* be in the -(2^63-1) to +(2^63-1) range | |||
* | |||
* fpr fpr_add(fpr x, fpr y) | |||
* compute x + y | |||
* | |||
* fpr fpr_sub(fpr x, fpr y) | |||
* compute x - y | |||
* | |||
* fpr fpr_neg(fpr x) | |||
* compute -x | |||
* | |||
* fpr fpr_half(fpr x) | |||
* compute x/2 | |||
* | |||
* fpr fpr_double(fpr x) | |||
* compute x*2 | |||
* | |||
* fpr fpr_mul(fpr x, fpr y) | |||
* compute x * y | |||
* | |||
* fpr fpr_sqr(fpr x) | |||
* compute x * x | |||
* | |||
* fpr fpr_inv(fpr x) | |||
* compute 1/x | |||
* | |||
* fpr fpr_div(fpr x, fpr y) | |||
* compute x/y | |||
* | |||
* fpr fpr_sqrt(fpr x) | |||
* compute the square root of x | |||
* | |||
* int fpr_lt(fpr x, fpr y) | |||
* return 1 if x < y, 0 otherwise | |||
* | |||
* uint64_t fpr_expm_p63(fpr x, fpr ccs)
* return ccs*exp(-x), assuming that 0 <= x < log(2) and ccs < 1.
* Returned value is scaled to 63 bits (i.e. it really returns
* 2^63*ccs*exp(-x), converted to an integer). Computation should
* have a precision of at least 45 bits.
* | |||
* const fpr fpr_gm_tab[] | |||
* array of constants for FFT / iFFT | |||
* | |||
* const fpr fpr_p2_tab[] | |||
* precomputed powers of 2 (by index, 0 to 10) | |||
* | |||
* Constants of type 'fpr': | |||
* | |||
* fpr fpr_q 12289 | |||
* fpr fpr_inverse_of_q 1/12289 | |||
* fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) | |||
* fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) | |||
* fpr fpr_sigma_min_9 1.291500756233514568549480827642 | |||
* fpr fpr_sigma_min_10 1.311734375905083682667395805765 | |||
* fpr fpr_log2 log(2) | |||
* fpr fpr_inv_log2 1/log(2) | |||
* fpr fpr_bnorm_max 16822.4121 | |||
* fpr fpr_zero 0 | |||
* fpr fpr_one 1 | |||
* fpr fpr_two 2 | |||
* fpr fpr_onehalf 0.5 | |||
* fpr fpr_ptwo31 2^31 | |||
* fpr fpr_ptwo31m1 2^31-1 | |||
* fpr fpr_mtwo31m1 -(2^31-1) | |||
* fpr fpr_ptwo63m1 2^63-1 | |||
* fpr fpr_mtwo63m1 -(2^63-1) | |||
* fpr fpr_ptwo63 2^63 | |||
*/ | |||
/* ==================================================================== */ | |||
/* | |||
* RNG (rng.c). | |||
* | |||
* A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 | |||
* context (flipped) and is used for bulk pseudorandom generation. | |||
* A system-dependent seed generator is also provided. | |||
*/ | |||
/*
 * Fill seed[] with seed_len bytes obtained from the system RNG.
 *
 * Returned value is 1 on success, 0 on error.
 */
int PQCLEAN_FALCON1024_AVX2_get_seed(
    void *seed, size_t seed_len);
/*
 * PRNG context.
 *
 *   buf    large output buffer, so that values get generated in advance
 *   ptr    current read offset within buf
 *   state  state of the PRNG algorithm (contents depend on the
 *          selected algorithm)
 *   type   identifies the selected algorithm
 *
 * The 'dummy_u64' union members are never used by name; they are
 * there to ensure proper alignment for 64-bit direct access.
 */
typedef struct {
    union {
        uint8_t d[512];   /* MUST be 512, exactly */
        uint64_t dummy_u64;
    } buf;
    size_t ptr;
    union {
        uint8_t d[256];
        uint64_t dummy_u64;
    } state;
    int type;
} prng;
/* | |||
* Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 | |||
* context (in "flipped" state) to obtain its initial state. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_prng_init(prng *p, inner_shake256_context *src); | |||
/* | |||
* Refill the PRNG buffer. This is normally invoked automatically, and | |||
* is declared here only so that prng_get_u64() may be inlined. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_prng_refill(prng *p); | |||
/* | |||
* Get some bytes from a PRNG. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len); | |||
/* | |||
* Get a 64-bit random value from a PRNG. | |||
*/ | |||
static inline uint64_t | |||
prng_get_u64(prng *p) { | |||
size_t u; | |||
/* | |||
* If there are less than 9 bytes in the buffer, we refill it. | |||
* This means that we may drop the last few bytes, but this allows | |||
* for faster extraction code. Also, it means that we never leave | |||
* an empty buffer. | |||
*/ | |||
u = p->ptr; | |||
if (u >= (sizeof p->buf.d) - 9) { | |||
PQCLEAN_FALCON1024_AVX2_prng_refill(p); | |||
u = 0; | |||
} | |||
p->ptr = u + 8; | |||
return (uint64_t)p->buf.d[u + 0] | |||
| ((uint64_t)p->buf.d[u + 1] << 8) | |||
| ((uint64_t)p->buf.d[u + 2] << 16) | |||
| ((uint64_t)p->buf.d[u + 3] << 24) | |||
| ((uint64_t)p->buf.d[u + 4] << 32) | |||
| ((uint64_t)p->buf.d[u + 5] << 40) | |||
| ((uint64_t)p->buf.d[u + 6] << 48) | |||
| ((uint64_t)p->buf.d[u + 7] << 56); | |||
} | |||
/* | |||
* Get an 8-bit random value from a PRNG. | |||
*/ | |||
static inline unsigned | |||
prng_get_u8(prng *p) { | |||
unsigned v; | |||
v = p->buf.d[p->ptr ++]; | |||
if (p->ptr == sizeof p->buf.d) { | |||
PQCLEAN_FALCON1024_AVX2_prng_refill(p); | |||
} | |||
return v; | |||
} | |||
/* ==================================================================== */ | |||
/* | |||
* FFT (fft.c).
*
* A real polynomial is represented as an array of N 'fpr' elements.
* The FFT representation of a real polynomial contains N/2 complex
* elements; each is stored as two real numbers, for the real and
* imaginary parts, respectively. See fft.c for details on the
* internal representation.
*/ | |||
/* | |||
* Compute FFT in-place: the source array should contain a real | |||
* polynomial (N coefficients); its storage area is reused to store | |||
* the FFT representation of that polynomial (N/2 complex numbers). | |||
* | |||
* 'logn' MUST lie between 1 and 10 (inclusive). | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_FFT(fpr *f, unsigned logn); | |||
/* | |||
* Compute the inverse FFT in-place: the source array should contain the | |||
* FFT representation of a real polynomial (N/2 elements); the resulting | |||
* real polynomial (N coefficients of type 'fpr') is written over the | |||
* array. | |||
* | |||
* 'logn' MUST lie between 1 and 10 (inclusive). | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_iFFT(fpr *f, unsigned logn); | |||
/* | |||
* Add polynomial b to polynomial a. a and b MUST NOT overlap. This | |||
* function works in both normal and FFT representations. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_add(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This | |||
* function works in both normal and FFT representations. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_sub(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Negate polynomial a. This function works in both normal and FFT | |||
* representations. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_neg(fpr *a, unsigned logn); | |||
/* | |||
* Compute adjoint of polynomial a. This function works only in FFT | |||
* representation. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_adj_fft(fpr *a, unsigned logn); | |||
/* | |||
* Multiply polynomial a with polynomial b. a and b MUST NOT overlap. | |||
* This function works only in FFT representation. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_mul_fft(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT | |||
* overlap. This function works only in FFT representation. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Multiply polynomial with its own adjoint. This function works only in FFT | |||
* representation. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn); | |||
/* | |||
* Multiply polynomial with a real constant. This function works in both | |||
* normal and FFT representations. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn); | |||
/* | |||
* Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). | |||
* a and b MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_div_fft(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) | |||
* (also in FFT representation). Since the result is auto-adjoint, all its | |||
* coordinates in FFT representation are real; as such, only the first N/2 | |||
* values of d[] are filled (the imaginary parts are skipped). | |||
* | |||
* Array d MUST NOT overlap with either a or b. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_invnorm2_fft(fpr *d, | |||
const fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) | |||
* (also in FFT representation). Destination d MUST NOT overlap with | |||
* any of the source arrays. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_add_muladj_fft(fpr *d, | |||
const fpr *F, const fpr *G, | |||
const fpr *f, const fpr *g, unsigned logn); | |||
/* | |||
* Multiply polynomial a by polynomial b, where b is autoadjoint. Both | |||
* a and b are in FFT representation. Since b is autoadjoint, all its | |||
* FFT coefficients are real, and the array b contains only N/2 elements. | |||
* a and b MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_mul_autoadj_fft(fpr *a, | |||
const fpr *b, unsigned logn); | |||
/* | |||
* Divide polynomial a by polynomial b, where b is autoadjoint. Both | |||
* a and b are in FFT representation. Since b is autoadjoint, all its | |||
* FFT coefficients are real, and the array b contains only N/2 elements. | |||
* a and b MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_div_autoadj_fft(fpr *a, | |||
const fpr *b, unsigned logn); | |||
/* | |||
* Perform an LDL decomposition of an auto-adjoint matrix G, in FFT | |||
* representation. On input, g00, g01 and g11 are provided (where the | |||
* matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 | |||
* and d11 values are written in g00, g01 and g11, respectively | |||
* (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). | |||
* (In fact, d00 = g00, so the g00 operand is left unmodified.) | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_LDL_fft(const fpr *g00, | |||
fpr *g01, fpr *g11, unsigned logn); | |||
/* | |||
* Perform an LDL decomposition of an auto-adjoint matrix G, in FFT | |||
* representation. This is identical to poly_LDL_fft() except that | |||
* g00, g01 and g11 are unmodified; the outputs d11 and l10 are written | |||
* in two other separate buffers provided as extra parameters. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_LDLmv_fft(fpr *d11, fpr *l10, | |||
const fpr *g00, const fpr *g01, | |||
const fpr *g11, unsigned logn); | |||
/* | |||
* Apply "split" operation on a polynomial in FFT representation: | |||
* f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 | |||
* (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_split_fft(fpr *f0, fpr *f1, | |||
const fpr *f, unsigned logn); | |||
/* | |||
* Apply "merge" operation on two polynomials in FFT representation: | |||
* given f0 and f1, polynomials modulo X^(N/2)+1, this function computes | |||
* f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. | |||
* f MUST NOT overlap with either f0 or f1. | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_poly_merge_fft(fpr *f, | |||
const fpr *f0, const fpr *f1, unsigned logn); | |||
/* ==================================================================== */ | |||
/* | |||
* Key pair generation. | |||
*/ | |||
/* | |||
* Required sizes of the temporary buffer (in bytes). | |||
* | |||
* This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 | |||
* or 2) where it is slightly greater. | |||
*/ | |||
/* logn = 1 and 2: larger than 28*2^logn (which would be 56 and 112). */
#define FALCON_KEYGEN_TEMP_1 136 | |||
#define FALCON_KEYGEN_TEMP_2 272 | |||
/* logn = 3 to 10: exactly 28*2^logn bytes. */
#define FALCON_KEYGEN_TEMP_3 224 | |||
#define FALCON_KEYGEN_TEMP_4 448 | |||
#define FALCON_KEYGEN_TEMP_5 896 | |||
#define FALCON_KEYGEN_TEMP_6 1792 | |||
#define FALCON_KEYGEN_TEMP_7 3584 | |||
#define FALCON_KEYGEN_TEMP_8 7168 | |||
#define FALCON_KEYGEN_TEMP_9 14336 | |||
#define FALCON_KEYGEN_TEMP_10 28672 | |||
/* | |||
* Generate a new key pair. Randomness is extracted from the provided | |||
* SHAKE256 context, which must have already been seeded and flipped. | |||
* The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* | |||
* macros) and be aligned for the uint32_t, uint64_t and fpr types. | |||
* | |||
* The private key elements are written in f, g, F and G, and the | |||
* public key is written in h. Either or both of G and h may be NULL, | |||
* in which case the corresponding element is not returned (they can | |||
* be recomputed from f, g and F). | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_keygen(inner_shake256_context *rng, | |||
int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, | |||
unsigned logn, uint8_t *tmp); | |||
/* ==================================================================== */ | |||
/* | |||
* Signature generation. | |||
*/ | |||
/* | |||
* Expand a private key into the B0 matrix in FFT representation and | |||
* the LDL tree. All the values are written in 'expanded_key', for | |||
* a total of (8*logn+40)*2^logn bytes. | |||
* | |||
* The tmp[] array must have room for at least 48*2^logn bytes. | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_expand_privkey(fpr *expanded_key, | |||
const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, | |||
unsigned logn, uint8_t *tmp); | |||
/* | |||
* Compute a signature over the provided hashed message (hm); the | |||
* signature value is one short vector. This function uses an | |||
* expanded key (as generated by PQCLEAN_FALCON1024_AVX2_expand_privkey()). | |||
* | |||
* The sig[] and hm[] buffers may overlap. | |||
* | |||
* On successful output, the start of the tmp[] buffer contains the s1 | |||
* vector (as int16_t elements). | |||
* | |||
* The minimal size (in bytes) of tmp[] is 48*2^logn bytes. | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng, | |||
const fpr *expanded_key, | |||
const uint16_t *hm, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Compute a signature over the provided hashed message (hm); the | |||
* signature value is one short vector. This function uses a raw | |||
* key and dynamically recomputes the B0 matrix and LDL tree; this | |||
* saves RAM since there is no need for an expanded key, but | |||
* increases the signature cost. | |||
* | |||
* The sig[] and hm[] buffers may overlap. | |||
* | |||
* On successful output, the start of the tmp[] buffer contains the s1 | |||
* vector (as int16_t elements). | |||
* | |||
* The minimal size (in bytes) of tmp[] is 72*2^logn bytes. | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON1024_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng, | |||
const int8_t *f, const int8_t *g, | |||
const int8_t *F, const int8_t *G, | |||
const uint16_t *hm, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Internal sampler engine. Exported for tests. | |||
* | |||
* sampler_context wraps around a source of random numbers (PRNG) and | |||
* the sigma_min value (nominally dependent on the degree). | |||
* | |||
* sampler() takes as parameters: | |||
* ctx pointer to the sampler_context structure | |||
* mu center for the distribution | |||
* isigma inverse of the distribution standard deviation | |||
* It returns an integer sampled along the Gaussian distribution centered | |||
* on mu and of standard deviation sigma = 1/isigma. | |||
* | |||
* gaussian0_sampler() takes as parameter a pointer to a PRNG, and | |||
* returns an integer sampled along a half-Gaussian with standard | |||
* deviation sigma0 = 1.8205 (center is 0, returned value is | |||
* nonnegative). | |||
*/ | |||
/* State handed to the sampler through its 'ctx' pointer. */
typedef struct { | |||
/* Source of random numbers (PRNG) wrapped by this context. */
prng p; | |||
/* The sigma_min value (nominally dependent on the degree). */
fpr sigma_min; | |||
} sampler_context; | |||
int PQCLEAN_FALCON1024_AVX2_sampler(void *ctx, fpr mu, fpr isigma); | |||
int PQCLEAN_FALCON1024_AVX2_gaussian0_sampler(prng *p); | |||
/* ==================================================================== */ | |||
#endif |
@@ -0,0 +1,386 @@ | |||
#include "api.h" | |||
#include "inner.h" | |||
#include "randombytes.h" | |||
#include <stddef.h> | |||
#include <string.h> | |||
/* | |||
* Wrapper for implementing the PQClean API. | |||
*/ | |||
#define NONCELEN 40 | |||
#define SEEDLEN 48 | |||
/* | |||
* Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024) | |||
* | |||
* private key: | |||
* header byte: 0101nnnn | |||
* private f (6 or 5 bits by element, depending on degree) | |||
* private g (6 or 5 bits by element, depending on degree) | |||
* private F (8 bits by element) | |||
* | |||
* public key: | |||
* header byte: 0000nnnn | |||
* public h (14 bits by element) | |||
* | |||
* signature: | |||
* header byte: 0011nnnn | |||
* nonce 40 bytes | |||
* value (12 bits by element) | |||
* | |||
* message + signature: | |||
* signature length (2 bytes, big-endian) | |||
* nonce 40 bytes | |||
* message | |||
* header byte: 0010nnnn | |||
* value (12 bits by element) | |||
* (signature length is 1+len(value), not counting the nonce) | |||
*/ | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON1024_AVX2_crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { | |||
union { | |||
uint8_t b[28 * 1024]; | |||
uint64_t dummy_u64; | |||
fpr dummy_fpr; | |||
} tmp; | |||
int8_t f[1024], g[1024], F[1024], G[1024]; | |||
uint16_t h[1024]; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context rng; | |||
size_t u, v; | |||
/* | |||
* Generate key pair. | |||
*/ | |||
randombytes(seed, sizeof seed); | |||
inner_shake256_init(&rng); | |||
inner_shake256_inject(&rng, seed, sizeof seed); | |||
inner_shake256_flip(&rng); | |||
PQCLEAN_FALCON1024_AVX2_keygen(&rng, f, g, F, G, h, 10, tmp.b); | |||
inner_shake256_ctx_release(&rng); | |||
/* | |||
* Encode private key. | |||
*/ | |||
sk[0] = 0x50 + 10; | |||
u = 1; | |||
v = PQCLEAN_FALCON1024_AVX2_trim_i8_encode( | |||
sk + u, PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES - u, | |||
f, 10, PQCLEAN_FALCON1024_AVX2_max_fg_bits[10]); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON1024_AVX2_trim_i8_encode( | |||
sk + u, PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES - u, | |||
g, 10, PQCLEAN_FALCON1024_AVX2_max_fg_bits[10]); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON1024_AVX2_trim_i8_encode( | |||
sk + u, PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES - u, | |||
F, 10, PQCLEAN_FALCON1024_AVX2_max_FG_bits[10]); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
if (u != PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES) { | |||
return -1; | |||
} | |||
/* | |||
* Encode public key. | |||
*/ | |||
pk[0] = 0x00 + 10; | |||
v = PQCLEAN_FALCON1024_AVX2_modq_encode( | |||
pk + 1, PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1, | |||
h, 10); | |||
if (v != PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1) { | |||
return -1; | |||
} | |||
return 0; | |||
} | |||
/* | |||
* Compute the signature. nonce[] receives the nonce and must have length | |||
* NONCELEN bytes. sigbuf[] receives the signature value (without nonce | |||
* or header byte), with *sigbuflen providing the maximum value length and | |||
* receiving the actual value length. | |||
* | |||
* If a signature could be computed but not encoded because it would | |||
* exceed the output buffer size, then a new signature is computed. If | |||
* the provided buffer size is too low, this could loop indefinitely, so | |||
* the caller must provide a size that can accommodate signatures with a | |||
* large enough probability. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
static int | |||
do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
union { | |||
uint8_t b[72 * 1024]; | |||
uint64_t dummy_u64; | |||
fpr dummy_fpr; | |||
} tmp; | |||
int8_t f[1024], g[1024], F[1024], G[1024]; | |||
union { | |||
int16_t sig[1024]; | |||
uint16_t hm[1024]; | |||
} r; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context sc; | |||
size_t u, v; | |||
/* | |||
* Decode the private key. | |||
*/ | |||
if (sk[0] != 0x50 + 10) { | |||
return -1; | |||
} | |||
u = 1; | |||
v = PQCLEAN_FALCON1024_AVX2_trim_i8_decode( | |||
f, 10, PQCLEAN_FALCON1024_AVX2_max_fg_bits[10], | |||
sk + u, PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES - u); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON1024_AVX2_trim_i8_decode( | |||
g, 10, PQCLEAN_FALCON1024_AVX2_max_fg_bits[10], | |||
sk + u, PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES - u); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON1024_AVX2_trim_i8_decode( | |||
F, 10, PQCLEAN_FALCON1024_AVX2_max_FG_bits[10], | |||
sk + u, PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES - u); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
if (u != PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES) { | |||
return -1; | |||
} | |||
if (!PQCLEAN_FALCON1024_AVX2_complete_private(G, f, g, F, 10, tmp.b)) { | |||
return -1; | |||
} | |||
/* | |||
* Create a random nonce (40 bytes). | |||
*/ | |||
randombytes(nonce, NONCELEN); | |||
/* | |||
* Hash message nonce + message into a vector. | |||
*/ | |||
inner_shake256_init(&sc); | |||
inner_shake256_inject(&sc, nonce, NONCELEN); | |||
inner_shake256_inject(&sc, m, mlen); | |||
inner_shake256_flip(&sc); | |||
PQCLEAN_FALCON1024_AVX2_hash_to_point_vartime(&sc, r.hm, 10); | |||
inner_shake256_ctx_release(&sc); | |||
/* | |||
* Initialize a RNG. | |||
*/ | |||
randombytes(seed, sizeof seed); | |||
inner_shake256_init(&sc); | |||
inner_shake256_inject(&sc, seed, sizeof seed); | |||
inner_shake256_flip(&sc); | |||
/* | |||
* Compute and return the signature. This loops until a signature | |||
* value is found that fits in the provided buffer. | |||
*/ | |||
for (;;) { | |||
PQCLEAN_FALCON1024_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b); | |||
v = PQCLEAN_FALCON1024_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 10); | |||
if (v != 0) { | |||
inner_shake256_ctx_release(&sc); | |||
*sigbuflen = v; | |||
return 0; | |||
} | |||
} | |||
} | |||
/* | |||
* Verify a sigature. The nonce has size NONCELEN bytes. sigbuf[] | |||
* (of size sigbuflen) contains the signature value, not including the | |||
* header byte or nonce. Return value is 0 on success, -1 on error. | |||
*/ | |||
static int | |||
do_verify( | |||
const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
union { | |||
uint8_t b[2 * 1024]; | |||
uint64_t dummy_u64; | |||
fpr dummy_fpr; | |||
} tmp; | |||
uint16_t h[1024], hm[1024]; | |||
int16_t sig[1024]; | |||
inner_shake256_context sc; | |||
/* | |||
* Decode public key. | |||
*/ | |||
if (pk[0] != 0x00 + 10) { | |||
return -1; | |||
} | |||
if (PQCLEAN_FALCON1024_AVX2_modq_decode(h, 10, | |||
pk + 1, PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1) | |||
!= PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1) { | |||
return -1; | |||
} | |||
PQCLEAN_FALCON1024_AVX2_to_ntt_monty(h, 10); | |||
/* | |||
* Decode signature. | |||
*/ | |||
if (sigbuflen == 0) { | |||
return -1; | |||
} | |||
if (PQCLEAN_FALCON1024_AVX2_comp_decode(sig, 10, sigbuf, sigbuflen) != sigbuflen) { | |||
return -1; | |||
} | |||
/* | |||
* Hash nonce + message into a vector. | |||
*/ | |||
inner_shake256_init(&sc); | |||
inner_shake256_inject(&sc, nonce, NONCELEN); | |||
inner_shake256_inject(&sc, m, mlen); | |||
inner_shake256_flip(&sc); | |||
PQCLEAN_FALCON1024_AVX2_hash_to_point_ct(&sc, hm, 10, tmp.b); | |||
inner_shake256_ctx_release(&sc); | |||
/* | |||
* Verify signature. | |||
*/ | |||
if (!PQCLEAN_FALCON1024_AVX2_verify_raw(hm, sig, h, 10, tmp.b)) { | |||
return -1; | |||
} | |||
return 0; | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON1024_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
/* | |||
* The PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES constant is used for | |||
* the signed message object (as produced by PQCLEAN_FALCON1024_AVX2_crypto_sign()) | |||
* and includes a two-byte length value, so we take care here | |||
* to only generate signatures that are two bytes shorter than | |||
* the maximum. This is done to ensure that PQCLEAN_FALCON1024_AVX2_crypto_sign() | |||
* and PQCLEAN_FALCON1024_AVX2_crypto_sign_signature() produce the exact same signature | |||
* value, if used on the same message, with the same private key, | |||
* and using the same output from randombytes() (this is for | |||
* reproducibility of tests). | |||
*/ | |||
size_t vlen; | |||
vlen = PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES - NONCELEN - 3; | |||
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) { | |||
return -1; | |||
} | |||
sig[0] = 0x30 + 10; | |||
*siglen = 1 + NONCELEN + vlen; | |||
return 0; | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON1024_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
if (siglen < 1 + NONCELEN) { | |||
return -1; | |||
} | |||
if (sig[0] != 0x30 + 10) { | |||
return -1; | |||
} | |||
return do_verify(sig + 1, | |||
sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk); | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON1024_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
uint8_t *pm, *sigbuf; | |||
size_t sigbuflen; | |||
/* | |||
* Move the message to its final location; this is a memmove() so | |||
* it handles overlaps properly. | |||
*/ | |||
memmove(sm + 2 + NONCELEN, m, mlen); | |||
pm = sm + 2 + NONCELEN; | |||
sigbuf = pm + 1 + mlen; | |||
sigbuflen = PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES - NONCELEN - 3; | |||
if (do_sign(sm + 2, sigbuf, &sigbuflen, pm, mlen, sk) < 0) { | |||
return -1; | |||
} | |||
pm[mlen] = 0x20 + 10; | |||
sigbuflen ++; | |||
sm[0] = (uint8_t)(sigbuflen >> 8); | |||
sm[1] = (uint8_t)sigbuflen; | |||
*smlen = mlen + 2 + NONCELEN + sigbuflen; | |||
return 0; | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON1024_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk) { | |||
const uint8_t *sigbuf; | |||
size_t pmlen, sigbuflen; | |||
if (smlen < 3 + NONCELEN) { | |||
return -1; | |||
} | |||
sigbuflen = ((size_t)sm[0] << 8) | (size_t)sm[1]; | |||
if (sigbuflen < 2 || sigbuflen > (smlen - NONCELEN - 2)) { | |||
return -1; | |||
} | |||
sigbuflen --; | |||
pmlen = smlen - NONCELEN - 3 - sigbuflen; | |||
if (sm[2 + NONCELEN + pmlen] != 0x20 + 10) { | |||
return -1; | |||
} | |||
sigbuf = sm + 2 + NONCELEN + pmlen + 1; | |||
/* | |||
* The 2-byte length header and the one-byte signature header | |||
* have been verified. Nonce is at sm+2, followed by the message | |||
* itself. Message length is in pmlen. sigbuf/sigbuflen point to | |||
* the signature value (excluding the header byte). | |||
*/ | |||
if (do_verify(sm + 2, sigbuf, sigbuflen, | |||
sm + 2 + NONCELEN, pmlen, pk) < 0) { | |||
return -1; | |||
} | |||
/* | |||
* Signature is correct, we just have to copy/move the message | |||
* to its final destination. The memmove() properly handles | |||
* overlaps. | |||
*/ | |||
memmove(m, sm + 2 + NONCELEN, pmlen); | |||
*mlen = pmlen; | |||
return 0; | |||
} |
@@ -0,0 +1,195 @@ | |||
#include "inner.h" | |||
#include <assert.h> | |||
/* | |||
* PRNG and interface to the system RNG. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* | |||
* Include relevant system header files. For Win32, this will also need | |||
* linking with advapi32.dll, which we trigger with an appropriate #pragma. | |||
*/ | |||
/*
 * Seed request stub (see inner.h). The seed buffer is never written:
 * the function reports success (1) only for a zero-length request and
 * failure (0) for any other length.
 */
int
PQCLEAN_FALCON1024_AVX2_get_seed(void *seed, size_t len) {
    (void)seed;
    return (len == 0) ? 1 : 0;
}
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON1024_AVX2_prng_init(prng *p, inner_shake256_context *src) { | |||
inner_shake256_extract(src, p->state.d, 56); | |||
PQCLEAN_FALCON1024_AVX2_prng_refill(p); | |||
} | |||
/* | |||
* PRNG based on ChaCha20. | |||
* | |||
* State consists in key (32 bytes) then IV (16 bytes) and block counter | |||
* (8 bytes). Normally, we should not care about local endianness (this | |||
* is for a PRNG), but for the NIST competition we need reproducible KAT | |||
* vectors that work across architectures, so we enforce little-endian | |||
* interpretation where applicable. Moreover, output words are "spread | |||
* out" over the output buffer with the interleaving pattern that is | |||
* naturally obtained from the AVX2 implementation that runs eight | |||
* ChaCha20 instances in parallel. | |||
* | |||
* The block counter is XORed into 32-bit state words 10 and 11, i.e. | |||
* state bytes 40 to 47; with the layout above these are the last 8 | |||
* bytes of the IV. | |||
* | |||
* Each call produces 16 * 32 = 512 bytes in p->buf.d, advances the | |||
* stored block counter by 8 and resets p->ptr to 0. | |||
*/ | |||
void | |||
PQCLEAN_FALCON1024_AVX2_prng_refill(prng *p) { | |||
/* The four ChaCha constant words ("expand 32-byte k"). */
static const uint32_t CW[] = { | |||
0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 | |||
}; | |||
uint64_t cc; | |||
size_t u; | |||
int i; | |||
uint32_t *sw; | |||
union { | |||
uint32_t w[16]; | |||
__m256i y[2]; /* for alignment */ | |||
} t; | |||
__m256i state[16], init[16]; | |||
/* View of the key/IV part of the state as 32-bit words. */
sw = (uint32_t *)p->state.d; | |||
/* | |||
* XOR next counter values into state. | |||
* | |||
* The 64-bit counter is read in place at byte offset 48, in native | |||
* byte order. Lane u (0..7) gets counter value cc + u: the low | |||
* halves go to t.w[0..7] and the high halves to t.w[8..15]. The | |||
* stored counter is then advanced by 8, one per lane. | |||
*/ | |||
cc = *(uint64_t *)(p->state.d + 48); | |||
for (u = 0; u < 8; u ++) { | |||
t.w[u] = (uint32_t)(cc + u); | |||
t.w[u + 8] = (uint32_t)((cc + u) >> 32); | |||
} | |||
*(uint64_t *)(p->state.d + 48) = cc + 8; | |||
/* | |||
* Load state. | |||
* | |||
* Each __m256i holds the same ChaCha word for the eight lanes. | |||
* Words 0..3 are the constants and words 4..13 are sw[0..9], all | |||
* broadcast identically to every lane. Words 14 and 15 are sw[10] | |||
* and sw[11] XORed with the per-lane counter halves. init[] keeps | |||
* a copy for the final addition. | |||
*/ | |||
for (u = 0; u < 4; u ++) { | |||
state[u] = init[u] = | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)CW[u])); | |||
} | |||
for (u = 0; u < 10; u ++) { | |||
state[u + 4] = init[u + 4] = | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[u])); | |||
} | |||
state[14] = init[14] = _mm256_xor_si256( | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[10])), | |||
_mm256_loadu_si256((__m256i *)&t.w[0])); | |||
state[15] = init[15] = _mm256_xor_si256( | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[11])), | |||
_mm256_loadu_si256((__m256i *)&t.w[8])); | |||
/* | |||
* Do all rounds. | |||
* | |||
* Each of the 10 iterations applies QROUND to the four columns and | |||
* then to the four diagonals. QROUND is the add/xor/rotate quarter | |||
* round with left rotations by 16, 12, 8 and 7 bits, each rotation | |||
* being built from a shift-left/shift-right pair ORed together. | |||
*/ | |||
for (i = 0; i < 10; i ++) { | |||
#define QROUND(a, b, c, d) do { \ | |||
state[a] = _mm256_add_epi32(state[a], state[b]); \ | |||
state[d] = _mm256_xor_si256(state[d], state[a]); \ | |||
state[d] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[d], 16), \ | |||
_mm256_srli_epi32(state[d], 16)); \ | |||
state[c] = _mm256_add_epi32(state[c], state[d]); \ | |||
state[b] = _mm256_xor_si256(state[b], state[c]); \ | |||
state[b] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[b], 12), \ | |||
_mm256_srli_epi32(state[b], 20)); \ | |||
state[a] = _mm256_add_epi32(state[a], state[b]); \ | |||
state[d] = _mm256_xor_si256(state[d], state[a]); \ | |||
state[d] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[d], 8), \ | |||
_mm256_srli_epi32(state[d], 24)); \ | |||
state[c] = _mm256_add_epi32(state[c], state[d]); \ | |||
state[b] = _mm256_xor_si256(state[b], state[c]); \ | |||
state[b] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[b], 7), \ | |||
_mm256_srli_epi32(state[b], 25)); \ | |||
} while (0) | |||
QROUND( 0, 4, 8, 12); | |||
QROUND( 1, 5, 9, 13); | |||
QROUND( 2, 6, 10, 14); | |||
QROUND( 3, 7, 11, 15); | |||
QROUND( 0, 5, 10, 15); | |||
QROUND( 1, 6, 11, 12); | |||
QROUND( 2, 7, 8, 13); | |||
QROUND( 3, 4, 9, 14); | |||
#undef QROUND | |||
} | |||
/* | |||
* Add initial state back and encode the result in the destination | |||
* buffer. We can dump the AVX2 values "as is" because the non-AVX2 | |||
* code uses a compatible order of values. | |||
* | |||
* Vector u is stored at byte offset 32*u, so the buffer holds word 0 | |||
* of all eight lanes, then word 1 of all eight lanes, and so on. | |||
*/ | |||
for (u = 0; u < 16; u ++) { | |||
_mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], | |||
_mm256_add_epi32(state[u], init[u])); | |||
} | |||
/* Fresh buffer: the next read starts at its first byte. */
p->ptr = 0; | |||
} | |||
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) { | |||
uint8_t *buf; | |||
buf = dst; | |||
while (len > 0) { | |||
size_t clen; | |||
clen = (sizeof p->buf.d) - p->ptr; | |||
if (clen > len) { | |||
clen = len; | |||
} | |||
memcpy(buf, p->buf.d, clen); | |||
buf += clen; | |||
len -= clen; | |||
p->ptr += clen; | |||
if (p->ptr == sizeof p->buf.d) { | |||
PQCLEAN_FALCON1024_AVX2_prng_refill(p); | |||
} | |||
} | |||
} |
@@ -0,0 +1,853 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon signature verification. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* ===================================================================== */ | |||
/* | |||
* Constants for NTT. | |||
* | |||
* n = 2^logn (2 <= n <= 1024) | |||
* phi = X^n + 1 | |||
* q = 12289 | |||
* q0i = -1/q mod 2^16 | |||
* R = 2^16 mod q | |||
* R2 = 2^32 mod q | |||
*/ | |||
/* The modulus q. */
#define Q 12289 | |||
/* -1/q mod 2^16: Q * Q0I = 150994943 = 2304 * 2^16 - 1. */
#define Q0I 12287 | |||
/* 2^16 mod q: 65536 - 5 * 12289 = 4091. */
#define R 4091 | |||
/* 2^32 mod q: 4091 * 4091 = 16736281 = 1361 * 12289 + 10952. */
#define R2 10952 | |||
/* | |||
* Table for NTT, binary case: | |||
* GMb[x] = R*(g^rev(x)) mod q | |||
* where g = 7 (it is a 2048-th primitive root of 1 modulo q) | |||
* and rev() is the bit-reversal function over 10 bits. | |||
*/ | |||
static const uint16_t GMb[] = { | |||
4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759, | |||
1591, 6399, 9477, 5266, 586, 5825, 7538, 9710, | |||
1134, 6407, 1711, 965, 7099, 7674, 3743, 6442, | |||
10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180, | |||
12210, 6240, 997, 117, 4783, 4407, 1549, 7072, | |||
2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042, | |||
12189, 432, 10751, 1237, 7610, 1534, 3983, 7863, | |||
2181, 6308, 8720, 6570, 4843, 1690, 14, 3872, | |||
5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340, | |||
1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045, | |||
3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180, | |||
9277, 6130, 3323, 883, 10469, 489, 1502, 2851, | |||
11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195, | |||
730, 1762, 3854, 2030, 5892, 10922, 9020, 5274, | |||
9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446, | |||
7613, 9386, 834, 7703, 6836, 3403, 5351, 12276, | |||
3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525, | |||
10401, 2749, 7338, 10574, 6040, 943, 9330, 1477, | |||
6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680, | |||
8188, 6902, 3533, 9807, 6090, 727, 10099, 7003, | |||
6945, 1949, 9731, 10559, 6057, 378, 7871, 8763, | |||
8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821, | |||
5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159, | |||
1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188, | |||
737, 3698, 4699, 5753, 9046, 3687, 16, 914, | |||
5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381, | |||
10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357, | |||
6409, 8197, 2962, 6375, 5064, 6634, 5625, 278, | |||
932, 10229, 8927, 7642, 351, 9298, 237, 5858, | |||
7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204, | |||
4602, 1748, 11300, 340, 3711, 4614, 300, 10993, | |||
5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654, | |||
3835, 5553, 1224, 8476, 9237, 3845, 250, 11209, | |||
4225, 6326, 9680, 12254, 4136, 2778, 692, 8808, | |||
6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433, | |||
6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416, | |||
8418, 10824, 11986, 5733, 876, 7030, 2167, 2436, | |||
3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434, | |||
7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328, | |||
8557, 1867, 9454, 2416, 3816, 9076, 686, 5393, | |||
2523, 4339, 6115, 619, 937, 2834, 7775, 3279, | |||
2363, 7488, 6112, 5056, 824, 10204, 11690, 1113, | |||
2727, 9848, 896, 2028, 5075, 2654, 10464, 7884, | |||
12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520, | |||
1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399, | |||
11192, 315, 4511, 1158, 6061, 6751, 11865, 357, | |||
7367, 4550, 983, 8534, 8352, 10126, 7530, 9253, | |||
4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652, | |||
3374, 11477, 1753, 292, 8681, 2806, 10378, 12188, | |||
5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928, | |||
4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650, | |||
7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344, | |||
8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561, | |||
6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114, | |||
7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323, | |||
10438, 9471, 1271, 408, 6911, 3079, 360, 8276, | |||
11535, 9156, 9049, 11539, 850, 8617, 784, 7919, | |||
8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600, | |||
9779, 1012, 721, 2784, 6676, 6552, 5348, 4424, | |||
6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333, | |||
8801, 9661, 7308, 5788, 4910, 909, 11613, 4395, | |||
8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216, | |||
4296, 11918, 695, 4371, 9793, 4884, 2411, 10230, | |||
2650, 841, 3890, 10231, 7248, 8505, 11196, 6688, | |||
4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868, | |||
11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525, | |||
7938, 7982, 11977, 6755, 537, 4562, 1623, 8227, | |||
11453, 7544, 906, 11816, 9548, 10858, 9703, 2815, | |||
11736, 6813, 6979, 819, 8903, 6271, 10843, 348, | |||
7514, 8339, 6439, 694, 852, 5659, 2781, 3716, | |||
11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885, | |||
2978, 7289, 11884, 9123, 9323, 11830, 98, 2526, | |||
2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224, | |||
10871, 8092, 9651, 5989, 7140, 8480, 1670, 159, | |||
10923, 4918, 128, 7312, 725, 9157, 5006, 6393, | |||
3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668, | |||
3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365, | |||
5110, 45, 2400, 1921, 4377, 2720, 1695, 51, | |||
2808, 650, 1896, 9997, 9971, 11980, 8098, 4833, | |||
4135, 4257, 5838, 4765, 10985, 11532, 590, 12198, | |||
482, 12173, 2006, 7064, 10018, 3912, 12016, 10519, | |||
11362, 6954, 2210, 284, 5413, 6601, 3865, 10339, | |||
11188, 6231, 517, 9564, 11281, 3863, 1210, 4604, | |||
8160, 11447, 153, 7204, 5763, 5089, 9248, 12154, | |||
11748, 1354, 6672, 179, 5532, 2646, 5941, 12185, | |||
862, 3158, 477, 7279, 5678, 7914, 4254, 302, | |||
2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824, | |||
10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449, | |||
5159, 1308, 8315, 3404, 1877, 1231, 112, 6398, | |||
11724, 12272, 7286, 1459, 12274, 9896, 3456, 800, | |||
1397, 10678, 103, 7420, 7976, 936, 764, 632, | |||
7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946, | |||
6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139, | |||
4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850, | |||
7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217, | |||
10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711, | |||
2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729, | |||
4997, 7415, 6315, 12044, 4374, 7157, 4844, 211, | |||
8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875, | |||
8192, 986, 7527, 1401, 870, 3615, 8465, 2756, | |||
9770, 2034, 10168, 3264, 6132, 54, 2880, 4763, | |||
11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038, | |||
2567, 708, 893, 6465, 4962, 10024, 2090, 5718, | |||
10743, 780, 4733, 4623, 2134, 2087, 4802, 884, | |||
5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664, | |||
4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791, | |||
6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032, | |||
11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062, | |||
8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348, | |||
4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499, | |||
2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326, | |||
5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830, | |||
11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582, | |||
3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762, | |||
7496, 10383, 755, 1654, 12053, 4952, 10134, 4394, | |||
6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674, | |||
10358, 4901, 7414, 8771, 710, 6764, 8462, 7193, | |||
5371, 7274, 11084, 290, 7864, 6827, 11822, 2509, | |||
6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105, | |||
11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776, | |||
7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277, | |||
9182, 11456, 633, 10046, 11554, 5633, 9587, 2333, | |||
7008, 7084, 5047, 7199, 9865, 8997, 569, 6390, | |||
10845, 9679, 8268, 11472, 4203, 1997, 2, 9331, | |||
162, 6182, 2000, 3649, 9792, 6363, 7557, 6187, | |||
8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067, | |||
5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165, | |||
6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949 | |||
}; | |||
/* | |||
* Table for inverse NTT, binary case: | |||
* iGMb[x] = R*((1/g)^rev(x)) mod q | |||
* Since g = 7, 1/g = 8778 mod 12289. | |||
*/ | |||
static const uint16_t iGMb[] = { | |||
4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329, | |||
2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698, | |||
3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875, | |||
5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155, | |||
8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108, | |||
4426, 8306, 10755, 4679, 11052, 1538, 11857, 100, | |||
8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460, | |||
5217, 10740, 7882, 7506, 12172, 11292, 6049, 79, | |||
13, 6938, 8886, 5453, 4586, 11455, 2903, 4676, | |||
9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110, | |||
7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559, | |||
11094, 2211, 1808, 7319, 48, 9547, 2560, 1228, | |||
9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012, | |||
6109, 2796, 2203, 1652, 711, 7004, 1053, 8973, | |||
5244, 1517, 9322, 11269, 900, 3888, 11133, 10736, | |||
4949, 7616, 9974, 4746, 10270, 126, 2921, 6720, | |||
6635, 6543, 1582, 4868, 42, 673, 2240, 7219, | |||
1296, 11989, 7675, 8578, 11949, 989, 10541, 7687, | |||
7085, 8487, 1004, 10236, 4703, 163, 9143, 4597, | |||
6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357, | |||
12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880, | |||
6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556, | |||
6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103, | |||
11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552, | |||
6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822, | |||
9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609, | |||
3468, 4659, 625, 2700, 7738, 3443, 3060, 3388, | |||
3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344, | |||
5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101, | |||
4609, 8605, 8226, 144, 5656, 8704, 2621, 5424, | |||
10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888, | |||
3764, 39, 8219, 2080, 2502, 1469, 10550, 8709, | |||
5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639, | |||
2059, 9878, 7405, 2496, 7918, 11594, 371, 7993, | |||
3073, 10326, 40, 10004, 9245, 7987, 5603, 4051, | |||
7894, 676, 11380, 7379, 6501, 4981, 2628, 3488, | |||
10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473, | |||
7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510, | |||
6689, 386, 4462, 105, 2076, 10443, 119, 3955, | |||
4370, 11505, 3672, 11439, 750, 3240, 3133, 754, | |||
4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851, | |||
4966, 8181, 2688, 6205, 6814, 926, 2936, 4327, | |||
10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255, | |||
728, 7569, 6056, 10432, 11036, 2452, 2811, 3787, | |||
945, 8998, 1244, 8815, 11017, 11218, 5894, 4325, | |||
4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707, | |||
1361, 9812, 2949, 11265, 10301, 9108, 478, 6489, | |||
101, 1911, 9483, 3608, 11997, 10536, 812, 8915, | |||
637, 8159, 5299, 9128, 3512, 8290, 7068, 7922, | |||
3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922, | |||
11932, 424, 5538, 6228, 11131, 7778, 11974, 1097, | |||
2890, 10027, 2569, 2250, 2352, 821, 2550, 11016, | |||
7769, 136, 617, 3157, 5889, 9219, 6855, 120, | |||
4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562, | |||
11176, 599, 2085, 11465, 7233, 6177, 4801, 9926, | |||
9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766, | |||
6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732, | |||
7961, 1457, 10857, 8069, 832, 1628, 3410, 4900, | |||
10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847, | |||
9853, 10122, 5259, 11413, 6556, 303, 1465, 3871, | |||
4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852, | |||
3856, 928, 4933, 8530, 1871, 2184, 5571, 5879, | |||
3481, 11597, 9511, 8153, 35, 2609, 5963, 8064, | |||
1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454, | |||
2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028, | |||
2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795, | |||
9222, 10837, 280, 8583, 3270, 6753, 2354, 3779, | |||
6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127, | |||
2958, 12287, 10292, 8086, 817, 4021, 2610, 1444, | |||
5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281, | |||
9956, 2702, 6656, 735, 2243, 11656, 833, 3107, | |||
6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278, | |||
3513, 9769, 3025, 779, 9433, 3392, 7437, 668, | |||
10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711, | |||
9780, 467, 5462, 4425, 11999, 1205, 5015, 6918, | |||
5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931, | |||
6615, 1541, 8708, 260, 3385, 4792, 4391, 5697, | |||
7895, 2155, 7337, 236, 10635, 11534, 1906, 4793, | |||
9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556, | |||
707, 1088, 4936, 678, 10245, 18, 5684, 960, | |||
4459, 7957, 226, 2451, 6, 8874, 320, 6298, | |||
8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876, | |||
9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679, | |||
7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378, | |||
5227, 952, 4319, 9810, 4356, 3088, 11118, 840, | |||
6257, 486, 6000, 1342, 10382, 6017, 4798, 5489, | |||
4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037, | |||
1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917, | |||
11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546, | |||
6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722, | |||
2251, 11199, 5356, 7408, 2861, 4003, 9215, 484, | |||
7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519, | |||
9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097, | |||
2414, 6496, 9953, 10554, 808, 2999, 2130, 4286, | |||
12078, 7445, 5132, 7915, 245, 5974, 4874, 7292, | |||
7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022, | |||
9578, 8934, 11074, 9498, 294, 4711, 3391, 1377, | |||
9072, 10189, 4569, 10890, 9909, 6923, 53, 4653, | |||
439, 10253, 7028, 10207, 8343, 1141, 2556, 7601, | |||
8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765, | |||
10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293, | |||
11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892, | |||
11489, 8833, 2393, 15, 10830, 5003, 17, 565, | |||
5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130, | |||
5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020, | |||
2465, 8191, 384, 2642, 2729, 5399, 2175, 9396, | |||
11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427, | |||
104, 6348, 9643, 6757, 12110, 5617, 10935, 541, | |||
135, 3041, 7200, 6526, 5085, 12136, 842, 4129, | |||
7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101, | |||
1950, 8424, 5688, 6876, 12005, 10079, 5335, 927, | |||
1770, 273, 8377, 2271, 5225, 10283, 116, 11807, | |||
91, 11699, 757, 1304, 7524, 6451, 8032, 8154, | |||
7456, 4191, 309, 2318, 2292, 10393, 11639, 9481, | |||
12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179, | |||
3924, 3188, 367, 2077, 336, 5384, 5631, 8596, | |||
4621, 1775, 8866, 451, 6108, 1317, 6246, 8795, | |||
5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366, | |||
12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418, | |||
10065, 4156, 8373, 8644, 10445, 882, 8158, 10173, | |||
9763, 12191, 459, 2966, 3166, 405, 5000, 9311, | |||
6404, 8986, 1551, 8175, 3630, 10766, 9265, 700, | |||
8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775, | |||
11941, 1446, 6018, 3386, 11470, 5310, 5476, 553, | |||
9474, 2586, 1431, 2741, 473, 11383, 4745, 836, | |||
4062, 10666, 7727, 11752, 5534, 312, 4307, 4351, | |||
5764, 8679, 8381, 8187, 5, 7395, 4363, 1152, | |||
5421, 5231, 6473, 436, 7567, 8603, 6229, 8230 | |||
}; | |||
/* | |||
* Reduce a small signed integer modulo q. The source integer MUST | |||
* be between -q/2 and +q/2. | |||
*/ | |||
static inline uint32_t | |||
mq_conv_small(int x) { | |||
/* | |||
* If x < 0, the cast to uint32_t will set the high bit to 1. | |||
*/ | |||
uint32_t y; | |||
y = (uint32_t)x; | |||
y += Q & -(y >> 31); | |||
return y; | |||
} | |||
/* | |||
* Addition modulo q. Operands must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_add(uint32_t x, uint32_t y) { | |||
/* | |||
* We compute x + y - q. If the result is negative, then the | |||
* high bit will be set, and 'd >> 31' will be equal to 1; | |||
* thus '-(d >> 31)' will be an all-one pattern. Otherwise, | |||
* it will be an all-zero pattern. In other words, this | |||
* implements a conditional addition of q. | |||
*/ | |||
uint32_t d; | |||
d = x + y - Q; | |||
d += Q & -(d >> 31); | |||
return d; | |||
} | |||
/* | |||
* Subtraction modulo q. Operands must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_sub(uint32_t x, uint32_t y) { | |||
/* | |||
* As in mq_add(), we use a conditional addition to ensure the | |||
* result is in the 0..q-1 range. | |||
*/ | |||
uint32_t d; | |||
d = x - y; | |||
d += Q & -(d >> 31); | |||
return d; | |||
} | |||
/* | |||
* Division by 2 modulo q. Operand must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_rshift1(uint32_t x) { | |||
x += Q & -(x & 1); | |||
return (x >> 1); | |||
} | |||
/* | |||
* Montgomery multiplication modulo q. If we set R = 2^16 mod q, then | |||
* this function computes: x * y / R mod q | |||
* Operands must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_montymul(uint32_t x, uint32_t y) { | |||
uint32_t z, w; | |||
/* | |||
* We compute x*y + k*q with a value of k chosen so that the 16 | |||
* low bits of the result are 0. We can then shift the value. | |||
* After the shift, result may still be larger than q, but it | |||
* will be lower than 2*q, so a conditional subtraction works. | |||
*/ | |||
z = x * y; | |||
w = ((z * Q0I) & 0xFFFF) * Q; | |||
/* | |||
* When adding z and w, the result will have its low 16 bits | |||
* equal to 0. Since x, y and z are lower than q, the sum will | |||
* be no more than (2^15 - 1) * q + (q - 1)^2, which will | |||
* fit on 29 bits. | |||
*/ | |||
z = (z + w) >> 16; | |||
/* | |||
* After the shift, analysis shows that the value will be less | |||
* than 2q. We do a subtraction then conditional subtraction to | |||
* ensure the result is in the expected range. | |||
*/ | |||
z -= Q; | |||
z += Q & -(z >> 31); | |||
return z; | |||
} | |||
/*
 * Montgomery squaring: returns (x * x) / R mod q, computed as a
 * Montgomery multiplication of x with itself.
 */
static inline uint32_t
mq_montysqr(uint32_t x) {
    uint32_t sq = mq_montymul(x, x);

    return sq;
}
/* | |||
* Divide x by y modulo q = 12289. | |||
*/ | |||
static inline uint32_t | |||
mq_div_12289(uint32_t x, uint32_t y) { | |||
/* | |||
* We invert y by computing y^(q-2) mod q. | |||
* | |||
* We use the following addition chain for exponent e = 12287: | |||
* | |||
* e0 = 1 | |||
* e1 = 2 * e0 = 2 | |||
* e2 = e1 + e0 = 3 | |||
* e3 = e2 + e1 = 5 | |||
* e4 = 2 * e3 = 10 | |||
* e5 = 2 * e4 = 20 | |||
* e6 = 2 * e5 = 40 | |||
* e7 = 2 * e6 = 80 | |||
* e8 = 2 * e7 = 160 | |||
* e9 = e8 + e2 = 163 | |||
* e10 = e9 + e8 = 323 | |||
* e11 = 2 * e10 = 646 | |||
* e12 = 2 * e11 = 1292 | |||
* e13 = e12 + e9 = 1455 | |||
* e14 = 2 * e13 = 2910 | |||
* e15 = 2 * e14 = 5820 | |||
* e16 = e15 + e10 = 6143 | |||
* e17 = 2 * e16 = 12286 | |||
* e18 = e17 + e0 = 12287 | |||
* | |||
* Additions on exponents are converted to Montgomery | |||
* multiplications. We define all intermediate results as so | |||
* many local variables, and let the C compiler work out which | |||
* must be kept around. | |||
*/ | |||
uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; | |||
uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; | |||
y0 = mq_montymul(y, R2); | |||
y1 = mq_montysqr(y0); | |||
y2 = mq_montymul(y1, y0); | |||
y3 = mq_montymul(y2, y1); | |||
y4 = mq_montysqr(y3); | |||
y5 = mq_montysqr(y4); | |||
y6 = mq_montysqr(y5); | |||
y7 = mq_montysqr(y6); | |||
y8 = mq_montysqr(y7); | |||
y9 = mq_montymul(y8, y2); | |||
y10 = mq_montymul(y9, y8); | |||
y11 = mq_montysqr(y10); | |||
y12 = mq_montysqr(y11); | |||
y13 = mq_montymul(y12, y9); | |||
y14 = mq_montysqr(y13); | |||
y15 = mq_montysqr(y14); | |||
y16 = mq_montymul(y15, y10); | |||
y17 = mq_montysqr(y16); | |||
y18 = mq_montymul(y17, y0); | |||
/* | |||
* Final multiplication with x, which is not in Montgomery | |||
* representation, computes the correct division result. | |||
*/ | |||
return mq_montymul(y18, x); | |||
} | |||
/* | |||
* Compute NTT on a ring element. | |||
*/ | |||
static void | |||
mq_NTT(uint16_t *a, unsigned logn) { | |||
size_t n, t, m; | |||
n = (size_t)1 << logn; | |||
t = n; | |||
for (m = 1; m < n; m <<= 1) { | |||
size_t ht, i, j1; | |||
ht = t >> 1; | |||
for (i = 0, j1 = 0; i < m; i ++, j1 += t) { | |||
size_t j, j2; | |||
uint32_t s; | |||
s = GMb[m + i]; | |||
j2 = j1 + ht; | |||
for (j = j1; j < j2; j ++) { | |||
uint32_t u, v; | |||
u = a[j]; | |||
v = mq_montymul(a[j + ht], s); | |||
a[j] = (uint16_t)mq_add(u, v); | |||
a[j + ht] = (uint16_t)mq_sub(u, v); | |||
} | |||
} | |||
t = ht; | |||
} | |||
} | |||
/* | |||
* Compute the inverse NTT on a ring element, binary case. | |||
*/ | |||
static void | |||
mq_iNTT(uint16_t *a, unsigned logn) { | |||
size_t n, t, m; | |||
uint32_t ni; | |||
n = (size_t)1 << logn; | |||
t = 1; | |||
m = n; | |||
while (m > 1) { | |||
size_t hm, dt, i, j1; | |||
hm = m >> 1; | |||
dt = t << 1; | |||
for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { | |||
size_t j, j2; | |||
uint32_t s; | |||
j2 = j1 + t; | |||
s = iGMb[hm + i]; | |||
for (j = j1; j < j2; j ++) { | |||
uint32_t u, v, w; | |||
u = a[j]; | |||
v = a[j + t]; | |||
a[j] = (uint16_t)mq_add(u, v); | |||
w = mq_sub(u, v); | |||
a[j + t] = (uint16_t) | |||
mq_montymul(w, s); | |||
} | |||
} | |||
t = dt; | |||
m = hm; | |||
} | |||
/* | |||
* To complete the inverse NTT, we must now divide all values by | |||
* n (the vector size). We thus need the inverse of n, i.e. we | |||
* need to divide 1 by 2 logn times. But we also want it in | |||
* Montgomery representation, i.e. we also want to multiply it | |||
* by R = 2^16. In the common case, this should be a simple right | |||
* shift. The loop below is generic and works also in corner cases; | |||
* its computation time is negligible. | |||
*/ | |||
ni = R; | |||
for (m = n; m > 1; m >>= 1) { | |||
ni = mq_rshift1(ni); | |||
} | |||
for (m = 0; m < n; m ++) { | |||
a[m] = (uint16_t)mq_montymul(a[m], ni); | |||
} | |||
} | |||
/* | |||
* Convert a polynomial (mod q) to Montgomery representation. | |||
*/ | |||
static void | |||
mq_poly_tomonty(uint16_t *f, unsigned logn) { | |||
size_t u, n; | |||
n = (size_t)1 << logn; | |||
for (u = 0; u < n; u ++) { | |||
f[u] = (uint16_t)mq_montymul(f[u], R2); | |||
} | |||
} | |||
/*
 * Coefficient-wise Montgomery product of two polynomials given in
 * NTT representation. The product f*g overwrites f; g is only read.
 */
static void
mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
    size_t remaining = (size_t)1 << logn;

    while (remaining-- > 0) {
        *f = (uint16_t)mq_montymul(*f, *g);
        f++;
        g++;
    }
}
/*
 * Subtract polynomial g from polynomial f, coefficient by
 * coefficient modulo q. The difference overwrites f.
 */
static void
mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
    const size_t count = (size_t)1 << logn;
    size_t i;

    for (i = 0; i < count; ++i) {
        uint32_t d = mq_sub(f[i], g[i]);

        f[i] = (uint16_t)d;
    }
}
/* ===================================================================== */ | |||
/* see inner.h */
/*
 * Convert h[] (2^logn values) in place: first to NTT representation,
 * then to Montgomery representation.
 */
void
PQCLEAN_FALCON1024_AVX2_to_ntt_monty(uint16_t *h, unsigned logn) {
    /* Step 1: forward NTT. */
    mq_NTT(h, logn);

    /* Step 2: scale every coefficient into Montgomery form. */
    mq_poly_tomonty(h, logn);
}
/* see inner.h */ | |||
int | |||
PQCLEAN_FALCON1024_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2, | |||
const uint16_t *h, unsigned logn, uint8_t *tmp) { | |||
size_t u, n; | |||
uint16_t *tt; | |||
n = (size_t)1 << logn; | |||
tt = (uint16_t *)tmp; | |||
/* | |||
* Reduce s2 elements modulo q ([0..q-1] range). | |||
*/ | |||
for (u = 0; u < n; u ++) { | |||
uint32_t w; | |||
w = (uint32_t)s2[u]; | |||
w += Q & -(w >> 31); | |||
tt[u] = (uint16_t)w; | |||
} | |||
/* | |||
* Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). | |||
*/ | |||
mq_NTT(tt, logn); | |||
mq_poly_montymul_ntt(tt, h, logn); | |||
mq_iNTT(tt, logn); | |||
mq_poly_sub(tt, c0, logn); | |||
/* | |||
* Normalize -s1 elements into the [-q/2..q/2] range. | |||
*/ | |||
for (u = 0; u < n; u ++) { | |||
int32_t w; | |||
w = (int32_t)tt[u]; | |||
w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); | |||
((int16_t *)tt)[u] = (int16_t)w; | |||
} | |||
/* | |||
* Signature is valid if and only if the aggregate (-s1,s2) vector | |||
* is short enough. | |||
*/ | |||
return PQCLEAN_FALCON1024_AVX2_is_short((int16_t *)tt, s2, logn); | |||
} | |||
/* see inner.h */
/*
 * Compute the public key h = g/f mod phi mod q.
 *
 *   h     output, 2^logn values modulo q
 *   f, g  small private polynomials (2^logn signed bytes each)
 *   logn  base-2 logarithm of the degree
 *   tmp   scratch space, accessed as 2^logn uint16_t values
 *
 * Returns 1 on success, or 0 as soon as a zero NTT coefficient of f
 * is found (f is then not invertible modulo q).
 */
int
PQCLEAN_FALCON1024_AVX2_compute_public(uint16_t *h,
                                       const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
    const size_t n = (size_t)1 << logn;
    uint16_t *fq = (uint16_t *)tmp;
    size_t i;

    /* Lift both polynomials to the 0..q-1 range. */
    for (i = 0; i < n; i++) {
        fq[i] = (uint16_t)mq_conv_small(f[i]);
        h[i] = (uint16_t)mq_conv_small(g[i]);
    }

    mq_NTT(h, logn);
    mq_NTT(fq, logn);

    /* In the NTT domain, polynomial division is coefficient-wise. */
    for (i = 0; i < n; i++) {
        uint32_t denom = fq[i];

        if (denom == 0) {
            return 0;
        }
        h[i] = (uint16_t)mq_div_12289(h[i], denom);
    }

    mq_iNTT(h, logn);
    return 1;
}
/* see inner.h */ | |||
int | |||
PQCLEAN_FALCON1024_AVX2_complete_private(int8_t *G, | |||
const int8_t *f, const int8_t *g, const int8_t *F, | |||
unsigned logn, uint8_t *tmp) { | |||
size_t u, n; | |||
uint16_t *t1, *t2; | |||
n = (size_t)1 << logn; | |||
t1 = (uint16_t *)tmp; | |||
t2 = t1 + n; | |||
for (u = 0; u < n; u ++) { | |||
t1[u] = (uint16_t)mq_conv_small(g[u]); | |||
t2[u] = (uint16_t)mq_conv_small(F[u]); | |||
} | |||
mq_NTT(t1, logn); | |||
mq_NTT(t2, logn); | |||
mq_poly_tomonty(t1, logn); | |||
mq_poly_montymul_ntt(t1, t2, logn); | |||
for (u = 0; u < n; u ++) { | |||
t2[u] = (uint16_t)mq_conv_small(f[u]); | |||
} | |||
mq_NTT(t2, logn); | |||
for (u = 0; u < n; u ++) { | |||
if (t2[u] == 0) { | |||
return 0; | |||
} | |||
t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); | |||
} | |||
mq_iNTT(t1, logn); | |||
for (u = 0; u < n; u ++) { | |||
uint32_t w; | |||
int32_t gi; | |||
w = t1[u]; | |||
w -= (Q & ~ -((w - (Q >> 1)) >> 31)); | |||
gi = *(int32_t *)&w; | |||
if (gi < -127 || gi > +127) { | |||
return 0; | |||
} | |||
G[u] = (int8_t)gi; | |||
} | |||
return 1; | |||
} | |||
/* see inner.h */
/*
 * Check whether s2 is invertible modulo phi and q.
 *
 *   s2    polynomial to test (2^logn signed values)
 *   logn  base-2 logarithm of the degree
 *   tmp   scratch space, accessed as 2^logn uint16_t values
 *
 * Returns 1 if no NTT coefficient of s2 is zero, 0 otherwise. The
 * scan accumulates into a flag rather than branching on coefficient
 * values.
 */
int
PQCLEAN_FALCON1024_AVX2_is_invertible(
    const int16_t *s2, unsigned logn, uint8_t *tmp) {
    size_t u, n;
    uint16_t *tt;
    uint32_t r;

    n = (size_t)1 << logn;
    tt = (uint16_t *)tmp;

    /* Reduce s2 modulo q with the shared helper. */
    for (u = 0; u < n; u++) {
        tt[u] = (uint16_t)mq_conv_small(s2[u]);
    }
    mq_NTT(tt, logn);

    /*
     * tt[u] - 1 is negative only when tt[u] is zero; OR-ing all of
     * them leaves bit 31 of r set iff at least one coefficient is 0.
     */
    r = 0;
    for (u = 0; u < n; u++) {
        r |= (uint32_t)(tt[u] - 1);
    }
    return (int)(1u - (r >> 31));
}
/* see inner.h */
/*
 * Rebuild a public key from a signature: h = (c0 - s1) / s2.
 *
 *   h       output, 2^logn values modulo q
 *   c0      hashed message point (2^logn values modulo q)
 *   s1, s2  signature polynomials (2^logn signed values each)
 *   logn    base-2 logarithm of the degree
 *   tmp     scratch space, accessed as 2^logn uint16_t values
 *
 * Returns 1 if s2 has no zero NTT coefficient and
 * PQCLEAN_FALCON1024_AVX2_is_short(s1, s2) reports success; returns
 * 0 otherwise. h[] is written in both cases.
 */
int
PQCLEAN_FALCON1024_AVX2_verify_recover(uint16_t *h,
                                       const uint16_t *c0, const int16_t *s1, const int16_t *s2,
                                       unsigned logn, uint8_t *tmp) {
    size_t u, n;
    uint16_t *tt;
    uint32_t r;

    n = (size_t)1 << logn;

    /*
     * Reduce elements of s1 and s2 modulo q (shared helper); then
     * write s2 into tt[] and c0 - s1 into h[].
     */
    tt = (uint16_t *)tmp;
    for (u = 0; u < n; u++) {
        tt[u] = (uint16_t)mq_conv_small(s2[u]);
        h[u] = (uint16_t)mq_sub(c0[u], mq_conv_small(s1[u]));
    }

    /*
     * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
     * is zero (in NTT representation) then the operation fails. We
     * keep that information into a flag so that we do not deviate
     * from strict constant-time processing; if all coefficients of
     * s2 are non-zero, then the high bit of r will be zero.
     */
    mq_NTT(tt, logn);
    mq_NTT(h, logn);
    r = 0;
    for (u = 0; u < n; u++) {
        r |= (uint32_t)(tt[u] - 1);
        h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
    }
    mq_iNTT(h, logn);

    /*
     * Signature is acceptable if and only if it is short enough,
     * and s2 was invertible mod phi mod q. The caller must still
     * check that the rebuilt public key matches the expected
     * value (e.g. through a hash).
     */
    r = ~r & (uint32_t)(-PQCLEAN_FALCON1024_AVX2_is_short(s1, s2, logn));
    return (int)(r >> 31);
}
/* see inner.h */
/*
 * Count the NTT coefficients of a polynomial that are equal to zero.
 *
 *   sig   polynomial to inspect (2^logn signed values)
 *   logn  base-2 logarithm of the degree
 *   tmp   scratch space, accessed as 2^logn uint16_t values
 *
 * Returns the number of zero coefficients found after the NTT.
 */
int
PQCLEAN_FALCON1024_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
    uint16_t *s2;
    size_t u, n;
    uint32_t r;

    n = (size_t)1 << logn;
    s2 = (uint16_t *)tmp;

    /* Reduce the input modulo q with the shared helper. */
    for (u = 0; u < n; u++) {
        s2[u] = (uint16_t)mq_conv_small(sig[u]);
    }
    mq_NTT(s2, logn);

    /*
     * s2[u] - 1 wraps to a value with bit 31 set only when s2[u]
     * is zero, so summing that bit counts the zero coefficients.
     */
    r = 0;
    for (u = 0; u < n; u++) {
        uint32_t w;

        w = (uint32_t)s2[u] - 1u;
        r += (w >> 31);
    }
    return (int)r;
}
@@ -1,3 +1,4 @@ | |||
\ | |||
MIT License | |||
Copyright (c) 2017-2019 Falcon Project | |||
@@ -20,3 +21,4 @@ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
@@ -1,10 +1,10 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libfalcon-1024_clean.a | |||
LIB=libfalcon1024_clean.a | |||
SOURCES = codec.c common.c fft.c fpr.c keygen.c pqclean.c rng.c sign.c vrfy.c | |||
OBJECTS = codec.o common.o fft.o fpr.o keygen.o pqclean.o rng.o sign.o vrfy.o | |||
HEADERS = api.h fpr.h inner.h | |||
SOURCES = codec.c common.c fft.c fpr.c keygen.c pqclean.c rng.c sign.c vrfy.c | |||
OBJECTS = codec.o common.o fft.o fpr.o keygen.o pqclean.o rng.o sign.o vrfy.o | |||
HEADERS = api.h fpr.h inner.h | |||
CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -1,8 +1,8 @@ | |||
# This Makefile can be used with Microsoft Visual Studio's nmake using the command: | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libfalcon-1024_clean.lib | |||
OBJECTS=codec.obj common.obj fft.obj fpr.obj keygen.obj pqclean.obj rng.obj sign.obj vrfy.obj | |||
LIBRARY=libfalcon1024_clean.lib | |||
OBJECTS=codec.obj common.obj fft.obj fpr.obj keygen.obj pqclean.obj rng.obj sign.obj vrfy.obj | |||
# Warning C4146 is raised when a unary minus operator is applied to an | |||
# unsigned type; this has nonetheless been standard and portable for as | |||
@@ -16,7 +16,7 @@ all: $(LIBRARY) | |||
$(OBJECTS): *.h | |||
$(LIBRARY): $(OBJECTS) | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
clean: | |||
-DEL $(OBJECTS) | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Encoding/decoding of keys and signatures. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* see inner.h */ | |||
size_t | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Support functions for signatures (hash-to-point, norm). | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* see inner.h */ | |||
void | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* FFT code. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* | |||
* Rules for complex number macros: | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Floating-point operations. | |||
* | |||
@@ -32,7 +34,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* | |||
@@ -1631,4 +1632,3 @@ const fpr fpr_p2_tab[] = { | |||
4571153621781053440U, | |||
4566650022153682944U | |||
}; | |||
@@ -1,3 +1,6 @@ | |||
#ifndef PQCLEAN_FALCON1024_CLEAN_FPR_H | |||
#define PQCLEAN_FALCON1024_CLEAN_FPR_H | |||
/* | |||
* Floating-point operations. | |||
* | |||
@@ -467,4 +470,4 @@ extern const fpr fpr_gm_tab[]; | |||
extern const fpr fpr_p2_tab[]; | |||
/* ====================================================================== */ | |||
#endif |
@@ -1,5 +1,6 @@ | |||
#ifndef FALCON_INNER_H__ | |||
#define FALCON_INNER_H__ | |||
#ifndef PQCLEAN_FALCON1024_CLEAN_INNER_H | |||
#define PQCLEAN_FALCON1024_CLEAN_INNER_H | |||
/* | |||
* Internal functions for Falcon. This is not the API intended to be | |||
@@ -72,8 +73,8 @@ | |||
* proper, or integer-based emulation is used, the set_fpu_cw() | |||
* function does nothing, so it can be called systematically. | |||
*/ | |||
#include "fips202.h" | |||
#include "fpr.h" | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
@@ -115,7 +116,6 @@ set_fpu_cw(unsigned x) { | |||
*/ | |||
#include "fips202.h" | |||
#define inner_shake256_context shake256incctx | |||
#define inner_shake256_init(sc) shake256_inc_init(sc) | |||
@@ -438,7 +438,6 @@ int PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h, | |||
* fpr fpr_mtwo63m1 -(2^63-1) | |||
* fpr fpr_ptwo63 2^63 | |||
*/ | |||
#include "fpr.h" | |||
/* ==================================================================== */ | |||
/* | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon key pair generation. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
#define MKN(logn) ((size_t)1 << (logn)) | |||
@@ -2207,7 +2208,6 @@ get_rng_u64(inner_shake256_context *rng) { | |||
| ((uint64_t)tmp[7] << 56); | |||
} | |||
/* | |||
* Table below incarnates a discrete Gaussian distribution: | |||
* D(x) = exp(-(x^2)/(2*sigma^2)) | |||
@@ -1,16 +1,16 @@ | |||
#include "api.h" | |||
#include "inner.h" | |||
#include "randombytes.h" | |||
#include <stddef.h> | |||
#include <string.h> | |||
/* | |||
* Wrapper for implementing the PQClean API. | |||
*/ | |||
#include <stddef.h> | |||
#include <string.h> | |||
#include "api.h" | |||
#include "inner.h" | |||
#define NONCELEN 40 | |||
#include "randombytes.h" | |||
#define SEEDLEN 48 | |||
/* | |||
* Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024) | |||
@@ -41,19 +41,19 @@ | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair( | |||
uint8_t *pk, uint8_t *sk) { | |||
PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { | |||
union { | |||
uint8_t b[FALCON_KEYGEN_TEMP_10]; | |||
uint8_t b[28 * 1024]; | |||
uint64_t dummy_u64; | |||
fpr dummy_fpr; | |||
} tmp; | |||
int8_t f[1024], g[1024], F[1024]; | |||
int8_t f[1024], g[1024], F[1024], G[1024]; | |||
uint16_t h[1024]; | |||
unsigned char seed[48]; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context rng; | |||
size_t u, v; | |||
/* | |||
* Generate key pair. | |||
*/ | |||
@@ -61,7 +61,7 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair( | |||
inner_shake256_init(&rng); | |||
inner_shake256_inject(&rng, seed, sizeof seed); | |||
inner_shake256_flip(&rng); | |||
PQCLEAN_FALCON1024_CLEAN_keygen(&rng, f, g, F, NULL, h, 10, tmp.b); | |||
PQCLEAN_FALCON1024_CLEAN_keygen(&rng, f, g, F, G, h, 10, tmp.b); | |||
inner_shake256_ctx_release(&rng); | |||
/* | |||
@@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, | |||
int16_t sig[1024]; | |||
uint16_t hm[1024]; | |||
} r; | |||
unsigned char seed[48]; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context sc; | |||
size_t u, v; | |||
@@ -174,6 +174,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, | |||
return -1; | |||
} | |||
/* | |||
* Create a random nonce (40 bytes). | |||
*/ | |||
@@ -186,7 +187,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, | |||
inner_shake256_inject(&sc, nonce, NONCELEN); | |||
inner_shake256_inject(&sc, m, mlen); | |||
inner_shake256_flip(&sc); | |||
PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, r.hm, 10, tmp.b); | |||
PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(&sc, r.hm, 10); | |||
inner_shake256_ctx_release(&sc); | |||
/* | |||
@@ -279,11 +280,11 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_signature( | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
/* | |||
* The PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES constant is used for | |||
* the signed message object (as produced by crypto_sign()) | |||
* the signed message object (as produced by PQCLEAN_FALCON1024_CLEAN_crypto_sign()) | |||
* and includes a two-byte length value, so we take care here | |||
* to only generate signatures that are two bytes shorter than | |||
* the maximum. This is done to ensure that crypto_sign() | |||
* and crypto_sign_signature() produce the exact same signature | |||
* the maximum. This is done to ensure that PQCLEAN_FALCON1024_CLEAN_crypto_sign() | |||
* and PQCLEAN_FALCON1024_CLEAN_crypto_sign_signature() produce the exact same signature | |||
* value, if used on the same message, with the same private key, | |||
* and using the same output from randombytes() (this is for | |||
* reproducibility of tests). | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
#include <assert.h> | |||
/* | |||
* PRNG and interface to the system RNG. | |||
* | |||
@@ -29,10 +31,22 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include <assert.h> | |||
#include "inner.h" | |||
/* | |||
* Include relevant system header files. For Win32, this will also need | |||
* linking with advapi32.dll, which we trigger with an appropriate #pragma. | |||
*/ | |||
/*
 * Seed provider stub (declared in inner.h).
 *
 * The seed buffer is never written. The function returns 1 only when
 * zero bytes are requested, and 0 for any non-zero length.
 */
int
PQCLEAN_FALCON1024_CLEAN_get_seed(void *seed, size_t len) {
    (void)seed;
    return (len == 0) ? 1 : 0;
}
/* see inner.h */ | |||
void | |||
@@ -46,9 +60,6 @@ PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) { | |||
uint64_t th, tl; | |||
int i; | |||
uint32_t *d32 = (uint32_t *) p->state.d; | |||
uint64_t *d64 = (uint64_t *) p->state.d; | |||
inner_shake256_extract(src, tmp, 56); | |||
for (i = 0; i < 14; i ++) { | |||
uint32_t w; | |||
@@ -57,11 +68,11 @@ PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) { | |||
| ((uint32_t)tmp[(i << 2) + 1] << 8) | |||
| ((uint32_t)tmp[(i << 2) + 2] << 16) | |||
| ((uint32_t)tmp[(i << 2) + 3] << 24); | |||
d32[i] = w; | |||
*(uint32_t *)(p->state.d + (i << 2)) = w; | |||
} | |||
tl = d32[48 / sizeof(uint32_t)]; | |||
th = d32[52 / sizeof(uint32_t)]; | |||
d64[48 / sizeof(uint64_t)] = tl + (th << 32); | |||
tl = *(uint32_t *)(p->state.d + 48); | |||
th = *(uint32_t *)(p->state.d + 52); | |||
*(uint64_t *)(p->state.d + 48) = tl + (th << 32); | |||
PQCLEAN_FALCON1024_CLEAN_prng_refill(p); | |||
} | |||
@@ -88,14 +99,12 @@ PQCLEAN_FALCON1024_CLEAN_prng_refill(prng *p) { | |||
uint64_t cc; | |||
size_t u; | |||
uint32_t *d32 = (uint32_t *) p->state.d; | |||
uint64_t *d64 = (uint64_t *) p->state.d; | |||
/* | |||
* State uses local endianness. Only the output bytes must be | |||
* converted to little endian (if used on a big-endian machine). | |||
*/ | |||
cc = d64[48 / sizeof(uint64_t)]; | |||
cc = *(uint64_t *)(p->state.d + 48); | |||
for (u = 0; u < 8; u ++) { | |||
uint32_t state[16]; | |||
size_t v; | |||
@@ -139,10 +148,12 @@ PQCLEAN_FALCON1024_CLEAN_prng_refill(prng *p) { | |||
state[v] += CW[v]; | |||
} | |||
for (v = 4; v < 14; v ++) { | |||
state[v] += d32[v - 4]; | |||
state[v] += ((uint32_t *)p->state.d)[v - 4]; | |||
} | |||
state[14] += d32[10] ^ (uint32_t)cc; | |||
state[15] += d32[11] ^ (uint32_t)(cc >> 32); | |||
state[14] += ((uint32_t *)p->state.d)[10] | |||
^ (uint32_t)cc; | |||
state[15] += ((uint32_t *)p->state.d)[11] | |||
^ (uint32_t)(cc >> 32); | |||
cc ++; | |||
/* | |||
@@ -160,7 +171,7 @@ PQCLEAN_FALCON1024_CLEAN_prng_refill(prng *p) { | |||
(uint8_t)(state[v] >> 24); | |||
} | |||
} | |||
d64[48 / sizeof(uint64_t)] = cc; | |||
*(uint64_t *)(p->state.d + 48) = cc; | |||
p->ptr = 0; | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon signature generation. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* =================================================================== */ | |||
@@ -1081,8 +1082,8 @@ BerExp(prng *p, fpr x, fpr ccs) { | |||
int | |||
PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) { | |||
sampler_context *spc; | |||
int s; | |||
fpr r, dss, ccs; | |||
int s, z0, z, b; | |||
fpr r, dss, ccs, x; | |||
spc = ctx; | |||
@@ -1107,9 +1108,6 @@ PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) { | |||
* We now need to sample on center r. | |||
*/ | |||
for (;;) { | |||
int z0, z, b; | |||
fpr x; | |||
/* | |||
* Sample z for a Gaussian distribution. Then get a | |||
* random bit b to turn the sampling into a bimodal | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon signature verification. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* ===================================================================== */ | |||
/* | |||
@@ -20,4 +20,13 @@ auxiliary-submitters: | |||
- Zhenfei Zhang | |||
implementations: | |||
- name: clean | |||
version: 20190920 | |||
version: supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/6f6f4227/falcon | |||
- name: avx2 | |||
version: supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/6f6f4227/falcon | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- avx2 |
@@ -0,0 +1,24 @@ | |||
\ | |||
MIT License | |||
Copyright (c) 2017-2019 Falcon Project | |||
Permission is hereby granted, free of charge, to any person obtaining | |||
a copy of this software and associated documentation files (the | |||
"Software"), to deal in the Software without restriction, including | |||
without limitation the rights to use, copy, modify, merge, publish, | |||
distribute, sublicense, and/or sell copies of the Software, and to | |||
permit persons to whom the Software is furnished to do so, subject to | |||
the following conditions: | |||
The above copyright notice and this permission notice shall be | |||
included in all copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
@@ -0,0 +1,24 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
# Name of the static library archive built from this directory.
LIB=libfalcon512_avx2.a | |||
# C sources, the matching object files, and the headers every object depends on.
SOURCES = codec.c common.c fft.c fpr.c keygen.c pqclean.c rng.c sign.c vrfy.c | |||
OBJECTS = codec.o common.o fft.o fpr.o keygen.o pqclean.o rng.o sign.o vrfy.o | |||
HEADERS = api.h fpr.h inner.h | |||
# Compiler flags: C99, -mavx2, warnings treated as errors (-Werror), an extra
# include directory (../../../common), plus any caller-supplied $(EXTRAFLAGS).
CFLAGS=-O3 -Wconversion -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
# Default target: build the library.
all: $(LIB) | |||
# Assemble a .s file into an object; any header change triggers a rebuild.
%.o: %.s $(HEADERS) | |||
$(AS) -o $@ $< | |||
# Compile a .c file into an object; any header change triggers a rebuild.
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
# Archive all objects into the static library.
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
# Remove the objects and the library.
clean: | |||
$(RM) $(OBJECTS) | |||
$(RM) $(LIB) | |||
@@ -0,0 +1,80 @@ | |||
#ifndef PQCLEAN_FALCON512_AVX2_API_H | |||
#define PQCLEAN_FALCON512_AVX2_API_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* Exact sizes, in bytes, of the encoded private key and public key. */
#define PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES 1281 | |||
#define PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES 897 | |||
/* Maximum signature length, in bytes (actual signature length is variable). */
#define PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES 690 | |||
/* Algorithm name string. */
#define PQCLEAN_FALCON512_AVX2_CRYPTO_ALGNAME "Falcon-512" | |||
/* | |||
* Generate a new key pair. Public key goes into pk[], private key in sk[]. | |||
* Key sizes are exact (in bytes): | |||
* public (pk): PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES | |||
* private (sk): PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_crypto_sign_keypair( | |||
uint8_t *pk, uint8_t *sk); | |||
/* | |||
* Compute a signature on a provided message (m, mlen), with a given | |||
* private key (sk). Signature is written in sig[], with length written | |||
* into *siglen. Signature length is variable; maximum signature length | |||
* (in bytes) is PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES. | |||
* | |||
* sig[], m[] and sk[] may overlap each other arbitrarily. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* | |||
* Verify a signature (sig, siglen) on a message (m, mlen) with a given | |||
* public key (pk). | |||
* | |||
* sig[], m[] and pk[] may overlap each other arbitrarily. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk); | |||
/* | |||
* Compute a signature on a message and pack the signature and message | |||
* into a single object, written into sm[]. The length of that output is | |||
* written in *smlen; that length may be larger than the message length | |||
* (mlen) by up to PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES. | |||
* | |||
* sm[] and m[] may overlap each other arbitrarily; however, sm[] shall | |||
* not overlap with sk[]. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* | |||
* Open a signed message object (sm, smlen) and verify the signature; | |||
* on success, the message itself is written into m[] and its length | |||
* into *mlen. The message is shorter than the signed message object, | |||
* but the size difference depends on the signature value; the difference | |||
* may range up to PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES. | |||
* | |||
* m[], sm[] and pk[] may overlap each other arbitrarily. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk); | |||
#endif |
@@ -0,0 +1,555 @@ | |||
#include "inner.h" | |||
/* | |||
* Encoding/decoding of keys and signatures. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/*
 * Pack 2^logn integers modulo 12289 into consecutive 14-bit fields,
 * most significant bit first (declared in inner.h).
 *
 *   out          destination buffer, or NULL to only query the size
 *   max_out_len  capacity of out[], in bytes
 *   x            values to encode; each must be lower than 12289
 *   logn         base-2 logarithm of the number of values
 *
 * Returns the encoded length in bytes. Returns 0 if any value is out
 * of range, or if out is non-NULL and max_out_len is too small.
 */
size_t
PQCLEAN_FALCON512_AVX2_modq_encode(
    void *out, size_t max_out_len,
    const uint16_t *x, unsigned logn) {
    const size_t count = (size_t)1 << logn;
    size_t total, i;
    uint8_t *dst;
    uint32_t window = 0;
    int pending = 0;

    /* Every value must already be reduced modulo 12289. */
    for (i = 0; i < count; i ++) {
        if (x[i] >= 12289) {
            return 0;
        }
    }

    total = ((count * 14) + 7) >> 3;
    if (out == NULL) {
        return total;
    }
    if (max_out_len < total) {
        return 0;
    }

    dst = out;
    for (i = 0; i < count; i ++) {
        window = (window << 14) | x[i];
        pending += 14;
        /* Emit every complete byte currently held in the window. */
        while (pending >= 8) {
            pending -= 8;
            *dst ++ = (uint8_t)(window >> pending);
        }
    }

    /* Leftover bits go left-aligned into a final, zero-padded byte. */
    if (pending > 0) {
        *dst = (uint8_t)(window << (8 - pending));
    }
    return total;
}
/*
 * Unpack 2^logn integers modulo 12289 from consecutive 14-bit fields,
 * most significant bit first (declared in inner.h).
 *
 *   x           receives the decoded values
 *   logn        base-2 logarithm of the number of values
 *   in          encoded input
 *   max_in_len  number of bytes available in in[]
 *
 * Returns the number of input bytes consumed. Returns 0 if the input
 * is too short, if a decoded value is not lower than 12289, or if the
 * unused low bits of the last byte are not all zero.
 */
size_t
PQCLEAN_FALCON512_AVX2_modq_decode(
    uint16_t *x, unsigned logn,
    const void *in, size_t max_in_len) {
    const size_t count = (size_t)1 << logn;
    const size_t total = ((count * 14) + 7) >> 3;
    const uint8_t *src = in;
    uint32_t window = 0;
    int avail = 0;
    size_t done = 0;

    if (max_in_len < total) {
        return 0;
    }

    while (done < count) {
        unsigned field;

        window = (window << 8) | (*src ++);
        avail += 8;
        if (avail < 14) {
            /* Not enough bits for a full field yet. */
            continue;
        }
        avail -= 14;
        field = (window >> avail) & 0x3FFF;
        if (field >= 12289) {
            return 0;
        }
        x[done ++] = (uint16_t)field;
    }

    /* Padding bits after the last field must be zero. */
    if ((window & (((uint32_t)1 << avail) - 1)) != 0) {
        return 0;
    }
    return total;
}
/*
 * Pack 2^logn signed 16-bit values into fixed-width fields of 'bits'
 * bits each, most significant bit first (declared in inner.h).
 *
 *   out          destination buffer, or NULL to only query the size
 *   max_out_len  capacity of out[], in bytes
 *   x            values to encode
 *   logn         base-2 logarithm of the number of values
 *   bits         field width in bits
 *
 * Each value must lie in -(2^(bits-1) - 1) .. +(2^(bits-1) - 1).
 * Returns the encoded length in bytes. Returns 0 if a value is out of
 * range, or if out is non-NULL and max_out_len is too small.
 */
size_t
PQCLEAN_FALCON512_AVX2_trim_i16_encode(
    void *out, size_t max_out_len,
    const int16_t *x, unsigned logn, unsigned bits) {
    const size_t count = (size_t)1 << logn;
    const int upper = (1 << (bits - 1)) - 1;
    const int lower = -upper;
    size_t total, i;
    uint8_t *dst;
    uint32_t window, field_mask;
    unsigned pending;

    /* Range check first: nothing is written if any value is rejected. */
    for (i = 0; i < count; i ++) {
        if (!(x[i] >= lower && x[i] <= upper)) {
            return 0;
        }
    }

    total = ((count * bits) + 7) >> 3;
    if (out == NULL) {
        return total;
    }
    if (max_out_len < total) {
        return 0;
    }

    dst = out;
    window = 0;
    pending = 0;
    field_mask = ((uint32_t)1 << bits) - 1;
    for (i = 0; i < count; i ++) {
        /* Keep only the low 'bits' bits of the two's complement value. */
        window = (window << bits) | ((uint16_t)x[i] & field_mask);
        pending += bits;
        while (pending >= 8) {
            pending -= 8;
            *dst ++ = (uint8_t)(window >> pending);
        }
    }

    /* Leftover bits go left-aligned into a final, zero-padded byte. */
    if (pending > 0) {
        *dst = (uint8_t)(window << (8 - pending));
    }
    return total;
}
/*
 * Unpack 2^logn signed values from fixed-width fields of 'bits' bits
 * each, most significant bit first (declared in inner.h).
 *
 *   x           receives the decoded values
 *   logn        base-2 logarithm of the number of values
 *   bits        field width in bits
 *   in          encoded input
 *   max_in_len  number of bytes available in in[]
 *
 * Returns the number of input bytes consumed. Returns 0 if the input
 * is too short, if a field holds the forbidden value -2^(bits-1), or
 * if the unused low bits of the last byte are not all zero.
 */
size_t
PQCLEAN_FALCON512_AVX2_trim_i16_decode(
    int16_t *x, unsigned logn, unsigned bits,
    const void *in, size_t max_in_len) {
    const size_t count = (size_t)1 << logn;
    const size_t total = ((count * bits) + 7) >> 3;
    const uint8_t *src = in;
    uint32_t window, field_mask, sign_bit;
    unsigned avail;
    size_t done;

    if (max_in_len < total) {
        return 0;
    }

    window = 0;
    avail = 0;
    field_mask = ((uint32_t)1 << bits) - 1;
    sign_bit = (uint32_t)1 << (bits - 1);
    done = 0;
    while (done < count) {
        window = (window << 8) | *src ++;
        avail += 8;
        /* Extract every complete field currently held in the window. */
        for (; avail >= bits && done < count; done ++) {
            uint32_t field;

            avail -= bits;
            field = (window >> avail) & field_mask;
            /* Sign-extend the field to the full 32 bits. */
            field |= -(field & sign_bit);
            if (field == -sign_bit) {
                /*
                 * The -2^(bits-1) value is forbidden.
                 */
                return 0;
            }
            x[done] = (int16_t) * (int32_t *)&field;
        }
    }

    if ((window & (((uint32_t)1 << avail) - 1)) != 0) {
        /*
         * Extra bits in the last byte must be zero.
         */
        return 0;
    }
    return total;
}
/*
 * Pack 2^logn signed 8-bit values into fixed-width fields of 'bits'
 * bits each, most significant bit first (declared in inner.h).
 *
 *   out          destination buffer, or NULL to only query the size
 *   max_out_len  capacity of out[], in bytes
 *   x            values to encode
 *   logn         base-2 logarithm of the number of values
 *   bits         field width in bits
 *
 * Each value must lie in -(2^(bits-1) - 1) .. +(2^(bits-1) - 1).
 * Returns the encoded length in bytes. Returns 0 if a value is out of
 * range, or if out is non-NULL and max_out_len is too small.
 */
size_t
PQCLEAN_FALCON512_AVX2_trim_i8_encode(
    void *out, size_t max_out_len,
    const int8_t *x, unsigned logn, unsigned bits) {
    const size_t count = (size_t)1 << logn;
    const int upper = (1 << (bits - 1)) - 1;
    const int lower = -upper;
    size_t total, i;
    uint8_t *dst;
    uint32_t window, field_mask;
    unsigned pending;

    /* Range check first: nothing is written if any value is rejected. */
    for (i = 0; i < count; i ++) {
        if (!(x[i] >= lower && x[i] <= upper)) {
            return 0;
        }
    }

    total = ((count * bits) + 7) >> 3;
    if (out == NULL) {
        return total;
    }
    if (max_out_len < total) {
        return 0;
    }

    dst = out;
    window = 0;
    pending = 0;
    field_mask = ((uint32_t)1 << bits) - 1;
    for (i = 0; i < count; i ++) {
        /* Keep only the low 'bits' bits of the two's complement value. */
        window = (window << bits) | ((uint8_t)x[i] & field_mask);
        pending += bits;
        while (pending >= 8) {
            pending -= 8;
            *dst ++ = (uint8_t)(window >> pending);
        }
    }

    /* Leftover bits go left-aligned into a final, zero-padded byte. */
    if (pending > 0) {
        *dst = (uint8_t)(window << (8 - pending));
    }
    return total;
}
/*
 * Unpack 2^logn signed values from fixed-width fields of 'bits' bits
 * each, most significant bit first, into 8-bit outputs (declared in
 * inner.h).
 *
 *   x           receives the decoded values
 *   logn        base-2 logarithm of the number of values
 *   bits        field width in bits
 *   in          encoded input
 *   max_in_len  number of bytes available in in[]
 *
 * Returns the number of input bytes consumed. Returns 0 if the input
 * is too short, if a field holds the forbidden value -2^(bits-1), or
 * if the unused low bits of the last byte are not all zero.
 */
size_t
PQCLEAN_FALCON512_AVX2_trim_i8_decode(
    int8_t *x, unsigned logn, unsigned bits,
    const void *in, size_t max_in_len) {
    const size_t count = (size_t)1 << logn;
    const size_t total = ((count * bits) + 7) >> 3;
    const uint8_t *src = in;
    uint32_t window, field_mask, sign_bit;
    unsigned avail;
    size_t done;

    if (max_in_len < total) {
        return 0;
    }

    window = 0;
    avail = 0;
    field_mask = ((uint32_t)1 << bits) - 1;
    sign_bit = (uint32_t)1 << (bits - 1);
    done = 0;
    while (done < count) {
        window = (window << 8) | *src ++;
        avail += 8;
        /* Extract every complete field currently held in the window. */
        for (; avail >= bits && done < count; done ++) {
            uint32_t field;

            avail -= bits;
            field = (window >> avail) & field_mask;
            /* Sign-extend the field to the full 32 bits. */
            field |= -(field & sign_bit);
            if (field == -sign_bit) {
                /*
                 * The -2^(bits-1) value is forbidden.
                 */
                return 0;
            }
            x[done] = (int8_t) * (int32_t *)&field;
        }
    }

    if ((window & (((uint32_t)1 << avail) - 1)) != 0) {
        /*
         * Extra bits in the last byte must be zero.
         */
        return 0;
    }
    return total;
}
/*
 * Compressed encoding of 2^logn signed values (declared in inner.h).
 *
 * Each value is written as: one sign bit, the low 7 bits of the
 * absolute value, then the remaining high part of the absolute value
 * in unary (that many zero bits followed by a single one bit).
 *
 *   out          destination buffer, or NULL to only compute the size
 *   max_out_len  capacity of out[], in bytes
 *   x            values to encode; each must lie in -2047..+2047
 *   logn         base-2 logarithm of the number of values
 *
 * Returns the encoded length in bytes. Returns 0 if a value is out of
 * range, or if out is non-NULL and the encoding does not fit in
 * max_out_len bytes.
 */
size_t
PQCLEAN_FALCON512_AVX2_comp_encode(
    void *out, size_t max_out_len,
    const int16_t *x, unsigned logn) {
    uint8_t *dst = out;
    const size_t count = (size_t)1 << logn;
    size_t i, written = 0;
    uint32_t window = 0;
    unsigned pending = 0;

    /*
     * Reject the whole input if any value is outside -2047..+2047.
     */
    for (i = 0; i < count; i ++) {
        if (x[i] < -2047 || x[i] > +2047) {
            return 0;
        }
    }

    for (i = 0; i < count; i ++) {
        int value = x[i];
        unsigned negative = (value < 0);
        unsigned magnitude = (unsigned)(negative ? -value : value);
        unsigned high = magnitude >> 7;

        /*
         * Sign bit followed by the low 7 bits of the absolute
         * value: exactly 8 bits.
         */
        window = (window << 8) | (negative << 7) | (magnitude & 127u);
        pending += 8;

        /*
         * Unary part: 'high' zero bits, then a one. Since the
         * absolute value is at most 2047, 'high' is at most 15, so
         * this adds at most 16 bits; together with the 8 bits above
         * and up to 7 bits left from earlier values, at most 31 bits
         * are held, which fits the 32-bit window.
         */
        window = (window << (high + 1)) | 1;
        pending += high + 1;

        /*
         * Emit (or, without a buffer, just count) all full bytes.
         */
        while (pending >= 8) {
            pending -= 8;
            if (dst != NULL) {
                if (written >= max_out_len) {
                    return 0;
                }
                dst[written] = (uint8_t)(window >> pending);
            }
            written ++;
        }
    }

    /*
     * Remaining bits, if any, go left-aligned into a last byte.
     */
    if (pending > 0) {
        if (dst != NULL) {
            if (written >= max_out_len) {
                return 0;
            }
            dst[written] = (uint8_t)(window << (8 - pending));
        }
        written ++;
    }
    return written;
}
/*
 * Decode 2^logn signed values from the compressed format (declared in
 * inner.h).
 *
 * Each value is stored as: one sign bit, the low 7 bits of the
 * absolute value, then the high part of the absolute value in unary
 * (that many zero bits followed by a single one bit).
 *
 *   x           receives the decoded values
 *   logn        base-2 logarithm of the number of values
 *   in          encoded input
 *   max_in_len  number of bytes available in in[]
 *
 * Returns the number of input bytes consumed, or 0 on error. Errors
 * are: truncated input, an absolute value above 2047, and any
 * non-canonical encoding, i.e. a "minus zero" (sign bit set on a zero
 * value) or non-zero unused bits in the last consumed byte. Rejecting
 * non-canonical encodings ensures that a given sequence of values has
 * exactly one accepted byte representation.
 */
size_t
PQCLEAN_FALCON512_AVX2_comp_decode(
    int16_t *x, unsigned logn,
    const void *in, size_t max_in_len) {
    const uint8_t *buf;
    size_t n, u, v;
    uint32_t acc;
    unsigned acc_len;

    n = (size_t)1 << logn;
    buf = in;
    acc = 0;
    acc_len = 0;
    v = 0;
    for (u = 0; u < n; u ++) {
        unsigned b, s, m;

        /*
         * Get next eight bits: sign and low seven bits of the
         * absolute value.
         */
        if (v >= max_in_len) {
            return 0;
        }
        acc = (acc << 8) | (uint32_t)buf[v ++];
        b = acc >> acc_len;
        s = b & 128;
        m = b & 127;

        /*
         * Get next bits until a 1 is reached. Each zero bit adds
         * 128 to the absolute value.
         */
        for (;;) {
            if (acc_len == 0) {
                if (v >= max_in_len) {
                    return 0;
                }
                acc = (acc << 8) | (uint32_t)buf[v ++];
                acc_len = 8;
            }
            acc_len --;
            if (((acc >> acc_len) & 1) != 0) {
                break;
            }
            m += 128;
            if (m > 2047) {
                return 0;
            }
        }

        /*
         * "-0" is forbidden: zero has a single valid encoding, with
         * a cleared sign bit.
         */
        if (s && m == 0) {
            return 0;
        }

        x[u] = (int16_t)(s ? -(int)m : (int)m);
    }

    /*
     * Unused bits in the last byte must be zero; otherwise the same
     * values could be carried by several distinct byte strings.
     */
    if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
        return 0;
    }
    return v;
}
/* | |||
* Key elements and signatures are polynomials with small integer | |||
* coefficients. Here are some statistics gathered over many | |||
* generated key pairs (10000 or more for each degree): | |||
* | |||
* log(n) n max(f,g) std(f,g) max(F,G) std(F,G) | |||
* 1 2 129 56.31 143 60.02 | |||
* 2 4 123 40.93 160 46.52 | |||
* 3 8 97 28.97 159 38.01 | |||
* 4 16 100 21.48 154 32.50 | |||
* 5 32 71 15.41 151 29.36 | |||
* 6 64 59 11.07 138 27.77 | |||
* 7 128 39 7.91 144 27.00 | |||
* 8 256 32 5.63 148 26.61 | |||
* 9 512 22 4.00 137 26.46 | |||
* 10 1024 15 2.84 146 26.41 | |||
* | |||
* We want a compact storage format for private key, and, as part of | |||
* key generation, we are allowed to reject some keys which would | |||
* otherwise be fine (this does not induce any noticeable vulnerability | |||
* as long as we reject only a small proportion of possible keys). | |||
* Hence, we enforce at key generation time maximum values for the | |||
* elements of f, g, F and G, so that their encoding can be expressed | |||
* in fixed-width values. Limits have been chosen so that generated | |||
* keys are almost always within bounds, thus impacting neither | |||
* security nor performance. | |||
* | |||
* IMPORTANT: the code assumes that all coefficients of f, g, F and G | |||
* ultimately fit in the -127..+127 range. Thus, none of the elements | |||
* of max_fg_bits[] and max_FG_bits[] shall be greater than 8. | |||
*/ | |||
/* Field widths (in bits) for f and g; index 0 is unused. */
const uint8_t PQCLEAN_FALCON512_AVX2_max_fg_bits[] = {
    0,              /* index 0: unused */
    8, 8, 8, 8, 8,  /* indices 1..5 */
    7, 7,           /* indices 6..7 */
    6, 6,           /* indices 8..9 */
    5               /* index 10 */
};
/* Field widths (in bits) for F and G; index 0 is unused. */
const uint8_t PQCLEAN_FALCON512_AVX2_max_FG_bits[] = {
    0,              /* index 0: unused */
    8, 8, 8, 8, 8,  /* indices 1..5 */
    8, 8, 8, 8, 8   /* indices 6..10 */
};
/* | |||
* When generating a new key pair, we can always reject keys which | |||
* feature an abnormally large coefficient. This can also be done for | |||
* signatures, albeit with some care: in case the signature process is | |||
* used in a derandomized setup (explicitly seeded with the message and | |||
* private key), we have to follow the specification faithfully, and the | |||
* specification only enforces a limit on the L2 norm of the signature | |||
* vector. The limit on the L2 norm implies that the absolute value of | |||
* a coefficient of the signature cannot be more than the following: | |||
* | |||
* log(n) n max sig coeff (theoretical) | |||
* 1 2 412 | |||
* 2 4 583 | |||
* 3 8 824 | |||
* 4 16 1166 | |||
* 5 32 1649 | |||
* 6 64 2332 | |||
* 7 128 3299 | |||
* 8 256 4665 | |||
* 9 512 6598 | |||
* 10 1024 9331 | |||
* | |||
* However, the largest signature coefficient observed during our | |||
* experiments was 1077 (in absolute value), hence we can assume that, | |||
* with overwhelming probability, signature coefficients will fit | |||
* in -2047..2047, i.e. 12 bits. | |||
*/ | |||
/* Field widths (in bits) for signature coefficients; index 0 is unused. */
const uint8_t PQCLEAN_FALCON512_AVX2_max_sig_bits[] = {
    0,                          /* index 0: unused */
    10,                         /* index 1 */
    11, 11,                     /* indices 2..3 */
    12, 12, 12, 12, 12, 12, 12  /* indices 4..10 */
};
@@ -0,0 +1,294 @@ | |||
#include "inner.h" | |||
/* | |||
* Support functions for signatures (hash-to-point, norm). | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON512_AVX2_hash_to_point_vartime( | |||
inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn) { | |||
/* | |||
* This is the straightforward per-the-spec implementation. It | |||
* is not constant-time, thus it might reveal information on the | |||
* plaintext (at least, enough to check the plaintext against a | |||
* list of potential plaintexts) in a scenario where the | |||
* attacker does not have access to the signature value or to | |||
* the public key, but knows the nonce (without knowledge of the | |||
* nonce, the hashed output cannot be matched against potential | |||
* plaintexts). | |||
*/ | |||
size_t n; | |||
n = (size_t)1 << logn; | |||
while (n > 0) { | |||
uint8_t buf[2]; | |||
uint32_t w; | |||
inner_shake256_extract(sc, (void *)buf, sizeof buf); | |||
w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; | |||
if (w < 61445) { | |||
while (w >= 12289) { | |||
w -= 12289; | |||
} | |||
*x ++ = (uint16_t)w; | |||
n --; | |||
} | |||
} | |||
} | |||
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON512_AVX2_hash_to_point_ct( | |||
inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn, uint8_t *tmp) { | |||
/* | |||
* Each 16-bit sample is a value in 0..65535. The value is | |||
* kept if it falls in 0..61444 (because 61445 = 5*12289) | |||
* and rejected otherwise; thus, each sample has probability | |||
* about 0.93758 of being selected. | |||
* | |||
* We want to oversample enough to be sure that we will | |||
* have enough values with probability at least 1 - 2^(-256). | |||
* Depending on degree N, this leads to the following | |||
* required oversampling: | |||
* | |||
* logn n oversampling | |||
* 1 2 65 | |||
* 2 4 67 | |||
* 3 8 71 | |||
* 4 16 77 | |||
* 5 32 86 | |||
* 6 64 100 | |||
* 7 128 122 | |||
* 8 256 154 | |||
* 9 512 205 | |||
* 10 1024 287 | |||
* | |||
* If logn >= 7, then the provided temporary buffer is large | |||
* enough. Otherwise, we use a stack buffer of 63 entries | |||
* (i.e. 126 bytes) for the values that do not fit in tmp[]. | |||
*/ | |||
static const uint16_t overtab[] = { | |||
0, /* unused */ | |||
65, | |||
67, | |||
71, | |||
77, | |||
86, | |||
100, | |||
122, | |||
154, | |||
205, | |||
287 | |||
}; | |||
unsigned n, n2, u, m, p, over; | |||
uint16_t *tt1, tt2[63]; | |||
/* | |||
* We first generate m 16-bit value. Values 0..n-1 go to x[]. | |||
* Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. | |||
* We also reduce modulo q the values; rejected values are set | |||
* to 0xFFFF. | |||
*/ | |||
n = 1U << logn; | |||
n2 = n << 1; | |||
over = overtab[logn]; | |||
m = n + over; | |||
tt1 = (uint16_t *)tmp; | |||
for (u = 0; u < m; u ++) { | |||
uint8_t buf[2]; | |||
uint32_t w, wr; | |||
inner_shake256_extract(sc, buf, sizeof buf); | |||
w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; | |||
wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); | |||
wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); | |||
wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); | |||
wr |= ((w - 61445) >> 31) - 1; | |||
if (u < n) { | |||
x[u] = (uint16_t)wr; | |||
} else if (u < n2) { | |||
tt1[u - n] = (uint16_t)wr; | |||
} else { | |||
tt2[u - n2] = (uint16_t)wr; | |||
} | |||
} | |||
/* | |||
* Now we must "squeeze out" the invalid values. We do this in | |||
* a logarithmic sequence of passes; each pass computes where a | |||
* value should go, and moves it down by 'p' slots if necessary, | |||
* where 'p' uses an increasing powers-of-two scale. It can be | |||
* shown that in all cases where the loop decides that a value | |||
* has to be moved down by p slots, the destination slot is | |||
* "free" (i.e. contains an invalid value). | |||
*/ | |||
for (p = 1; p <= over; p <<= 1) { | |||
unsigned v; | |||
/* | |||
* In the loop below: | |||
* | |||
* - v contains the index of the final destination of | |||
* the value; it is recomputed dynamically based on | |||
* whether values are valid or not. | |||
* | |||
* - u is the index of the value we consider ("source"); | |||
* its address is s. | |||
* | |||
* - The loop may swap the value with the one at index | |||
* u-p. The address of the swap destination is d. | |||
*/ | |||
v = 0; | |||
for (u = 0; u < m; u ++) { | |||
uint16_t *s, *d; | |||
unsigned j, sv, dv, mk; | |||
if (u < n) { | |||
s = &x[u]; | |||
} else if (u < n2) { | |||
s = &tt1[u - n]; | |||
} else { | |||
s = &tt2[u - n2]; | |||
} | |||
sv = *s; | |||
/* | |||
* The value in sv should ultimately go to | |||
* address v, i.e. jump back by u-v slots. | |||
*/ | |||
j = u - v; | |||
/* | |||
* We increment v for the next iteration, but | |||
* only if the source value is valid. The mask | |||
* 'mk' is -1 if the value is valid, 0 otherwise, | |||
* so we _subtract_ mk. | |||
*/ | |||
mk = (sv >> 15) - 1U; | |||
v -= mk; | |||
/* | |||
* In this loop we consider jumps by p slots; if | |||
* u < p then there is nothing more to do. | |||
*/ | |||
if (u < p) { | |||
continue; | |||
} | |||
/* | |||
* Destination for the swap: value at address u-p. | |||
*/ | |||
if ((u - p) < n) { | |||
d = &x[u - p]; | |||
} else if ((u - p) < n2) { | |||
d = &tt1[(u - p) - n]; | |||
} else { | |||
d = &tt2[(u - p) - n2]; | |||
} | |||
dv = *d; | |||
/* | |||
* The swap should be performed only if the source | |||
* is valid AND the jump j has its 'p' bit set. | |||
*/ | |||
mk &= -(((j & p) + 0x1FF) >> 9); | |||
*s = (uint16_t)(sv ^ (mk & (sv ^ dv))); | |||
*d = (uint16_t)(dv ^ (mk & (sv ^ dv))); | |||
} | |||
} | |||
} | |||
/* see inner.h */
int
PQCLEAN_FALCON512_AVX2_is_short(
    const int16_t *s1, const int16_t *s2, unsigned logn) {
    /*
     * The squared l2-norm of (s1, s2) is accumulated with 32-bit
     * arithmetic only. 'seen' collects every intermediate sum, so its
     * top bit records whether the running total ever reached 2^31; in
     * that case the total is forced to 2^32-1 before the comparison.
     */
    size_t count, i;
    uint32_t acc, seen, bound;

    count = (size_t)1 << logn;
    acc = 0;
    seen = 0;
    for (i = 0; i < count; i ++) {
        int32_t a, b;

        a = s1[i];
        b = s2[i];
        acc += (uint32_t)(a * a);
        seen |= acc;
        acc += (uint32_t)(b * b);
        seen |= acc;
    }
    acc |= -(seen >> 31);

    /*
     * Acceptance bound on the l2-norm is:
     *   1.2*1.55*sqrt(q)*sqrt(2*N)
     * 7085 is floor((1.2^2)*(1.55^2)*2*1024); the product with q is
     * scaled down from degree 1024 to degree 2^logn by the shift.
     */
    bound = ((uint32_t)7085 * (uint32_t)12289) >> (10 - logn);
    return acc < bound;
}
/* see inner.h */
int
PQCLEAN_FALCON512_AVX2_is_short_half(
    uint32_t sqn, const int16_t *s2, unsigned logn) {
    /*
     * 'sqn' is the (saturated) squared norm of the first half; the
     * squares of the s2[] coordinates are added to it. 'seen' starts
     * as all-ones if sqn already has its top bit set, and then
     * collects every intermediate sum, so that any total reaching
     * 2^31 is forced to 2^32-1 before the comparison.
     */
    size_t count, i;
    uint32_t seen, bound;

    count = (size_t)1 << logn;
    seen = -(sqn >> 31);
    for (i = 0; i < count; i ++) {
        int32_t c;

        c = s2[i];
        sqn += (uint32_t)(c * c);
        seen |= sqn;
    }
    sqn |= -(seen >> 31);

    /*
     * Acceptance bound on the l2-norm is:
     *   1.2*1.55*sqrt(q)*sqrt(2*N)
     * 7085 is floor((1.2^2)*(1.55^2)*2*1024); the product with q is
     * scaled down from degree 1024 to degree 2^logn by the shift.
     */
    bound = ((uint32_t)7085 * (uint32_t)12289) >> (10 - logn);
    return sqn < bound;
}
@@ -0,0 +1,349 @@ | |||
#ifndef PQCLEAN_FALCON512_AVX2_FPR_H | |||
#define PQCLEAN_FALCON512_AVX2_FPR_H | |||
/* | |||
* Floating-point operations. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* ====================================================================== */ | |||
#include <immintrin.h> | |||
#include <math.h> | |||
/*
 * Multiply-then-add and multiply-then-subtract on 256-bit vectors of
 * doubles: FMADD(x, y, z) is x*y + z and FMSUB(x, y, z) is x*y - z,
 * each built from two separate intrinsics (a multiplication followed
 * by an addition or subtraction).
 */
#define FMADD(x, y, z)   _mm256_add_pd(_mm256_mul_pd((x), (y)), (z))
#define FMSUB(x, y, z)   _mm256_sub_pd(_mm256_mul_pd((x), (y)), (z))
/*
 * 'fpr' is a native 'double' boxed in a one-member structure. The
 * boxing makes the compiler reject raw arithmetic operators applied
 * by mistake to 'fpr' values, so that all operations go through the
 * inline functions of this file. Since those functions are 'inline',
 * the wrapper is not expected to add any runtime cost.
 */
typedef struct {
    double v;
} fpr;

/*
 * Box a raw double into an fpr.
 */
static inline fpr
FPR(double v) {
    fpr boxed = { v };

    return boxed;
}

/*
 * Convert a signed 64-bit integer into an fpr (plain integer to
 * double conversion).
 */
static inline fpr
fpr_of(int64_t i) {
    fpr r;

    r.v = (double)i;
    return r;
}
static const fpr fpr_q = { 12289.0 }; | |||
static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; | |||
static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; | |||
static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; | |||
static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; | |||
static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; | |||
static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; | |||
static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; | |||
static const fpr fpr_bnorm_max = { 16822.4121 }; | |||
static const fpr fpr_zero = { 0.0 }; | |||
static const fpr fpr_one = { 1.0 }; | |||
static const fpr fpr_two = { 2.0 }; | |||
static const fpr fpr_onehalf = { 0.5 }; | |||
static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; | |||
static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; | |||
static const fpr fpr_ptwo31 = { 2147483648.0 }; | |||
static const fpr fpr_ptwo31m1 = { 2147483647.0 }; | |||
static const fpr fpr_mtwo31m1 = { -2147483647.0 }; | |||
static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; | |||
static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; | |||
static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; | |||
static inline int64_t | |||
fpr_rint(fpr x) { | |||
/* | |||
* We do not want to use llrint() since it might be not | |||
* constant-time. | |||
* | |||
* Suppose that x >= 0. If x >= 2^52, then it is already an | |||
* integer. Otherwise, if x < 2^52, then computing x+2^52 will | |||
* yield a value that will be rounded to the nearest integer | |||
* with exactly the right rules (round-to-nearest-even). | |||
* | |||
* In order to have constant-time processing, we must do the | |||
* computation for both x >= 0 and x < 0 cases, and use a | |||
* cast to an integer to access the sign and select the proper | |||
* value. Such casts also allow us to find out if |x| < 2^52. | |||
*/ | |||
int64_t sx, tx, rp, rn, m; | |||
uint32_t ub; | |||
sx = (int64_t)(x.v - 1.0); | |||
tx = (int64_t)x.v; | |||
rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; | |||
rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; | |||
/* | |||
* If tx >= 2^52 or tx < -2^52, then result is tx. | |||
* Otherwise, if sx >= 0, then result is rp. | |||
* Otherwise, result is rn. We use the fact that when x is | |||
* close to 0 (|x| <= 0.25) then both rp and rn are correct; | |||
* and if x is not close to 0, then trunc(x-1.0) yields the | |||
* appropriate sign. | |||
*/ | |||
/* | |||
* Clamp rp to zero if tx < 0. | |||
* Clamp rn to zero if tx >= 0. | |||
*/ | |||
m = sx >> 63; | |||
rn &= m; | |||
rp &= ~m; | |||
/* | |||
* Get the 12 upper bits of tx; if they are not all zeros or | |||
* all ones, then tx >= 2^52 or tx < -2^52, and we clamp both | |||
* rp and rn to zero. Otherwise, we clamp tx to zero. | |||
*/ | |||
ub = (uint32_t)((uint64_t)tx >> 52); | |||
m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); | |||
rp &= m; | |||
rn &= m; | |||
tx &= ~m; | |||
/* | |||
* Only one of tx, rn or rp (at most) can be non-zero at this | |||
* point. | |||
*/ | |||
return tx | rn | rp; | |||
} | |||
static inline int64_t | |||
fpr_floor(fpr x) { | |||
int64_t r; | |||
/* | |||
* The cast performs a trunc() (rounding toward 0) and thus is | |||
* wrong by 1 for most negative values. The correction below is | |||
* constant-time as long as the compiler turns the | |||
* floating-point conversion result into a 0/1 integer without a | |||
* conditional branch or another non-constant-time construction. | |||
* This should hold on all modern architectures with an FPU (and | |||
* if it is false on a given arch, then chances are that the FPU | |||
* itself is not constant-time, making the point moot). | |||
*/ | |||
r = (int64_t)x.v; | |||
return r - (x.v < (double)r); | |||
} | |||
static inline int64_t | |||
fpr_trunc(fpr x) { | |||
return (int64_t)x.v; | |||
} | |||
static inline fpr | |||
fpr_add(fpr x, fpr y) { | |||
return FPR(x.v + y.v); | |||
} | |||
static inline fpr | |||
fpr_sub(fpr x, fpr y) { | |||
return FPR(x.v - y.v); | |||
} | |||
static inline fpr | |||
fpr_neg(fpr x) { | |||
return FPR(-x.v); | |||
} | |||
static inline fpr | |||
fpr_half(fpr x) { | |||
return FPR(x.v * 0.5); | |||
} | |||
static inline fpr | |||
fpr_double(fpr x) { | |||
return FPR(x.v + x.v); | |||
} | |||
static inline fpr | |||
fpr_mul(fpr x, fpr y) { | |||
return FPR(x.v * y.v); | |||
} | |||
static inline fpr | |||
fpr_sqr(fpr x) { | |||
return FPR(x.v * x.v); | |||
} | |||
static inline fpr | |||
fpr_inv(fpr x) { | |||
return FPR(1.0 / x.v); | |||
} | |||
static inline fpr | |||
fpr_div(fpr x, fpr y) { | |||
return FPR(x.v / y.v); | |||
} | |||
/*
 * Replace *t with its square root, in place. The value is broadcast
 * to both lanes of a 128-bit vector, the packed square root is
 * computed, and the low lane is written back.
 */
static inline void
fpr_sqrt_avx2(double *t) {
    __m128d lanes;

    lanes = _mm_set1_pd(*t);
    lanes = _mm_sqrt_pd(lanes);
    _mm_store_sd(t, lanes);
}
static inline fpr | |||
fpr_sqrt(fpr x) { | |||
/* | |||
* We prefer not to have a dependency on libm when it can be | |||
* avoided. On x86, calling the sqrt() libm function inlines | |||
* the relevant opcode (fsqrt or sqrtsd, depending on whether | |||
* the 387 FPU or SSE2 is used for floating-point operations) | |||
* but then makes an optional call to the library function | |||
* for proper error handling, in case the operand is negative. | |||
* | |||
* To avoid this dependency, we use intrinsics or inline assembly | |||
* on recognized platforms: | |||
* | |||
* - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. | |||
* | |||
* - On GCC/Clang with SSE maths, we use SSE2 intrinsics. | |||
* | |||
* - On GCC/Clang on i386, or MSVC on i386, we use inline assembly | |||
* to call the 387 FPU fsqrt opcode. | |||
* | |||
* - On GCC/Clang/XLC on PowerPC, we use inline assembly to call | |||
* the fsqrt opcode (Clang needs a special hack). | |||
* | |||
* - On GCC/Clang on ARM with hardware floating-point, we use | |||
* inline assembly to call the vqsrt.f64 opcode. Due to a | |||
* complex ecosystem of compilers and assembly syntaxes, we | |||
* have to call it "fsqrt" or "fsqrtd", depending on case. | |||
* | |||
* If the platform is not recognized, a call to the system | |||
* library function sqrt() is performed. On some compilers, this | |||
* may actually inline the relevant opcode, and call the library | |||
* function only when the input is invalid (e.g. negative); | |||
* Falcon never actually calls sqrt() on a negative value, but | |||
* the dependency to libm will still be there. | |||
*/ | |||
fpr_sqrt_avx2(&x.v); | |||
return x; | |||
} | |||
static inline int | |||
fpr_lt(fpr x, fpr y) { | |||
return x.v < y.v; | |||
} | |||
static inline uint64_t | |||
fpr_expm_p63(fpr x, fpr ccs) { | |||
/* | |||
* Polynomial approximation of exp(-x) is taken from FACCT: | |||
* https://eprint.iacr.org/2018/1234 | |||
* Specifically, values are extracted from the implementation | |||
* referenced from the FACCT article, and available at: | |||
* https://github.com/raykzhao/gaussian | |||
* Tests over more than 24 billions of random inputs in the | |||
* 0..log(2) range have never shown a deviation larger than | |||
* 2^(-50) from the true mathematical value. | |||
*/ | |||
/* | |||
* AVX2 implementation uses more operations than Horner's method, | |||
* but with a lower expression tree depth. This helps because | |||
* additions and multiplications have a latency of 4 cycles on | |||
* a Skylake, but the CPU can issue two of them per cycle. | |||
*/ | |||
static const union { | |||
double d[12]; | |||
__m256d v[3]; | |||
} c = { | |||
{ | |||
0.999999999999994892974086724280, | |||
0.500000000000019206858326015208, | |||
0.166666666666984014666397229121, | |||
0.041666666666110491190622155955, | |||
0.008333333327800835146903501993, | |||
0.001388888894063186997887560103, | |||
0.000198412739277311890541063977, | |||
0.000024801566833585381209939524, | |||
0.000002755586350219122514855659, | |||
0.000000275607356160477811864927, | |||
0.000000025299506379442070029551, | |||
0.000000002073772366009083061987 | |||
} | |||
}; | |||
double d1, d2, d4, d8, y; | |||
__m256d d14, d58, d9c; | |||
d1 = -x.v; | |||
d2 = d1 * d1; | |||
d4 = d2 * d2; | |||
d8 = d4 * d4; | |||
d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); | |||
d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); | |||
d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); | |||
d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); | |||
d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); | |||
d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); | |||
d9c = _mm256_hadd_pd(d9c, d9c); | |||
y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) | |||
+ _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); | |||
y *= ccs.v; | |||
/* | |||
* Final conversion goes through int64_t first, because that's what | |||
* the underlying opcode (vcvttsd2si) will do, and we know that the | |||
* result will fit, since x >= 0 and ccs < 1. If we did the | |||
* conversion directly to uint64_t, then the compiler would add some | |||
* extra code to cover the case of a source value of 2^63 or more, | |||
* and though the alternate path would never be exercised, the | |||
* extra comparison would cost us some cycles. | |||
*/ | |||
return (uint64_t)(int64_t)(y * fpr_ptwo63.v); | |||
} | |||
#define fpr_gm_tab PQCLEAN_FALCON512_AVX2_fpr_gm_tab | |||
extern const fpr fpr_gm_tab[]; | |||
#define fpr_p2_tab PQCLEAN_FALCON512_AVX2_fpr_p2_tab | |||
extern const fpr fpr_p2_tab[]; | |||
/* ====================================================================== */ | |||
#endif |
@@ -0,0 +1,826 @@ | |||
#ifndef PQCLEAN_FALCON512_AVX2_INNER_H | |||
#define PQCLEAN_FALCON512_AVX2_INNER_H | |||
/* | |||
* Internal functions for Falcon. This is not the API intended to be | |||
* used by applications; instead, this internal API provides all the | |||
* primitives on which wrappers build to provide external APIs. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* | |||
* IMPORTANT API RULES | |||
* ------------------- | |||
* | |||
* This API has some non-trivial usage rules: | |||
* | |||
* | |||
* - All public functions (i.e. the non-static ones) must be referenced | |||
* with the PQCLEAN_FALCON512_AVX2_ macro (e.g. PQCLEAN_FALCON512_AVX2_verify_raw for the verify_raw() | |||
* function). That macro adds a prefix to the name, which is | |||
* configurable with the FALCON_PREFIX macro. This allows compiling | |||
* the code into a specific "namespace" and potentially including | |||
* several versions of this code into a single application (e.g. to | |||
* have an AVX2 and a non-AVX2 variants and select the one to use at | |||
* runtime based on availability of AVX2 opcodes). | |||
* | |||
* - Functions that need temporary buffers expect them as a final | |||
* tmp[] array of type uint8_t*, with a size which is documented for | |||
* each function. However, most have some alignment requirements, | |||
* because they will use the array to store 16-bit, 32-bit or 64-bit | |||
* values (e.g. uint64_t or double). The caller must ensure proper | |||
* alignment. What happens on unaligned access depends on the | |||
* underlying architecture, ranging from a slight time penalty | |||
* to immediate termination of the process. | |||
* | |||
* - Some functions rely on specific rounding rules and precision for | |||
* floating-point numbers. On some systems (in particular 32-bit x86 | |||
* with the 387 FPU), this requires setting a hardware control | |||
* word. The caller MUST use set_fpu_cw() to ensure proper precision: | |||
* | |||
* oldcw = set_fpu_cw(2); | |||
* PQCLEAN_FALCON512_AVX2_sign_dyn(...); | |||
* set_fpu_cw(oldcw); | |||
* | |||
* On systems where the native floating-point precision is already | |||
* proper, or integer-based emulation is used, the set_fpu_cw() | |||
* function does nothing, so it can be called systematically. | |||
*/ | |||
#include "fips202.h" | |||
#include "fpr.h" | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
/*
 * Some floating-point computations, in particular rounding to the
 * nearest integer, rely on operations using _exactly_ the precision
 * of the IEEE-754 binary64 type. On 32-bit x86 the 387 FPU may
 * compute with more precision bits (80-bit extended type), which
 * would lead to miscomputations; set_fpu_cw() is the hook through
 * which callers request a given precision in the FPU control word.
 *
 * Contract: the function sets the precision to the provided value
 * and returns the previously set precision, which callers are
 * expected to restore on exit. The value "2" selects the correct
 * precision.
 *
 * In this implementation the function does not touch any hardware
 * state: it simply hands its argument back.
 */
static inline unsigned
set_fpu_cw(unsigned x) {
    unsigned previous;

    previous = x;
    return previous;
}
/* ==================================================================== */ | |||
/* | |||
* SHAKE256 implementation (shake.c). | |||
* | |||
* API is defined to be easily replaced with the fips202.h API defined | |||
* as part of PQClean. | |||
*/ | |||
/* Context type: the incremental SHAKE256 context from fips202.h. */
#define inner_shake256_context                   shake256incctx
/* Initialize a context. */
#define inner_shake256_init(ctx)                 shake256_inc_init(ctx)
/* Inject 'size' bytes from 'data' into the context. */
#define inner_shake256_inject(ctx, data, size)   shake256_inc_absorb(ctx, data, size)
/* Switch the context from input mode to output mode. */
#define inner_shake256_flip(ctx)                 shake256_inc_finalize(ctx)
/* Extract 'size' output bytes into 'dst'; note the reordered arguments. */
#define inner_shake256_extract(ctx, dst, size)   shake256_inc_squeeze(dst, size, ctx)
/* Release the context. */
#define inner_shake256_ctx_release(ctx)          shake256_inc_ctx_release(ctx)
/* ==================================================================== */ | |||
/* | |||
* Encoding/decoding functions (codec.c). | |||
* | |||
* Encoding functions take as parameters an output buffer (out) with | |||
* a given maximum length (max_out_len); returned value is the actual | |||
* number of bytes which have been written. If the output buffer is | |||
* not large enough, then 0 is returned (some bytes may have been | |||
* written to the buffer). If 'out' is NULL, then 'max_out_len' is | |||
* ignored; instead, the function computes and returns the actual | |||
* required output length (in bytes). | |||
* | |||
* Decoding functions take as parameters an input buffer (in) with | |||
* its maximum length (max_in_len); returned value is the actual number | |||
* of bytes that have been read from the buffer. If the provided length | |||
* is too short, then 0 is returned. | |||
* | |||
* Values to encode or decode are vectors of integers, with N = 2^logn | |||
* elements. | |||
* | |||
* Three encoding formats are defined: | |||
* | |||
* - modq: sequence of values modulo 12289, each encoded over exactly | |||
* 14 bits. The encoder and decoder verify that integers are within | |||
* the valid range (0..12288). Values are arrays of uint16. | |||
* | |||
* - trim: sequence of signed integers, a specified number of bits | |||
* each. The number of bits is provided as parameter and includes | |||
* the sign bit. Each integer x must be such that |x| < 2^(bits-1) | |||
* (which means that the -2^(bits-1) value is forbidden); encode and | |||
* decode functions check that property. Values are arrays of | |||
* int16_t or int8_t, corresponding to names 'trim_i16' and | |||
* 'trim_i8', respectively. | |||
* | |||
* - comp: variable-length encoding for signed integers; each integer | |||
* uses a minimum of 9 bits, possibly more. This is normally used | |||
* only for signatures. | |||
* | |||
*/ | |||
size_t PQCLEAN_FALCON512_AVX2_modq_encode(void *out, size_t max_out_len, | |||
const uint16_t *x, unsigned logn); | |||
size_t PQCLEAN_FALCON512_AVX2_trim_i16_encode(void *out, size_t max_out_len, | |||
const int16_t *x, unsigned logn, unsigned bits); | |||
size_t PQCLEAN_FALCON512_AVX2_trim_i8_encode(void *out, size_t max_out_len, | |||
const int8_t *x, unsigned logn, unsigned bits); | |||
size_t PQCLEAN_FALCON512_AVX2_comp_encode(void *out, size_t max_out_len, | |||
const int16_t *x, unsigned logn); | |||
size_t PQCLEAN_FALCON512_AVX2_modq_decode(uint16_t *x, unsigned logn, | |||
const void *in, size_t max_in_len); | |||
size_t PQCLEAN_FALCON512_AVX2_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits, | |||
const void *in, size_t max_in_len); | |||
size_t PQCLEAN_FALCON512_AVX2_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits, | |||
const void *in, size_t max_in_len); | |||
size_t PQCLEAN_FALCON512_AVX2_comp_decode(int16_t *x, unsigned logn, | |||
const void *in, size_t max_in_len); | |||
/* | |||
* Number of bits for key elements, indexed by logn (1 to 10). This | |||
* is at most 8 bits for all degrees, but some degrees may have shorter | |||
* elements. | |||
*/ | |||
extern const uint8_t PQCLEAN_FALCON512_AVX2_max_fg_bits[]; | |||
extern const uint8_t PQCLEAN_FALCON512_AVX2_max_FG_bits[]; | |||
/* | |||
* Maximum size, in bits, of elements in a signature, indexed by logn | |||
* (1 to 10). The size includes the sign bit. | |||
*/ | |||
extern const uint8_t PQCLEAN_FALCON512_AVX2_max_sig_bits[]; | |||
/* ==================================================================== */ | |||
/* | |||
* Support functions used for both signature generation and signature | |||
* verification (common.c). | |||
*/ | |||
/* | |||
* From a SHAKE256 context (must be already flipped), produce a new | |||
* point. This is the non-constant-time version, which may leak enough | |||
* information to serve as a stop condition on a brute force attack on | |||
* the hashed message (provided that the nonce value is known). | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_hash_to_point_vartime(inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn); | |||
/* | |||
* From a SHAKE256 context (must be already flipped), produce a new | |||
* point. The temporary buffer (tmp) must have room for 2*2^logn bytes. | |||
* This function is constant-time but is typically more expensive than | |||
* PQCLEAN_FALCON512_AVX2_hash_to_point_vartime(). | |||
* | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_hash_to_point_ct(inner_shake256_context *sc, | |||
uint16_t *x, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Tell whether a given vector (2N coordinates, in two halves) is | |||
* acceptable as a signature. This compares the appropriate norm of the | |||
* vector with the acceptance bound. Returned value is 1 on success | |||
* (vector is short enough to be acceptable), 0 otherwise. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_is_short(const int16_t *s1, const int16_t *s2, unsigned logn); | |||
/* | |||
* Tell whether a given vector (2N coordinates, in two halves) is | |||
* acceptable as a signature. Instead of the first half s1, this | |||
* function receives the "saturated squared norm" of s1, i.e. the | |||
* sum of the squares of the coordinates of s1 (saturated at 2^32-1 | |||
* if the sum exceeds 2^31-1). | |||
* | |||
* Returned value is 1 on success (vector is short enough to be | |||
* acceptable), 0 otherwise. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn); | |||
/* ==================================================================== */ | |||
/* | |||
* Signature verification functions (vrfy.c). | |||
*/ | |||
/* | |||
* Convert a public key to NTT + Montgomery format. Conversion is done | |||
* in place. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_to_ntt_monty(uint16_t *h, unsigned logn); | |||
/* | |||
* Internal signature verification code: | |||
* c0[] contains the hashed nonce+message | |||
* s2[] is the decoded signature | |||
* h[] contains the public key, in NTT + Montgomery format | |||
* logn is the degree log | |||
* tmp[] temporary, must have at least 2*2^logn bytes | |||
* Returned value is 1 on success, 0 on error. | |||
* | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2, | |||
const uint16_t *h, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Compute the public key h[], given the private key elements f[] and | |||
* g[]. This computes h = g/f mod phi mod q, where phi is the polynomial | |||
* modulus. This function returns 1 on success, 0 on error (an error is | |||
* reported if f is not invertible mod phi mod q). | |||
* | |||
* The tmp[] array must have room for at least 2*2^logn elements. | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_compute_public(uint16_t *h, | |||
const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Recompute the fourth private key element. Private key consists in | |||
* four polynomials with small coefficients f, g, F and G, which are | |||
* such that fG - gF = q mod phi; furthermore, f is invertible modulo | |||
* phi and modulo q. This function recomputes G from f, g and F. | |||
* | |||
* The tmp[] array must have room for at least 4*2^logn bytes. | |||
* | |||
* Returned value is 1 on success, 0 on error (f not invertible). | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_complete_private(int8_t *G, | |||
const int8_t *f, const int8_t *g, const int8_t *F, | |||
unsigned logn, uint8_t *tmp); | |||
/* | |||
* Test whether a given polynomial is invertible modulo phi and q. | |||
* Polynomial coefficients are small integers. | |||
* | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_is_invertible( | |||
const int16_t *s2, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Count the number of elements of value zero in the NTT representation | |||
* of the given polynomial: this is the number of primitive 2n-th roots | |||
* of unity (modulo q = 12289) that are roots of the provided polynomial | |||
* (taken modulo q). | |||
* | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Internal signature verification with public key recovery: | |||
* h[] receives the public key (NOT in NTT/Montgomery format) | |||
* c0[] contains the hashed nonce+message | |||
* s1[] is the first signature half | |||
* s2[] is the second signature half | |||
* logn is the degree log | |||
* tmp[] temporary, must have at least 2*2^logn bytes | |||
* Returned value is 1 on success, 0 on error. Success is returned if | |||
* the signature is a short enough vector; in that case, the public | |||
* key has been written to h[]. However, the caller must still | |||
* verify that h[] is the correct value (e.g. with regards to a known | |||
* hash of the public key). | |||
* | |||
* h[] may not overlap with any of the other arrays. | |||
* | |||
* tmp[] must have 16-bit alignment. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_verify_recover(uint16_t *h, | |||
const uint16_t *c0, const int16_t *s1, const int16_t *s2, | |||
unsigned logn, uint8_t *tmp); | |||
/* ==================================================================== */ | |||
/* | |||
* Implementation of floating-point real numbers (fpr.h, fpr.c). | |||
*/ | |||
/* | |||
* Real numbers are implemented by an extra header file, included below. | |||
* This is meant to support pluggable implementations. The default | |||
* implementation relies on the C type 'double'. | |||
* | |||
* The included file must define the following types, functions and | |||
* constants: | |||
* | |||
* fpr | |||
* type for a real number | |||
* | |||
* fpr fpr_of(int64_t i) | |||
* cast an integer into a real number; source must be in the | |||
* -(2^63-1)..+(2^63-1) range | |||
* | |||
* fpr fpr_scaled(int64_t i, int sc) | |||
* compute i*2^sc as a real number; source 'i' must be in the | |||
* -(2^63-1)..+(2^63-1) range | |||
* | |||
* fpr fpr_ldexp(fpr x, int e) | |||
* compute x*2^e | |||
* | |||
* int64_t fpr_rint(fpr x) | |||
* round x to the nearest integer; x must be in the -(2^63-1) | |||
* to +(2^63-1) range | |||
* | |||
* int64_t fpr_trunc(fpr x) | |||
* round to an integer; this rounds towards zero; value must | |||
* be in the -(2^63-1) to +(2^63-1) range | |||
* | |||
* fpr fpr_add(fpr x, fpr y) | |||
* compute x + y | |||
* | |||
* fpr fpr_sub(fpr x, fpr y) | |||
* compute x - y | |||
* | |||
* fpr fpr_neg(fpr x) | |||
* compute -x | |||
* | |||
* fpr fpr_half(fpr x) | |||
* compute x/2 | |||
* | |||
* fpr fpr_double(fpr x) | |||
* compute x*2 | |||
* | |||
* fpr fpr_mul(fpr x, fpr y) | |||
* compute x * y | |||
* | |||
* fpr fpr_sqr(fpr x) | |||
* compute x * x | |||
* | |||
* fpr fpr_inv(fpr x) | |||
* compute 1/x | |||
* | |||
* fpr fpr_div(fpr x, fpr y) | |||
* compute x/y | |||
* | |||
* fpr fpr_sqrt(fpr x) | |||
* compute the square root of x | |||
* | |||
* int fpr_lt(fpr x, fpr y) | |||
* return 1 if x < y, 0 otherwise | |||
* | |||
* uint64_t fpr_expm_p63(fpr x) | |||
* return exp(-x), assuming that 0 <= x < log(2). Returned value | |||
* is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), | |||
* rounded to the nearest integer). Computation should have a | |||
* precision of at least 45 bits. | |||
* | |||
* const fpr fpr_gm_tab[] | |||
* array of constants for FFT / iFFT | |||
* | |||
* const fpr fpr_p2_tab[] | |||
* precomputed powers of 2 (by index, 0 to 10) | |||
* | |||
* Constants of type 'fpr': | |||
* | |||
* fpr fpr_q 12289 | |||
* fpr fpr_inverse_of_q 1/12289 | |||
* fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) | |||
* fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) | |||
* fpr fpr_sigma_min_9 1.291500756233514568549480827642 | |||
* fpr fpr_sigma_min_10 1.311734375905083682667395805765 | |||
* fpr fpr_log2 log(2) | |||
* fpr fpr_inv_log2 1/log(2) | |||
* fpr fpr_bnorm_max 16822.4121 | |||
* fpr fpr_zero 0 | |||
* fpr fpr_one 1 | |||
* fpr fpr_two 2 | |||
* fpr fpr_onehalf 0.5 | |||
* fpr fpr_ptwo31 2^31 | |||
* fpr fpr_ptwo31m1 2^31-1 | |||
* fpr fpr_mtwo31m1 -(2^31-1) | |||
* fpr fpr_ptwo63m1 2^63-1 | |||
* fpr fpr_mtwo63m1 -(2^63-1) | |||
* fpr fpr_ptwo63 2^63 | |||
*/ | |||
/* ==================================================================== */ | |||
/* | |||
* RNG (rng.c). | |||
* | |||
* A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 | |||
* context (flipped) and is used for bulk pseudorandom generation. | |||
* A system-dependent seed generator is also provided. | |||
*/ | |||
/* | |||
* Obtain a random seed from the system RNG. | |||
* | |||
* Returned value is 1 on success, 0 on error. | |||
*/ | |||
int PQCLEAN_FALCON512_AVX2_get_seed(void *seed, size_t seed_len); | |||
/*
 * Structure for a PRNG. This includes a large buffer so that values
 * get generated in advance.
 *
 *   buf     pre-generated output bytes
 *   ptr     index in buf.d[] of the next byte to hand out
 *   state   current PRNG algorithm state (contents depend on the
 *           selected algorithm)
 *   type    presumably identifies the selected algorithm; it is not
 *           used by the inline accessors in this header (TODO confirm)
 *
 * The unions with 'dummy_u64' are there to ensure proper alignment for
 * 64-bit direct access.
 */
typedef struct {
    union {
        uint8_t d[512]; /* MUST be 512, exactly */
        uint64_t dummy_u64;
    } buf;
    size_t ptr;
    union {
        uint8_t d[256];
        uint64_t dummy_u64;
    } state;
    int type;
} prng;
/* | |||
* Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 | |||
* context (in "flipped" state) to obtain its initial state. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_prng_init(prng *p, inner_shake256_context *src); | |||
/* | |||
* Refill the PRNG buffer. This is normally invoked automatically, and | |||
* is declared here only so that prng_get_u64() may be inlined. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_prng_refill(prng *p); | |||
/* | |||
* Get some bytes from a PRNG. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len); | |||
/* | |||
* Get a 64-bit random value from a PRNG. | |||
*/ | |||
static inline uint64_t | |||
prng_get_u64(prng *p) { | |||
size_t u; | |||
/* | |||
* If there are less than 9 bytes in the buffer, we refill it. | |||
* This means that we may drop the last few bytes, but this allows | |||
* for faster extraction code. Also, it means that we never leave | |||
* an empty buffer. | |||
*/ | |||
u = p->ptr; | |||
if (u >= (sizeof p->buf.d) - 9) { | |||
PQCLEAN_FALCON512_AVX2_prng_refill(p); | |||
u = 0; | |||
} | |||
p->ptr = u + 8; | |||
return (uint64_t)p->buf.d[u + 0] | |||
| ((uint64_t)p->buf.d[u + 1] << 8) | |||
| ((uint64_t)p->buf.d[u + 2] << 16) | |||
| ((uint64_t)p->buf.d[u + 3] << 24) | |||
| ((uint64_t)p->buf.d[u + 4] << 32) | |||
| ((uint64_t)p->buf.d[u + 5] << 40) | |||
| ((uint64_t)p->buf.d[u + 6] << 48) | |||
| ((uint64_t)p->buf.d[u + 7] << 56); | |||
} | |||
/* | |||
* Get an 8-bit random value from a PRNG. | |||
*/ | |||
static inline unsigned | |||
prng_get_u8(prng *p) { | |||
unsigned v; | |||
v = p->buf.d[p->ptr ++]; | |||
if (p->ptr == sizeof p->buf.d) { | |||
PQCLEAN_FALCON512_AVX2_prng_refill(p); | |||
} | |||
return v; | |||
} | |||
/* ==================================================================== */ | |||
/* | |||
* FFT (fft.c). | |||
* | |||
* A real polynomial is represented as an array of N 'fpr' elements. | |||
* The FFT representation of a real polynomial contains N/2 complex | |||
* elements; each is stored as two real numbers, for the real and | |||
* imaginary parts, respectively. See fft.c for details on the | |||
* internal representation. | |||
*/ | |||
/* | |||
* Compute FFT in-place: the source array should contain a real | |||
* polynomial (N coefficients); its storage area is reused to store | |||
* the FFT representation of that polynomial (N/2 complex numbers). | |||
* | |||
* 'logn' MUST lie between 1 and 10 (inclusive). | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_FFT(fpr *f, unsigned logn); | |||
/* | |||
* Compute the inverse FFT in-place: the source array should contain the | |||
* FFT representation of a real polynomial (N/2 elements); the resulting | |||
* real polynomial (N coefficients of type 'fpr') is written over the | |||
* array. | |||
* | |||
* 'logn' MUST lie between 1 and 10 (inclusive). | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_iFFT(fpr *f, unsigned logn); | |||
/* | |||
* Add polynomial b to polynomial a. a and b MUST NOT overlap. This | |||
* function works in both normal and FFT representations. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_add(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This | |||
* function works in both normal and FFT representations. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_sub(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Negate polynomial a. This function works in both normal and FFT | |||
* representations. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_neg(fpr *a, unsigned logn); | |||
/* | |||
* Compute adjoint of polynomial a. This function works only in FFT | |||
* representation. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_adj_fft(fpr *a, unsigned logn); | |||
/* | |||
* Multiply polynomial a with polynomial b. a and b MUST NOT overlap. | |||
* This function works only in FFT representation. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_mul_fft(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT | |||
* overlap. This function works only in FFT representation. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Multiply polynomial with its own adjoint. This function works only in FFT | |||
* representation. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn); | |||
/* | |||
* Multiply polynomial with a real constant. This function works in both | |||
* normal and FFT representations. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn); | |||
/* | |||
* Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). | |||
* a and b MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_div_fft(fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) | |||
* (also in FFT representation). Since the result is auto-adjoint, all its | |||
* coordinates in FFT representation are real; as such, only the first N/2 | |||
* values of d[] are filled (the imaginary parts are skipped). | |||
* | |||
* Array d MUST NOT overlap with either a or b. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_invnorm2_fft(fpr *d, | |||
const fpr *a, const fpr *b, unsigned logn); | |||
/* | |||
* Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) | |||
* (also in FFT representation). Destination d MUST NOT overlap with | |||
* any of the source arrays. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_add_muladj_fft(fpr *d, | |||
const fpr *F, const fpr *G, | |||
const fpr *f, const fpr *g, unsigned logn); | |||
/* | |||
* Multiply polynomial a by polynomial b, where b is autoadjoint. Both | |||
* a and b are in FFT representation. Since b is autoadjoint, all its | |||
* FFT coefficients are real, and the array b contains only N/2 elements. | |||
* a and b MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_mul_autoadj_fft(fpr *a, | |||
const fpr *b, unsigned logn); | |||
/* | |||
* Divide polynomial a by polynomial b, where b is autoadjoint. Both | |||
* a and b are in FFT representation. Since b is autoadjoint, all its | |||
* FFT coefficients are real, and the array b contains only N/2 elements. | |||
* a and b MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_div_autoadj_fft(fpr *a, | |||
const fpr *b, unsigned logn); | |||
/* | |||
* Perform an LDL decomposition of an auto-adjoint matrix G, in FFT | |||
* representation. On input, g00, g01 and g11 are provided (where the | |||
* matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 | |||
* and d11 values are written in g00, g01 and g11, respectively | |||
* (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). | |||
* (In fact, d00 = g00, so the g00 operand is left unmodified.) | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_LDL_fft(const fpr *g00, | |||
fpr *g01, fpr *g11, unsigned logn); | |||
/* | |||
* Perform an LDL decomposition of an auto-adjoint matrix G, in FFT | |||
* representation. This is identical to poly_LDL_fft() except that | |||
* g00, g01 and g11 are unmodified; the outputs d11 and l10 are written | |||
* in two other separate buffers provided as extra parameters. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_LDLmv_fft(fpr *d11, fpr *l10, | |||
const fpr *g00, const fpr *g01, | |||
const fpr *g11, unsigned logn); | |||
/* | |||
* Apply "split" operation on a polynomial in FFT representation: | |||
* f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 | |||
* (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_split_fft(fpr *f0, fpr *f1, | |||
const fpr *f, unsigned logn); | |||
/* | |||
* Apply "merge" operation on two polynomials in FFT representation: | |||
* given f0 and f1, polynomials modulo X^(N/2)+1, this function computes | |||
* f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. | |||
* f MUST NOT overlap with either f0 or f1. | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_poly_merge_fft(fpr *f, | |||
const fpr *f0, const fpr *f1, unsigned logn); | |||
/* ==================================================================== */ | |||
/* | |||
* Key pair generation. | |||
*/ | |||
/*
 * Required sizes of the temporary buffer (in bytes). The macro suffix
 * is the degree logarithm (logn).
 *
 * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
 * or 2) where it is slightly greater.
 */
#define FALCON_KEYGEN_TEMP_1      136
#define FALCON_KEYGEN_TEMP_2      272
#define FALCON_KEYGEN_TEMP_3      224
#define FALCON_KEYGEN_TEMP_4      448
#define FALCON_KEYGEN_TEMP_5      896
#define FALCON_KEYGEN_TEMP_6     1792
#define FALCON_KEYGEN_TEMP_7     3584
#define FALCON_KEYGEN_TEMP_8     7168
#define FALCON_KEYGEN_TEMP_9    14336
#define FALCON_KEYGEN_TEMP_10   28672
/* | |||
* Generate a new key pair. Randomness is extracted from the provided | |||
* SHAKE256 context, which must have already been seeded and flipped. | |||
* The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* | |||
* macros) and be aligned for the uint32_t, uint64_t and fpr types. | |||
* | |||
* The private key elements are written in f, g, F and G, and the | |||
* public key is written in h. Either or both of G and h may be NULL, | |||
* in which case the corresponding element is not returned (they can | |||
* be recomputed from f, g and F). | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_keygen(inner_shake256_context *rng, | |||
int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, | |||
unsigned logn, uint8_t *tmp); | |||
/* ==================================================================== */ | |||
/* | |||
* Signature generation. | |||
*/ | |||
/* | |||
* Expand a private key into the B0 matrix in FFT representation and | |||
* the LDL tree. All the values are written in 'expanded_key', for | |||
* a total of (8*logn+40)*2^logn bytes. | |||
* | |||
* The tmp[] array must have room for at least 48*2^logn bytes. | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_expand_privkey(fpr *expanded_key, | |||
const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, | |||
unsigned logn, uint8_t *tmp); | |||
/* | |||
* Compute a signature over the provided hashed message (hm); the | |||
* signature value is one short vector. This function uses an | |||
* expanded key (as generated by PQCLEAN_FALCON512_AVX2_expand_privkey()). | |||
* | |||
* The sig[] and hm[] buffers may overlap. | |||
* | |||
* On successful output, the start of the tmp[] buffer contains the s1 | |||
* vector (as int16_t elements). | |||
* | |||
* The minimal size (in bytes) of tmp[] is 48*2^logn bytes. | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng, | |||
const fpr *expanded_key, | |||
const uint16_t *hm, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Compute a signature over the provided hashed message (hm); the | |||
* signature value is one short vector. This function uses a raw | |||
* key and dynamically recomputes the B0 matrix and LDL tree; this | |||
* saves RAM since there is no need for an expanded key, but | |||
* increases the signature cost. | |||
* | |||
* The sig[] and hm[] buffers may overlap. | |||
* | |||
* On successful output, the start of the tmp[] buffer contains the s1 | |||
* vector (as int16_t elements). | |||
* | |||
* The minimal size (in bytes) of tmp[] is 72*2^logn bytes. | |||
* | |||
* tmp[] must have 64-bit alignment. | |||
* This function uses floating-point rounding (see set_fpu_cw()). | |||
*/ | |||
void PQCLEAN_FALCON512_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng, | |||
const int8_t *f, const int8_t *g, | |||
const int8_t *F, const int8_t *G, | |||
const uint16_t *hm, unsigned logn, uint8_t *tmp); | |||
/* | |||
* Internal sampler engine. Exported for tests. | |||
* | |||
* sampler_context wraps around a source of random numbers (PRNG) and | |||
* the sigma_min value (nominally dependent on the degree). | |||
* | |||
* sampler() takes as parameters: | |||
* ctx pointer to the sampler_context structure | |||
* mu center for the distribution | |||
* isigma inverse of the distribution standard deviation | |||
* It returns an integer sampled along the Gaussian distribution centered | |||
* on mu and of standard deviation sigma = 1/isigma. | |||
* | |||
* gaussian0_sampler() takes as parameter a pointer to a PRNG, and | |||
* returns an integer sampled along a half-Gaussian with standard | |||
* deviation sigma0 = 1.8205 (center is 0, returned value is | |||
* nonnegative). | |||
*/ | |||
typedef struct { | |||
prng p; | |||
fpr sigma_min; | |||
} sampler_context; | |||
int PQCLEAN_FALCON512_AVX2_sampler(void *ctx, fpr mu, fpr isigma); | |||
int PQCLEAN_FALCON512_AVX2_gaussian0_sampler(prng *p); | |||
/* ==================================================================== */ | |||
#endif |
@@ -0,0 +1,384 @@ | |||
#include "api.h" | |||
#include "inner.h" | |||
#include "randombytes.h" | |||
#include <stddef.h> | |||
#include <string.h> | |||
/* | |||
* Wrapper for implementing the PQClean API. | |||
*/ | |||
/* Length (in bytes) of the random nonce hashed together with the message. */
#define NONCELEN   40
/* Length (in bytes) of the random seeds obtained from randombytes(). */
#define SEEDLEN    48
/* | |||
* Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024) | |||
* | |||
* private key: | |||
* header byte: 0101nnnn | |||
* private f (6 or 5 bits by element, depending on degree) | |||
* private g (6 or 5 bits by element, depending on degree) | |||
* private F (8 bits by element) | |||
* | |||
* public key: | |||
* header byte: 0000nnnn | |||
* public h (14 bits by element) | |||
* | |||
* signature: | |||
* header byte: 0011nnnn | |||
* nonce 40 bytes | |||
* value (12 bits by element) | |||
* | |||
* message + signature: | |||
* signature length (2 bytes, big-endian) | |||
* nonce 40 bytes | |||
* message | |||
* header byte: 0010nnnn | |||
* value (12 bits by element) | |||
* (signature length is 1+len(value), not counting the nonce) | |||
*/ | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON512_AVX2_crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { | |||
union { | |||
uint8_t b[FALCON_KEYGEN_TEMP_9]; | |||
uint64_t dummy_u64; | |||
fpr dummy_fpr; | |||
} tmp; | |||
int8_t f[512], g[512], F[512]; | |||
uint16_t h[512]; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context rng; | |||
size_t u, v; | |||
/* | |||
* Generate key pair. | |||
*/ | |||
randombytes(seed, sizeof seed); | |||
inner_shake256_init(&rng); | |||
inner_shake256_inject(&rng, seed, sizeof seed); | |||
inner_shake256_flip(&rng); | |||
PQCLEAN_FALCON512_AVX2_keygen(&rng, f, g, F, NULL, h, 9, tmp.b); | |||
inner_shake256_ctx_release(&rng); | |||
/* | |||
* Encode private key. | |||
*/ | |||
sk[0] = 0x50 + 9; | |||
u = 1; | |||
v = PQCLEAN_FALCON512_AVX2_trim_i8_encode( | |||
sk + u, PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES - u, | |||
f, 9, PQCLEAN_FALCON512_AVX2_max_fg_bits[9]); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON512_AVX2_trim_i8_encode( | |||
sk + u, PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES - u, | |||
g, 9, PQCLEAN_FALCON512_AVX2_max_fg_bits[9]); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON512_AVX2_trim_i8_encode( | |||
sk + u, PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES - u, | |||
F, 9, PQCLEAN_FALCON512_AVX2_max_FG_bits[9]); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
if (u != PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES) { | |||
return -1; | |||
} | |||
/* | |||
* Encode public key. | |||
*/ | |||
pk[0] = 0x00 + 9; | |||
v = PQCLEAN_FALCON512_AVX2_modq_encode( | |||
pk + 1, PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES - 1, | |||
h, 9); | |||
if (v != PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES - 1) { | |||
return -1; | |||
} | |||
return 0; | |||
} | |||
/* | |||
* Compute the signature. nonce[] receives the nonce and must have length | |||
* NONCELEN bytes. sigbuf[] receives the signature value (without nonce | |||
* or header byte), with *sigbuflen providing the maximum value length and | |||
* receiving the actual value length. | |||
* | |||
* If a signature could be computed but not encoded because it would | |||
* exceed the output buffer size, then a new signature is computed. If | |||
* the provided buffer size is too low, this could loop indefinitely, so | |||
* the caller must provide a size that can accommodate signatures with a | |||
* large enough probability. | |||
* | |||
* Return value: 0 on success, -1 on error. | |||
*/ | |||
static int | |||
do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
union { | |||
uint8_t b[72 * 512]; | |||
uint64_t dummy_u64; | |||
fpr dummy_fpr; | |||
} tmp; | |||
int8_t f[512], g[512], F[512], G[512]; | |||
union { | |||
int16_t sig[512]; | |||
uint16_t hm[512]; | |||
} r; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context sc; | |||
size_t u, v; | |||
/* | |||
* Decode the private key. | |||
*/ | |||
if (sk[0] != 0x50 + 9) { | |||
return -1; | |||
} | |||
u = 1; | |||
v = PQCLEAN_FALCON512_AVX2_trim_i8_decode( | |||
f, 9, PQCLEAN_FALCON512_AVX2_max_fg_bits[9], | |||
sk + u, PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES - u); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON512_AVX2_trim_i8_decode( | |||
g, 9, PQCLEAN_FALCON512_AVX2_max_fg_bits[9], | |||
sk + u, PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES - u); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
v = PQCLEAN_FALCON512_AVX2_trim_i8_decode( | |||
F, 9, PQCLEAN_FALCON512_AVX2_max_FG_bits[9], | |||
sk + u, PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES - u); | |||
if (v == 0) { | |||
return -1; | |||
} | |||
u += v; | |||
if (u != PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES) { | |||
return -1; | |||
} | |||
if (!PQCLEAN_FALCON512_AVX2_complete_private(G, f, g, F, 9, tmp.b)) { | |||
return -1; | |||
} | |||
/* | |||
* Create a random nonce (40 bytes). | |||
*/ | |||
randombytes(nonce, NONCELEN); | |||
/* | |||
* Hash message nonce + message into a vector. | |||
*/ | |||
inner_shake256_init(&sc); | |||
inner_shake256_inject(&sc, nonce, NONCELEN); | |||
inner_shake256_inject(&sc, m, mlen); | |||
inner_shake256_flip(&sc); | |||
PQCLEAN_FALCON512_AVX2_hash_to_point_ct(&sc, r.hm, 9, tmp.b); | |||
inner_shake256_ctx_release(&sc); | |||
/* | |||
* Initialize a RNG. | |||
*/ | |||
randombytes(seed, sizeof seed); | |||
inner_shake256_init(&sc); | |||
inner_shake256_inject(&sc, seed, sizeof seed); | |||
inner_shake256_flip(&sc); | |||
/* | |||
* Compute and return the signature. This loops until a signature | |||
* value is found that fits in the provided buffer. | |||
*/ | |||
for (;;) { | |||
PQCLEAN_FALCON512_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b); | |||
v = PQCLEAN_FALCON512_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 9); | |||
if (v != 0) { | |||
inner_shake256_ctx_release(&sc); | |||
*sigbuflen = v; | |||
return 0; | |||
} | |||
} | |||
} | |||
/* | |||
* Verify a sigature. The nonce has size NONCELEN bytes. sigbuf[] | |||
* (of size sigbuflen) contains the signature value, not including the | |||
* header byte or nonce. Return value is 0 on success, -1 on error. | |||
*/ | |||
static int | |||
do_verify( | |||
const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
union { | |||
uint8_t b[2 * 512]; | |||
uint64_t dummy_u64; | |||
fpr dummy_fpr; | |||
} tmp; | |||
uint16_t h[512], hm[512]; | |||
int16_t sig[512]; | |||
inner_shake256_context sc; | |||
/* | |||
* Decode public key. | |||
*/ | |||
if (pk[0] != 0x00 + 9) { | |||
return -1; | |||
} | |||
if (PQCLEAN_FALCON512_AVX2_modq_decode(h, 9, | |||
pk + 1, PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES - 1) | |||
!= PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES - 1) { | |||
return -1; | |||
} | |||
PQCLEAN_FALCON512_AVX2_to_ntt_monty(h, 9); | |||
/* | |||
* Decode signature. | |||
*/ | |||
if (sigbuflen == 0) { | |||
return -1; | |||
} | |||
if (PQCLEAN_FALCON512_AVX2_comp_decode(sig, 9, sigbuf, sigbuflen) != sigbuflen) { | |||
return -1; | |||
} | |||
/* | |||
* Hash nonce + message into a vector. | |||
*/ | |||
inner_shake256_init(&sc); | |||
inner_shake256_inject(&sc, nonce, NONCELEN); | |||
inner_shake256_inject(&sc, m, mlen); | |||
inner_shake256_flip(&sc); | |||
PQCLEAN_FALCON512_AVX2_hash_to_point_ct(&sc, hm, 9, tmp.b); | |||
inner_shake256_ctx_release(&sc); | |||
/* | |||
* Verify signature. | |||
*/ | |||
if (!PQCLEAN_FALCON512_AVX2_verify_raw(hm, sig, h, 9, tmp.b)) { | |||
return -1; | |||
} | |||
return 0; | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON512_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
/* | |||
* The PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES constant is used for | |||
* the signed message object (as produced by PQCLEAN_FALCON512_AVX2_crypto_sign()) | |||
* and includes a two-byte length value, so we take care here | |||
* to only generate signatures that are two bytes shorter than | |||
* the maximum. This is done to ensure that PQCLEAN_FALCON512_AVX2_crypto_sign() | |||
* and PQCLEAN_FALCON512_AVX2_crypto_sign_signature() produce the exact same signature | |||
* value, if used on the same message, with the same private key, | |||
* and using the same output from randombytes() (this is for | |||
* reproducibility of tests). | |||
*/ | |||
size_t vlen; | |||
vlen = PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES - NONCELEN - 3; | |||
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) { | |||
return -1; | |||
} | |||
sig[0] = 0x30 + 9; | |||
*siglen = 1 + NONCELEN + vlen; | |||
return 0; | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON512_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
if (siglen < 1 + NONCELEN) { | |||
return -1; | |||
} | |||
if (sig[0] != 0x30 + 9) { | |||
return -1; | |||
} | |||
return do_verify(sig + 1, | |||
sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk); | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON512_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
uint8_t *pm, *sigbuf; | |||
size_t sigbuflen; | |||
/* | |||
* Move the message to its final location; this is a memmove() so | |||
* it handles overlaps properly. | |||
*/ | |||
memmove(sm + 2 + NONCELEN, m, mlen); | |||
pm = sm + 2 + NONCELEN; | |||
sigbuf = pm + 1 + mlen; | |||
sigbuflen = PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES - NONCELEN - 3; | |||
if (do_sign(sm + 2, sigbuf, &sigbuflen, pm, mlen, sk) < 0) { | |||
return -1; | |||
} | |||
pm[mlen] = 0x20 + 9; | |||
sigbuflen ++; | |||
sm[0] = (uint8_t)(sigbuflen >> 8); | |||
sm[1] = (uint8_t)sigbuflen; | |||
*smlen = mlen + 2 + NONCELEN + sigbuflen; | |||
return 0; | |||
} | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON512_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk) { | |||
const uint8_t *sigbuf; | |||
size_t pmlen, sigbuflen; | |||
if (smlen < 3 + NONCELEN) { | |||
return -1; | |||
} | |||
sigbuflen = ((size_t)sm[0] << 8) | (size_t)sm[1]; | |||
if (sigbuflen < 2 || sigbuflen > (smlen - NONCELEN - 2)) { | |||
return -1; | |||
} | |||
sigbuflen --; | |||
pmlen = smlen - NONCELEN - 3 - sigbuflen; | |||
if (sm[2 + NONCELEN + pmlen] != 0x20 + 9) { | |||
return -1; | |||
} | |||
sigbuf = sm + 2 + NONCELEN + pmlen + 1; | |||
/* | |||
* The 2-byte length header and the one-byte signature header | |||
* have been verified. Nonce is at sm+2, followed by the message | |||
* itself. Message length is in pmlen. sigbuf/sigbuflen point to | |||
* the signature value (excluding the header byte). | |||
*/ | |||
if (do_verify(sm + 2, sigbuf, sigbuflen, | |||
sm + 2 + NONCELEN, pmlen, pk) < 0) { | |||
return -1; | |||
} | |||
/* | |||
* Signature is correct, we just have to copy/move the message | |||
* to its final destination. The memmove() properly handles | |||
* overlaps. | |||
*/ | |||
memmove(m, sm + 2 + NONCELEN, pmlen); | |||
*mlen = pmlen; | |||
return 0; | |||
} |
@@ -0,0 +1,195 @@ | |||
#include "inner.h" | |||
#include <assert.h> | |||
/* | |||
* PRNG and interface to the system RNG. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* | |||
* Include relevant system header files. For Win32, this will also need | |||
* linking with advapi32.dll, which we trigger with an appropriate #pragma. | |||
*/ | |||
/*
 * Obtain a random seed from the system RNG (see inner.h).
 *
 * No system RNG is wired in here: seed[] is never written to, and only
 * an empty request (len == 0) can be satisfied.
 *
 * Returned value is 1 on success (len == 0), 0 on error (any other
 * length).
 */
int
PQCLEAN_FALCON512_AVX2_get_seed(void *seed, size_t len) {
    (void)seed;
    if (len == 0) {
        return 1;
    }
    return 0;
}
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON512_AVX2_prng_init(prng *p, inner_shake256_context *src) { | |||
inner_shake256_extract(src, p->state.d, 56); | |||
PQCLEAN_FALCON512_AVX2_prng_refill(p); | |||
} | |||
/* | |||
* PRNG based on ChaCha20. | |||
* | |||
* State consists of key (32 bytes) then IV (16 bytes) and block counter | |||
* (8 bytes). Normally, we should not care about local endianness (this | |||
* is for a PRNG), but for the NIST competition we need reproducible KAT | |||
* vectors that work across architectures, so we enforce little-endian | |||
* interpretation where applicable. Moreover, output words are "spread | |||
* out" over the output buffer with the interleaving pattern that is | |||
* naturally obtained from the AVX2 implementation that runs eight | |||
* ChaCha20 instances in parallel. | |||
* | |||
* The block counter is XORed into the first 8 bytes of the IV. | |||
*/ | |||
void | |||
PQCLEAN_FALCON512_AVX2_prng_refill(prng *p) { | |||
static const uint32_t CW[] = { | |||
0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 | |||
}; | |||
uint64_t cc; | |||
size_t u; | |||
int i; | |||
uint32_t *sw; | |||
union { | |||
uint32_t w[16]; | |||
__m256i y[2]; /* for alignment */ | |||
} t; | |||
__m256i state[16], init[16]; | |||
sw = (uint32_t *)p->state.d; | |||
/* | |||
* XOR next counter values into state. | |||
*/ | |||
cc = *(uint64_t *)(p->state.d + 48); | |||
for (u = 0; u < 8; u ++) { | |||
t.w[u] = (uint32_t)(cc + u); | |||
t.w[u + 8] = (uint32_t)((cc + u) >> 32); | |||
} | |||
*(uint64_t *)(p->state.d + 48) = cc + 8; | |||
/* | |||
* Load state. | |||
*/ | |||
for (u = 0; u < 4; u ++) { | |||
state[u] = init[u] = | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)CW[u])); | |||
} | |||
for (u = 0; u < 10; u ++) { | |||
state[u + 4] = init[u + 4] = | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[u])); | |||
} | |||
state[14] = init[14] = _mm256_xor_si256( | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[10])), | |||
_mm256_loadu_si256((__m256i *)&t.w[0])); | |||
state[15] = init[15] = _mm256_xor_si256( | |||
_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[11])), | |||
_mm256_loadu_si256((__m256i *)&t.w[8])); | |||
/* | |||
* Do all rounds. | |||
*/ | |||
for (i = 0; i < 10; i ++) { | |||
#define QROUND(a, b, c, d) do { \ | |||
state[a] = _mm256_add_epi32(state[a], state[b]); \ | |||
state[d] = _mm256_xor_si256(state[d], state[a]); \ | |||
state[d] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[d], 16), \ | |||
_mm256_srli_epi32(state[d], 16)); \ | |||
state[c] = _mm256_add_epi32(state[c], state[d]); \ | |||
state[b] = _mm256_xor_si256(state[b], state[c]); \ | |||
state[b] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[b], 12), \ | |||
_mm256_srli_epi32(state[b], 20)); \ | |||
state[a] = _mm256_add_epi32(state[a], state[b]); \ | |||
state[d] = _mm256_xor_si256(state[d], state[a]); \ | |||
state[d] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[d], 8), \ | |||
_mm256_srli_epi32(state[d], 24)); \ | |||
state[c] = _mm256_add_epi32(state[c], state[d]); \ | |||
state[b] = _mm256_xor_si256(state[b], state[c]); \ | |||
state[b] = _mm256_or_si256( \ | |||
_mm256_slli_epi32(state[b], 7), \ | |||
_mm256_srli_epi32(state[b], 25)); \ | |||
} while (0) | |||
QROUND( 0, 4, 8, 12); | |||
QROUND( 1, 5, 9, 13); | |||
QROUND( 2, 6, 10, 14); | |||
QROUND( 3, 7, 11, 15); | |||
QROUND( 0, 5, 10, 15); | |||
QROUND( 1, 6, 11, 12); | |||
QROUND( 2, 7, 8, 13); | |||
QROUND( 3, 4, 9, 14); | |||
#undef QROUND | |||
} | |||
/* | |||
* Add initial state back and encode the result in the destination | |||
* buffer. We can dump the AVX2 values "as is" because the non-AVX2 | |||
* code uses a compatible order of values. | |||
*/ | |||
for (u = 0; u < 16; u ++) { | |||
_mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], | |||
_mm256_add_epi32(state[u], init[u])); | |||
} | |||
p->ptr = 0; | |||
} | |||
/* see inner.h */ | |||
void | |||
PQCLEAN_FALCON512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) { | |||
uint8_t *buf; | |||
buf = dst; | |||
while (len > 0) { | |||
size_t clen; | |||
clen = (sizeof p->buf.d) - p->ptr; | |||
if (clen > len) { | |||
clen = len; | |||
} | |||
memcpy(buf, p->buf.d, clen); | |||
buf += clen; | |||
len -= clen; | |||
p->ptr += clen; | |||
if (p->ptr == sizeof p->buf.d) { | |||
PQCLEAN_FALCON512_AVX2_prng_refill(p); | |||
} | |||
} | |||
} |
@@ -0,0 +1,853 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon signature verification. | |||
* | |||
* ==========================(LICENSE BEGIN)============================ | |||
* | |||
* Copyright (c) 2017-2019 Falcon Project | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
* ===========================(LICENSE END)============================= | |||
* | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
/* ===================================================================== */ | |||
/*
 * Constants for NTT.
 *
 *   n = 2^logn  (2 <= n <= 1024)
 *   phi = X^n + 1
 *   q = 12289
 *   q0i = -1/q mod 2^16
 *   R = 2^16 mod q
 *   R2 = 2^32 mod q
 *
 * Q0I is used by mq_montymul() to cancel the low 16 bits of a product;
 * R is the Montgomery representation of 1 and R2 is the factor that
 * converts a value into Montgomery representation.
 */
#define Q     12289    /* modulus q */
#define Q0I   12287    /* -1/q mod 2^16 */
#define R      4091    /* 2^16 mod q */
#define R2    10952    /* 2^32 mod q */
/*
 * Table for NTT, binary case:
 *   GMb[x] = R*(g^rev(x)) mod q
 * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
 * and rev() is the bit-reversal function over 10 bits.
 *
 * The table has 1024 entries; entry 0 is R itself (g^0 in Montgomery
 * representation). mq_NTT() reads entry m + i for butterfly group i
 * of the stage that has m groups.
 */
static const uint16_t GMb[] = {
    4091,  7888, 11060, 11208,  6960,  4342,  6275,  9759,
    1591,  6399,  9477,  5266,   586,  5825,  7538,  9710,
    1134,  6407,  1711,   965,  7099,  7674,  3743,  6442,
    10414,  8100,  1885,  1688,  1364, 10329, 10164,  9180,
    12210,  6240,   997,   117,  4783,  4407,  1549,  7072,
    2829,  6458,  4431,  8877,  7144,  2564,  5664,  4042,
    12189,   432, 10751,  1237,  7610,  1534,  3983,  7863,
    2181,  6308,  8720,  6570,  4843,  1690,    14,  3872,
    5569,  9368, 12163,  2019,  7543,  2315,  4673,  7340,
    1553,  1156,  8401, 11389,  1020,  2967, 10772,  7045,
    3316, 11236,  5285, 11578, 10637, 10086,  9493,  6180,
    9277,  6130,  3323,   883, 10469,   489,  1502,  2851,
    11061,  9729,  2742, 12241,  4970, 10481, 10078,  1195,
    730,  1762,  3854,  2030,  5892, 10922,  9020,  5274,
    9179,  3604,  3782, 10206,  3180,  3467,  4668,  2446,
    7613,  9386,   834,  7703,  6836,  3403,  5351, 12276,
    3580,  1739, 10820,  9787, 10209,  4070, 12250,  8525,
    10401,  2749,  7338, 10574,  6040,   943,  9330,  1477,
    6865,  9668,  3585,  6633, 12145,  4063,  3684,  7680,
    8188,  6902,  3533,  9807,  6090,   727, 10099,  7003,
    6945,  1949,  9731, 10559,  6057,   378,  7871,  8763,
    8901,  9229,  8846,  4551,  9589, 11664,  7630,  8821,
    5680,  4956,  6251,  8388, 10156,  8723,  2341,  3159,
    1467,  5460,  8553,  7783,  2649,  2320,  9036,  6188,
    737,  3698,  4699,  5753,  9046,  3687,    16,   914,
    5186, 10531,  4552,  1964,  3509,  8436,  7516,  5381,
    10733,  3281,  7037,  1060,  2895,  7156,  8887,  5357,
    6409,  8197,  2962,  6375,  5064,  6634,  5625,   278,
    932, 10229,  8927,  7642,   351,  9298,   237,  5858,
    7692,  3146, 12126,  7586,  2053, 11285,  3802,  5204,
    4602,  1748, 11300,   340,  3711,  4614,   300, 10993,
    5070, 10049, 11616, 12247,  7421, 10707,  5746,  5654,
    3835,  5553,  1224,  8476,  9237,  3845,   250, 11209,
    4225,  6326,  9680, 12254,  4136,  2778,   692,  8808,
    6410,  6718, 10105, 10418,  3759,  7356, 11361,  8433,
    6437,  3652,  6342,  8978,  5391,  2272,  6476,  7416,
    8418, 10824, 11986,  5733,   876,  7030,  2167,  2436,
    3442,  9217,  8206,  4858,  5964,  2746,  7178,  1434,
    7389,  8879, 10661, 11457,  4220,  1432, 10832,  4328,
    8557,  1867,  9454,  2416,  3816,  9076,   686,  5393,
    2523,  4339,  6115,   619,   937,  2834,  7775,  3279,
    2363,  7488,  6112,  5056,   824, 10204, 11690,  1113,
    2727,  9848,   896,  2028,  5075,  2654, 10464,  7884,
    12169,  5434,  3070,  6400,  9132, 11672, 12153,  4520,
    1273,  9739, 11468,  9937, 10039,  9720,  2262,  9399,
    11192,   315,  4511,  1158,  6061,  6751, 11865,   357,
    7367,  4550,   983,  8534,  8352, 10126,  7530,  9253,
    4367,  5221,  3999,  8777,  3161,  6990,  4130, 11652,
    3374, 11477,  1753,   292,  8681,  2806, 10378, 12188,
    5800, 11811,  3181,  1988,  1024,  9340,  2477, 10928,
    4582,  6750,  3619,  5503,  5233,  2463,  8470,  7650,
    7964,  6395,  1071,  1272,  3474, 11045,  3291, 11344,
    8502,  9478,  9837,  1253,  1857,  6233,  4720, 11561,
    6034,  9817,  3339,  1797,  2879,  6242,  5200,  2114,
    7962,  9353, 11363,  5475,  6084,  9601,  4108,  7323,
    10438,  9471,  1271,   408,  6911,  3079,   360,  8276,
    11535,  9156,  9049, 11539,   850,  8617,   784,  7919,
    8334, 12170,  1846, 10213, 12184,  7827, 11903,  5600,
    9779,  1012,   721,  2784,  6676,  6552,  5348,  4424,
    6816,  8405,  9959,  5150,  2356,  5552,  5267,  1333,
    8801,  9661,  7308,  5788,  4910,   909, 11613,  4395,
    8238,  6686,  4302,  3044,  2285, 12249,  1963,  9216,
    4296, 11918,   695,  4371,  9793,  4884,  2411, 10230,
    2650,   841,  3890, 10231,  7248,  8505, 11196,  6688,
    4059,  6060,  3686,  4722, 11853,  5816,  7058,  6868,
    11137,  7926,  4894, 12284,  4102,  3908,  3610,  6525,
    7938,  7982, 11977,  6755,   537,  4562,  1623,  8227,
    11453,  7544,   906, 11816,  9548, 10858,  9703,  2815,
    11736,  6813,  6979,   819,  8903,  6271, 10843,   348,
    7514,  8339,  6439,   694,   852,  5659,  2781,  3716,
    11589,  3024,  1523,  8659,  4114, 10738,  3303,  5885,
    2978,  7289, 11884,  9123,  9323, 11830,    98,  2526,
    2116,  4131, 11407,  1844,  3645,  3916,  8133,  2224,
    10871,  8092,  9651,  5989,  7140,  8480,  1670,   159,
    10923,  4918,   128,  7312,   725,  9157,  5006,  6393,
    3494,  6043, 10972,  6181, 11838,  3423, 10514,  7668,
    3693,  6658,  6905, 11953, 10212, 11922,  9101,  8365,
    5110,    45,  2400,  1921,  4377,  2720,  1695,    51,
    2808,   650,  1896,  9997,  9971, 11980,  8098,  4833,
    4135,  4257,  5838,  4765, 10985, 11532,   590, 12198,
    482, 12173,  2006,  7064, 10018,  3912, 12016, 10519,
    11362,  6954,  2210,   284,  5413,  6601,  3865, 10339,
    11188,  6231,   517,  9564, 11281,  3863,  1210,  4604,
    8160, 11447,   153,  7204,  5763,  5089,  9248, 12154,
    11748,  1354,  6672,   179,  5532,  2646,  5941, 12185,
    862,  3158,   477,  7279,  5678,  7914,  4254,   302,
    2893, 10114,  6890,  9560,  9647, 11905,  4098,  9824,
    10269,  1353, 10715,  5325,  6254,  3951,  1807,  6449,
    5159,  1308,  8315,  3404,  1877,  1231,   112,  6398,
    11724, 12272,  7286,  1459, 12274,  9896,  3456,   800,
    1397, 10678,   103,  7420,  7976,   936,   764,   632,
    7996,  8223,  8445,  7758, 10870,  9571,  2508,  1946,
    6524, 10158,  1044,  4338,  2457,  3641,  1659,  4139,
    4688,  9733, 11148,  3946,  2082,  5261,  2036, 11850,
    7636, 12236,  5366,  2380,  1399,  7720,  2100,  3217,
    10912,  8898,  7578, 11995,  2791,  1215,  3355,  2711,
    2267,  2004,  8568, 10176,  3214,  2337,  1750,  4729,
    4997,  7415,  6315, 12044,  4374,  7157,  4844,   211,
    8003, 10159,  9290, 11481,  1735,  2336,  5793,  9875,
    8192,   986,  7527,  1401,   870,  3615,  8465,  2756,
    9770,  2034, 10168,  3264,  6132,    54,  2880,  4763,
    11805,  3074,  8286,  9428,  4881,  6933,  1090, 10038,
    2567,   708,   893,  6465,  4962, 10024,  2090,  5718,
    10743,   780,  4733,  4623,  2134,  2087,  4802,   884,
    5372,  5795,  5938,  4333,  6559,  7549,  5269, 10664,
    4252,  3260,  5917, 10814,  5768,  9983,  8096,  7791,
    6800,  7491,  6272,  1907, 10947,  6289, 11803,  6032,
    11449,  1171,  9201,  7933,  2479,  7970, 11337,  7062,
    8911,  6728,  6542,  8114,  8828,  6595,  3545,  4348,
    4610,  2205,  6999,  8106,  5560, 10390,  9321,  2499,
    2413,  7272,  6881, 10582,  9308,  9437,  3554,  3326,
    5991, 11969,  3415, 12283,  9838, 12063,  4332,  7830,
    11329,  6605, 12271,  2044, 11611,  7353, 11201, 11582,
    3733,  8943,  9978,  1627,  7168,  3935,  5050,  2762,
    7496, 10383,   755,  1654, 12053,  4952, 10134,  4394,
    6592,  7898,  7497,  8904, 12029,  3581, 10748,  5674,
    10358,  4901,  7414,  8771,   710,  6764,  8462,  7193,
    5371,  7274, 11084,   290,  7864,  6827, 11822,  2509,
    6578,  4026,  5807,  1458,  5721,  5762,  4178,  2105,
    11621,  4852,  8897,  2856, 11510,  9264,  2520,  8776,
    7011,  2647,  1898,  7039,  5950, 11163,  5488,  6277,
    9182, 11456,   633, 10046, 11554,  5633,  9587,  2333,
    7008,  7084,  5047,  7199,  9865,  8997,   569,  6390,
    10845,  9679,  8268, 11472,  4203,  1997,     2,  9331,
    162,  6182,  2000,  3649,  9792,  6363,  7557,  6187,
    8510,  9935,  5536,  9019,  3706, 12009,  1452,  3067,
    5494,  9692,  4865,  6019,  7106,  9610,  4588, 10165,
    6261,  5887,  2652, 10172,  1580, 10379,  4638,  9949
};
/*
 * Table for inverse NTT, binary case:
 *   iGMb[x] = R*((1/g)^rev(x)) mod q
 * Since g = 7, 1/g = 8778 mod 12289.
 *
 * The table has 1024 entries; entry 0 is R itself. mq_iNTT() reads
 * entry hm + i for butterfly group i of the stage that has hm groups.
 */
static const uint16_t iGMb[] = {
    4091,  4401,  1081,  1229,  2530,  6014,  7947,  5329,
    2579,  4751,  6464, 11703,  7023,  2812,  5890, 10698,
    3109,  2125,  1960, 10925, 10601, 10404,  4189,  1875,
    5847,  8546,  4615,  5190, 11324, 10578,  5882, 11155,
    8417, 12275, 10599,  7446,  5719,  3569,  5981, 10108,
    4426,  8306, 10755,  4679, 11052,  1538, 11857,   100,
    8247,  6625,  9725,  5145,  3412,  7858,  5831,  9460,
    5217, 10740,  7882,  7506, 12172, 11292,  6049,    79,
    13,  6938,  8886,  5453,  4586, 11455,  2903,  4676,
    9843,  7621,  8822,  9109,  2083,  8507,  8685,  3110,
    7015,  3269,  1367,  6397, 10259,  8435, 10527, 11559,
    11094,  2211,  1808,  7319,    48,  9547,  2560,  1228,
    9438, 10787, 11800,  1820, 11406,  8966,  6159,  3012,
    6109,  2796,  2203,  1652,   711,  7004,  1053,  8973,
    5244,  1517,  9322, 11269,   900,  3888, 11133, 10736,
    4949,  7616,  9974,  4746, 10270,   126,  2921,  6720,
    6635,  6543,  1582,  4868,    42,   673,  2240,  7219,
    1296, 11989,  7675,  8578, 11949,   989, 10541,  7687,
    7085,  8487,  1004, 10236,  4703,   163,  9143,  4597,
    6431, 12052,  2991, 11938,  4647,  3362,  2060, 11357,
    12011,  6664,  5655,  7225,  5914,  9327,  4092,  5880,
    6932,  3402,  5133,  9394, 11229,  5252,  9008,  1556,
    6908,  4773,  3853,  8780, 10325,  7737,  1758,  7103,
    11375, 12273,  8602,  3243,  6536,  7590,  8591, 11552,
    6101,  3253,  9969,  9640,  4506,  3736,  6829, 10822,
    9130,  9948,  3566,  2133,  3901,  6038,  7333,  6609,
    3468,  4659,   625,  2700,  7738,  3443,  3060,  3388,
    3526,  4418, 11911,  6232,  1730,  2558, 10340,  5344,
    5286,  2190, 11562,  6199,  2482,  8756,  5387,  4101,
    4609,  8605,  8226,   144,  5656,  8704,  2621,  5424,
    10812,  2959, 11346,  6249,  1715,  4951,  9540,  1888,
    3764,    39,  8219,  2080,  2502,  1469, 10550,  8709,
    5601,  1093,  3784,  5041,  2058,  8399, 11448,  9639,
    2059,  9878,  7405,  2496,  7918, 11594,   371,  7993,
    3073, 10326,    40, 10004,  9245,  7987,  5603,  4051,
    7894,   676, 11380,  7379,  6501,  4981,  2628,  3488,
    10956,  7022,  6737,  9933,  7139,  2330,  3884,  5473,
    7865,  6941,  5737,  5613,  9505, 11568, 11277,  2510,
    6689,   386,  4462,   105,  2076, 10443,   119,  3955,
    4370, 11505,  3672, 11439,   750,  3240,  3133,   754,
    4013, 11929,  9210,  5378, 11881, 11018,  2818,  1851,
    4966,  8181,  2688,  6205,  6814,   926,  2936,  4327,
    10175,  7089,  6047,  9410, 10492,  8950,  2472,  6255,
    728,  7569,  6056, 10432, 11036,  2452,  2811,  3787,
    945,  8998,  1244,  8815, 11017, 11218,  5894,  4325,
    4639,  3819,  9826,  7056,  6786,  8670,  5539,  7707,
    1361,  9812,  2949, 11265, 10301,  9108,   478,  6489,
    101,  1911,  9483,  3608, 11997, 10536,   812,  8915,
    637,  8159,  5299,  9128,  3512,  8290,  7068,  7922,
    3036,  4759,  2163,  3937,  3755, 11306,  7739,  4922,
    11932,   424,  5538,  6228, 11131,  7778, 11974,  1097,
    2890, 10027,  2569,  2250,  2352,   821,  2550, 11016,
    7769,   136,   617,  3157,  5889,  9219,  6855,   120,
    4405,  1825,  9635,  7214, 10261, 11393,  2441,  9562,
    11176,   599,  2085, 11465,  7233,  6177,  4801,  9926,
    9010,  4514,  9455, 11352, 11670,  6174,  7950,  9766,
    6896, 11603,  3213,  8473,  9873,  2835, 10422,  3732,
    7961,  1457, 10857,  8069,   832,  1628,  3410,  4900,
    10855,  5111,  9543,  6325,  7431,  4083,  3072,  8847,
    9853, 10122,  5259, 11413,  6556,   303,  1465,  3871,
    4873,  5813, 10017,  6898,  3311,  5947,  8637,  5852,
    3856,   928,  4933,  8530,  1871,  2184,  5571,  5879,
    3481, 11597,  9511,  8153,    35,  2609,  5963,  8064,
    1080, 12039,  8444,  3052,  3813, 11065,  6736,  8454,
    2340,  7651,  1910, 10709,  2117,  9637,  6402,  6028,
    2124,  7701,  2679,  5183,  6270,  7424,  2597,  6795,
    9222, 10837,   280,  8583,  3270,  6753,  2354,  3779,
    6102,  4732,  5926,  2497,  8640, 10289,  6107, 12127,
    2958, 12287, 10292,  8086,   817,  4021,  2610,  1444,
    5899, 11720,  3292,  2424,  5090,  7242,  5205,  5281,
    9956,  2702,  6656,   735,  2243, 11656,   833,  3107,
    6012,  6801,  1126,  6339,  5250, 10391,  9642,  5278,
    3513,  9769,  3025,   779,  9433,  3392,  7437,   668,
    10184,  8111,  6527,  6568, 10831,  6482,  8263,  5711,
    9780,   467,  5462,  4425, 11999,  1205,  5015,  6918,
    5096,  3827,  5525, 11579,  3518,  4875,  7388,  1931,
    6615,  1541,  8708,   260,  3385,  4792,  4391,  5697,
    7895,  2155,  7337,   236, 10635, 11534,  1906,  4793,
    9527,  7239,  8354,  5121, 10662,  2311,  3346,  8556,
    707,  1088,  4936,   678, 10245,    18,  5684,   960,
    4459,  7957,   226,  2451,     6,  8874,   320,  6298,
    8963,  8735,  2852,  2981,  1707,  5408,  5017,  9876,
    9790,  2968,  1899,  6729,  4183,  5290, 10084,  7679,
    7941,  8744,  5694,  3461,  4175,  5747,  5561,  3378,
    5227,   952,  4319,  9810,  4356,  3088, 11118,   840,
    6257,   486,  6000,  1342, 10382,  6017,  4798,  5489,
    4498,  4193,  2306,  6521,  1475,  6372,  9029,  8037,
    1625,  7020,  4740,  5730,  7956,  6351,  6494,  6917,
    11405,  7487, 10202, 10155,  7666,  7556, 11509,  1546,
    6571, 10199,  2265,  7327,  5824, 11396, 11581,  9722,
    2251, 11199,  5356,  7408,  2861,  4003,  9215,   484,
    7526,  9409, 12235,  6157,  9025,  2121, 10255,  2519,
    9533,  3824,  8674, 11419, 10888,  4762, 11303,  4097,
    2414,  6496,  9953, 10554,   808,  2999,  2130,  4286,
    12078,  7445,  5132,  7915,   245,  5974,  4874,  7292,
    7560, 10539,  9952,  9075,  2113,  3721, 10285, 10022,
    9578,  8934, 11074,  9498,   294,  4711,  3391,  1377,
    9072, 10189,  4569, 10890,  9909,  6923,    53,  4653,
    439, 10253,  7028, 10207,  8343,  1141,  2556,  7601,
    8150, 10630,  8648,  9832,  7951, 11245,  2131,  5765,
    10343,  9781,  2718,  1419,  4531,  3844,  4066,  4293,
    11657, 11525, 11353,  4313,  4869, 12186,  1611, 10892,
    11489,  8833,  2393,    15, 10830,  5003,    17,   565,
    5891, 12177, 11058, 10412,  8885,  3974, 10981,  7130,
    5840, 10482,  8338,  6035,  6964,  1574, 10936,  2020,
    2465,  8191,   384,  2642,  2729,  5399,  2175,  9396,
    11987,  8035,  4375,  6611,  5010, 11812,  9131, 11427,
    104,  6348,  9643,  6757, 12110,  5617, 10935,   541,
    135,  3041,  7200,  6526,  5085, 12136,   842,  4129,
    7685, 11079,  8426,  1008,  2725, 11772,  6058,  1101,
    1950,  8424,  5688,  6876, 12005, 10079,  5335,   927,
    1770,   273,  8377,  2271,  5225, 10283,   116, 11807,
    91, 11699,   757,  1304,  7524,  6451,  8032,  8154,
    7456,  4191,   309,  2318,  2292, 10393, 11639,  9481,
    12238, 10594,  9569,  7912, 10368,  9889, 12244,  7179,
    3924,  3188,   367,  2077,   336,  5384,  5631,  8596,
    4621,  1775,  8866,   451,  6108,  1317,  6246,  8795,
    5896,  7283,  3132, 11564,  4977, 12161,  7371,  1366,
    12130, 10619,  3809,  5149,  6300,  2638,  4197,  1418,
    10065,  4156,  8373,  8644, 10445,   882,  8158, 10173,
    9763, 12191,   459,  2966,  3166,   405,  5000,  9311,
    6404,  8986,  1551,  8175,  3630, 10766,  9265,   700,
    8573,  9508,  6630, 11437, 11595,  5850,  3950,  4775,
    11941,  1446,  6018,  3386, 11470,  5310,  5476,   553,
    9474,  2586,  1431,  2741,   473, 11383,  4745,   836,
    4062, 10666,  7727, 11752,  5534,   312,  4307,  4351,
    5764,  8679,  8381,  8187,     5,  7395,  4363,  1152,
    5421,  5231,  6473,   436,  7567,  8603,  6229,  8230
};
/* | |||
* Reduce a small signed integer modulo q. The source integer MUST | |||
* be between -q/2 and +q/2. | |||
*/ | |||
static inline uint32_t | |||
mq_conv_small(int x) { | |||
/* | |||
* If x < 0, the cast to uint32_t will set the high bit to 1. | |||
*/ | |||
uint32_t y; | |||
y = (uint32_t)x; | |||
y += Q & -(y >> 31); | |||
return y; | |||
} | |||
/* | |||
* Addition modulo q. Operands must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_add(uint32_t x, uint32_t y) { | |||
/* | |||
* We compute x + y - q. If the result is negative, then the | |||
* high bit will be set, and 'd >> 31' will be equal to 1; | |||
* thus '-(d >> 31)' will be an all-one pattern. Otherwise, | |||
* it will be an all-zero pattern. In other words, this | |||
* implements a conditional addition of q. | |||
*/ | |||
uint32_t d; | |||
d = x + y - Q; | |||
d += Q & -(d >> 31); | |||
return d; | |||
} | |||
/* | |||
* Subtraction modulo q. Operands must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_sub(uint32_t x, uint32_t y) { | |||
/* | |||
* As in mq_add(), we use a conditional addition to ensure the | |||
* result is in the 0..q-1 range. | |||
*/ | |||
uint32_t d; | |||
d = x - y; | |||
d += Q & -(d >> 31); | |||
return d; | |||
} | |||
/* | |||
* Division by 2 modulo q. Operand must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_rshift1(uint32_t x) { | |||
x += Q & -(x & 1); | |||
return (x >> 1); | |||
} | |||
/* | |||
* Montgomery multiplication modulo q. If we set R = 2^16 mod q, then | |||
* this function computes: x * y / R mod q | |||
* Operands must be in the 0..q-1 range. | |||
*/ | |||
static inline uint32_t | |||
mq_montymul(uint32_t x, uint32_t y) { | |||
uint32_t z, w; | |||
/* | |||
* We compute x*y + k*q with a value of k chosen so that the 16 | |||
* low bits of the result are 0. We can then shift the value. | |||
* After the shift, result may still be larger than q, but it | |||
* will be lower than 2*q, so a conditional subtraction works. | |||
*/ | |||
z = x * y; | |||
w = ((z * Q0I) & 0xFFFF) * Q; | |||
/* | |||
* When adding z and w, the result will have its low 16 bits | |||
* equal to 0. Since x, y and z are lower than q, the sum will | |||
* be no more than (2^15 - 1) * q + (q - 1)^2, which will | |||
* fit on 29 bits. | |||
*/ | |||
z = (z + w) >> 16; | |||
/* | |||
* After the shift, analysis shows that the value will be less | |||
* than 2q. We do a subtraction then conditional subtraction to | |||
* ensure the result is in the expected range. | |||
*/ | |||
z -= Q; | |||
z += Q & -(z >> 31); | |||
return z; | |||
} | |||
/*
 * Montgomery squaring: returns (x * x) / R mod q, computed as a
 * Montgomery multiplication of x with itself.
 */
static inline uint32_t
mq_montysqr(uint32_t x) {
    uint32_t sq = mq_montymul(x, x);

    return sq;
}
/* | |||
* Divide x by y modulo q = 12289. | |||
*/ | |||
static inline uint32_t | |||
mq_div_12289(uint32_t x, uint32_t y) { | |||
/* | |||
* We invert y by computing y^(q-2) mod q. | |||
* | |||
* We use the following addition chain for exponent e = 12287: | |||
* | |||
* e0 = 1 | |||
* e1 = 2 * e0 = 2 | |||
* e2 = e1 + e0 = 3 | |||
* e3 = e2 + e1 = 5 | |||
* e4 = 2 * e3 = 10 | |||
* e5 = 2 * e4 = 20 | |||
* e6 = 2 * e5 = 40 | |||
* e7 = 2 * e6 = 80 | |||
* e8 = 2 * e7 = 160 | |||
* e9 = e8 + e2 = 163 | |||
* e10 = e9 + e8 = 323 | |||
* e11 = 2 * e10 = 646 | |||
* e12 = 2 * e11 = 1292 | |||
* e13 = e12 + e9 = 1455 | |||
* e14 = 2 * e13 = 2910 | |||
* e15 = 2 * e14 = 5820 | |||
* e16 = e15 + e10 = 6143 | |||
* e17 = 2 * e16 = 12286 | |||
* e18 = e17 + e0 = 12287 | |||
* | |||
* Additions on exponents are converted to Montgomery | |||
* multiplications. We define all intermediate results as so | |||
* many local variables, and let the C compiler work out which | |||
* must be kept around. | |||
*/ | |||
uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; | |||
uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; | |||
y0 = mq_montymul(y, R2); | |||
y1 = mq_montysqr(y0); | |||
y2 = mq_montymul(y1, y0); | |||
y3 = mq_montymul(y2, y1); | |||
y4 = mq_montysqr(y3); | |||
y5 = mq_montysqr(y4); | |||
y6 = mq_montysqr(y5); | |||
y7 = mq_montysqr(y6); | |||
y8 = mq_montysqr(y7); | |||
y9 = mq_montymul(y8, y2); | |||
y10 = mq_montymul(y9, y8); | |||
y11 = mq_montysqr(y10); | |||
y12 = mq_montysqr(y11); | |||
y13 = mq_montymul(y12, y9); | |||
y14 = mq_montysqr(y13); | |||
y15 = mq_montysqr(y14); | |||
y16 = mq_montymul(y15, y10); | |||
y17 = mq_montysqr(y16); | |||
y18 = mq_montymul(y17, y0); | |||
/* | |||
* Final multiplication with x, which is not in Montgomery | |||
* representation, computes the correct division result. | |||
*/ | |||
return mq_montymul(y18, x); | |||
} | |||
/* | |||
* Compute NTT on a ring element. | |||
*/ | |||
static void | |||
mq_NTT(uint16_t *a, unsigned logn) { | |||
size_t n, t, m; | |||
n = (size_t)1 << logn; | |||
t = n; | |||
for (m = 1; m < n; m <<= 1) { | |||
size_t ht, i, j1; | |||
ht = t >> 1; | |||
for (i = 0, j1 = 0; i < m; i ++, j1 += t) { | |||
size_t j, j2; | |||
uint32_t s; | |||
s = GMb[m + i]; | |||
j2 = j1 + ht; | |||
for (j = j1; j < j2; j ++) { | |||
uint32_t u, v; | |||
u = a[j]; | |||
v = mq_montymul(a[j + ht], s); | |||
a[j] = (uint16_t)mq_add(u, v); | |||
a[j + ht] = (uint16_t)mq_sub(u, v); | |||
} | |||
} | |||
t = ht; | |||
} | |||
} | |||
/* | |||
* Compute the inverse NTT on a ring element, binary case. | |||
*/ | |||
static void | |||
mq_iNTT(uint16_t *a, unsigned logn) { | |||
size_t n, t, m; | |||
uint32_t ni; | |||
n = (size_t)1 << logn; | |||
t = 1; | |||
m = n; | |||
while (m > 1) { | |||
size_t hm, dt, i, j1; | |||
hm = m >> 1; | |||
dt = t << 1; | |||
for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { | |||
size_t j, j2; | |||
uint32_t s; | |||
j2 = j1 + t; | |||
s = iGMb[hm + i]; | |||
for (j = j1; j < j2; j ++) { | |||
uint32_t u, v, w; | |||
u = a[j]; | |||
v = a[j + t]; | |||
a[j] = (uint16_t)mq_add(u, v); | |||
w = mq_sub(u, v); | |||
a[j + t] = (uint16_t) | |||
mq_montymul(w, s); | |||
} | |||
} | |||
t = dt; | |||
m = hm; | |||
} | |||
/* | |||
* To complete the inverse NTT, we must now divide all values by | |||
* n (the vector size). We thus need the inverse of n, i.e. we | |||
* need to divide 1 by 2 logn times. But we also want it in | |||
* Montgomery representation, i.e. we also want to multiply it | |||
* by R = 2^16. In the common case, this should be a simple right | |||
* shift. The loop below is generic and works also in corner cases; | |||
* its computation time is negligible. | |||
*/ | |||
ni = R; | |||
for (m = n; m > 1; m >>= 1) { | |||
ni = mq_rshift1(ni); | |||
} | |||
for (m = 0; m < n; m ++) { | |||
a[m] = (uint16_t)mq_montymul(a[m], ni); | |||
} | |||
} | |||
/* | |||
* Convert a polynomial (mod q) to Montgomery representation. | |||
*/ | |||
static void | |||
mq_poly_tomonty(uint16_t *f, unsigned logn) { | |||
size_t u, n; | |||
n = (size_t)1 << logn; | |||
for (u = 0; u < n; u ++) { | |||
f[u] = (uint16_t)mq_montymul(f[u], R2); | |||
} | |||
} | |||
/*
 * Coefficient-wise Montgomery product of two polynomials given in
 * NTT representation. The result f*g overwrites f.
 */
static void
mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
    uint16_t *end = f + ((size_t)1 << logn);

    while (f != end) {
        *f = (uint16_t)mq_montymul(*f, *g);
        f ++;
        g ++;
    }
}
/*
 * Coefficient-wise subtraction modulo q: f <- f - g.
 */
static void
mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
    uint16_t *end = f + ((size_t)1 << logn);

    while (f != end) {
        *f = (uint16_t)mq_sub(*f, *g);
        f ++;
        g ++;
    }
}
/* ===================================================================== */ | |||
/*
 * Convert h[] (2^logn values modulo q) in place: first to NTT
 * representation, then to Montgomery representation (see inner.h).
 */
void
PQCLEAN_FALCON512_AVX2_to_ntt_monty(uint16_t *h, unsigned logn) {
    /* Step 1: forward NTT. */
    mq_NTT(h, logn);

    /* Step 2: multiply every coefficient by R. */
    mq_poly_tomonty(h, logn);
}
/* see inner.h */ | |||
int | |||
PQCLEAN_FALCON512_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2, | |||
const uint16_t *h, unsigned logn, uint8_t *tmp) { | |||
size_t u, n; | |||
uint16_t *tt; | |||
n = (size_t)1 << logn; | |||
tt = (uint16_t *)tmp; | |||
/* | |||
* Reduce s2 elements modulo q ([0..q-1] range). | |||
*/ | |||
for (u = 0; u < n; u ++) { | |||
uint32_t w; | |||
w = (uint32_t)s2[u]; | |||
w += Q & -(w >> 31); | |||
tt[u] = (uint16_t)w; | |||
} | |||
/* | |||
* Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). | |||
*/ | |||
mq_NTT(tt, logn); | |||
mq_poly_montymul_ntt(tt, h, logn); | |||
mq_iNTT(tt, logn); | |||
mq_poly_sub(tt, c0, logn); | |||
/* | |||
* Normalize -s1 elements into the [-q/2..q/2] range. | |||
*/ | |||
for (u = 0; u < n; u ++) { | |||
int32_t w; | |||
w = (int32_t)tt[u]; | |||
w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); | |||
((int16_t *)tt)[u] = (int16_t)w; | |||
} | |||
/* | |||
* Signature is valid if and only if the aggregate (-s1,s2) vector | |||
* is short enough. | |||
*/ | |||
return PQCLEAN_FALCON512_AVX2_is_short((int16_t *)tt, s2, logn); | |||
} | |||
/*
 * Compute the public key h = g / f mod phi mod q (see inner.h).
 *
 * tmp[] is used as scratch space for 2^logn 16-bit values. Returns 1
 * on success, or 0 if f has a zero NTT coefficient (f is then not
 * invertible); in the latter case h[] is left partially processed.
 */
int
PQCLEAN_FALCON512_AVX2_compute_public(uint16_t *h,
                                      const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
    size_t n = (size_t)1 << logn;
    uint16_t *fq = (uint16_t *)tmp;
    size_t k;

    /* Reduce f and g modulo q, then move both to NTT representation. */
    for (k = 0; k < n; k ++) {
        fq[k] = (uint16_t)mq_conv_small(f[k]);
        h[k] = (uint16_t)mq_conv_small(g[k]);
    }
    mq_NTT(h, logn);
    mq_NTT(fq, logn);

    /* Coefficient-wise division; a zero divisor aborts immediately. */
    for (k = 0; k < n; k ++) {
        uint16_t d = fq[k];

        if (d == 0) {
            return 0;
        }
        h[k] = (uint16_t)mq_div_12289(h[k], d);
    }

    mq_iNTT(h, logn);
    return 1;
}
/* see inner.h */ | |||
int | |||
PQCLEAN_FALCON512_AVX2_complete_private(int8_t *G, | |||
const int8_t *f, const int8_t *g, const int8_t *F, | |||
unsigned logn, uint8_t *tmp) { | |||
size_t u, n; | |||
uint16_t *t1, *t2; | |||
n = (size_t)1 << logn; | |||
t1 = (uint16_t *)tmp; | |||
t2 = t1 + n; | |||
for (u = 0; u < n; u ++) { | |||
t1[u] = (uint16_t)mq_conv_small(g[u]); | |||
t2[u] = (uint16_t)mq_conv_small(F[u]); | |||
} | |||
mq_NTT(t1, logn); | |||
mq_NTT(t2, logn); | |||
mq_poly_tomonty(t1, logn); | |||
mq_poly_montymul_ntt(t1, t2, logn); | |||
for (u = 0; u < n; u ++) { | |||
t2[u] = (uint16_t)mq_conv_small(f[u]); | |||
} | |||
mq_NTT(t2, logn); | |||
for (u = 0; u < n; u ++) { | |||
if (t2[u] == 0) { | |||
return 0; | |||
} | |||
t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); | |||
} | |||
mq_iNTT(t1, logn); | |||
for (u = 0; u < n; u ++) { | |||
uint32_t w; | |||
int32_t gi; | |||
w = t1[u]; | |||
w -= (Q & ~ -((w - (Q >> 1)) >> 31)); | |||
gi = *(int32_t *)&w; | |||
if (gi < -127 || gi > +127) { | |||
return 0; | |||
} | |||
G[u] = (int8_t)gi; | |||
} | |||
return 1; | |||
} | |||
/*
 * Test whether s2 is invertible mod phi mod q (see inner.h).
 *
 * tmp[] is used as scratch space for 2^logn 16-bit values. Returns 1
 * if no NTT coefficient of s2 is zero, 0 otherwise. All coefficients
 * are always inspected; there is no early exit.
 */
int
PQCLEAN_FALCON512_AVX2_is_invertible(
    const int16_t *s2, unsigned logn, uint8_t *tmp) {
    size_t n = (size_t)1 << logn;
    uint16_t *tt = (uint16_t *)tmp;
    uint32_t acc = 0;
    size_t k;

    for (k = 0; k < n; k ++) {
        tt[k] = (uint16_t)mq_conv_small(s2[k]);
    }
    mq_NTT(tt, logn);

    /*
     * tt[k] - 1 is negative only for a zero coefficient, so the top
     * bit of acc ends up set if and only if a zero was met.
     */
    for (k = 0; k < n; k ++) {
        acc |= (uint32_t)(tt[k] - 1);
    }
    return (int)(1u - (acc >> 31));
}
/*
 * Rebuild the public key from a signature (see inner.h).
 *
 * h[] receives (c0 - s1) / s2 mod phi mod q; tmp[] is used as scratch
 * space for 2^logn 16-bit values. Returns 1 if (s1, s2) passes the
 * is_short() test and s2 has no zero NTT coefficient, 0 otherwise.
 * The caller must still check that the rebuilt public key matches
 * the expected value (e.g. through a hash).
 */
int
PQCLEAN_FALCON512_AVX2_verify_recover(uint16_t *h,
                                      const uint16_t *c0, const int16_t *s1, const int16_t *s2,
                                      unsigned logn, uint8_t *tmp) {
    size_t n = (size_t)1 << logn;
    uint16_t *tt = (uint16_t *)tmp;
    uint32_t acc = 0;
    uint32_t ok;
    size_t k;

    /*
     * tt <- s2 mod q, and h <- c0 - s1 mod q.
     */
    for (k = 0; k < n; k ++) {
        uint32_t s1q = mq_conv_small(s1[k]);

        tt[k] = (uint16_t)mq_conv_small(s2[k]);
        h[k] = (uint16_t)mq_sub(c0[k], s1q);
    }

    /*
     * h <- (c0 - s1) / s2, computed coefficient-wise in the NTT
     * domain. A zero coefficient of s2 makes the operation fail; the
     * failure is only recorded (top bit of acc set) and the loop
     * carries on, so that processing stays constant-time.
     */
    mq_NTT(tt, logn);
    mq_NTT(h, logn);
    for (k = 0; k < n; k ++) {
        acc |= (uint32_t)(tt[k] - 1);
        h[k] = (uint16_t)mq_div_12289(h[k], tt[k]);
    }
    mq_iNTT(h, logn);

    /*
     * Accept only if s2 was invertible (top bit of acc clear) and
     * the signature is short enough (mask all-ones).
     */
    ok = ~acc & (uint32_t) - PQCLEAN_FALCON512_AVX2_is_short(s1, s2, logn);
    return (int)(ok >> 31);
}
/*
 * Count the NTT coefficients of sig[] that are zero modulo q
 * (see inner.h). tmp[] is used as scratch space for 2^logn 16-bit
 * values.
 */
int
PQCLEAN_FALCON512_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
    size_t n = (size_t)1 << logn;
    uint16_t *nt = (uint16_t *)tmp;
    uint32_t zeros = 0;
    size_t k;

    for (k = 0; k < n; k ++) {
        nt[k] = (uint16_t)mq_conv_small(sig[k]);
    }
    mq_NTT(nt, logn);

    /* v - 1 wraps around (top bit set) only when v is zero. */
    for (k = 0; k < n; k ++) {
        zeros += ((uint32_t)nt[k] - 1u) >> 31;
    }
    return (int)zeros;
}
@@ -1,3 +1,4 @@ | |||
\ | |||
MIT License | |||
Copyright (c) 2017-2019 Falcon Project | |||
@@ -20,3 +21,4 @@ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
@@ -1,10 +1,10 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libfalcon-512_clean.a | |||
LIB=libfalcon512_clean.a | |||
SOURCES = codec.c common.c fft.c fpr.c keygen.c pqclean.c rng.c sign.c vrfy.c | |||
OBJECTS = codec.o common.o fft.o fpr.o keygen.o pqclean.o rng.o sign.o vrfy.o | |||
HEADERS = api.h fpr.h inner.h | |||
SOURCES = codec.c common.c fft.c fpr.c keygen.c pqclean.c rng.c sign.c vrfy.c | |||
OBJECTS = codec.o common.o fft.o fpr.o keygen.o pqclean.o rng.o sign.o vrfy.o | |||
HEADERS = api.h fpr.h inner.h | |||
CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -1,8 +1,8 @@ | |||
# This Makefile can be used with Microsoft Visual Studio's nmake using the command: | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libfalcon-512_clean.lib | |||
OBJECTS=codec.obj common.obj fft.obj fpr.obj keygen.obj pqclean.obj rng.obj sign.obj vrfy.obj | |||
LIBRARY=libfalcon512_clean.lib | |||
OBJECTS=codec.obj common.obj fft.obj fpr.obj keygen.obj pqclean.obj rng.obj sign.obj vrfy.obj | |||
# Warning C4146 is raised when a unary minus operator is applied to an | |||
# unsigned type; this has nonetheless been standard and portable for as | |||
@@ -16,7 +16,7 @@ all: $(LIBRARY) | |||
$(OBJECTS): *.h | |||
$(LIBRARY): $(OBJECTS) | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
clean: | |||
-DEL $(OBJECTS) | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Encoding/decoding of keys and signatures. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* see inner.h */ | |||
size_t | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Support functions for signatures (hash-to-point, norm). | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* see inner.h */ | |||
void | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* FFT code. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* | |||
* Rules for complex number macros: | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Floating-point operations. | |||
* | |||
@@ -32,7 +34,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* | |||
@@ -1631,4 +1632,3 @@ const fpr fpr_p2_tab[] = { | |||
4571153621781053440U, | |||
4566650022153682944U | |||
}; | |||
@@ -1,3 +1,6 @@ | |||
#ifndef PQCLEAN_FALCON512_CLEAN_FPR_H | |||
#define PQCLEAN_FALCON512_CLEAN_FPR_H | |||
/* | |||
* Floating-point operations. | |||
* | |||
@@ -467,4 +470,4 @@ extern const fpr fpr_gm_tab[]; | |||
extern const fpr fpr_p2_tab[]; | |||
/* ====================================================================== */ | |||
#endif |
@@ -1,5 +1,6 @@ | |||
#ifndef FALCON_INNER_H__ | |||
#define FALCON_INNER_H__ | |||
#ifndef PQCLEAN_FALCON512_CLEAN_INNER_H | |||
#define PQCLEAN_FALCON512_CLEAN_INNER_H | |||
/* | |||
* Internal functions for Falcon. This is not the API intended to be | |||
@@ -72,8 +73,8 @@ | |||
* proper, or integer-based emulation is used, the set_fpu_cw() | |||
* function does nothing, so it can be called systematically. | |||
*/ | |||
#include "fips202.h" | |||
#include "fpr.h" | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
@@ -115,7 +116,6 @@ set_fpu_cw(unsigned x) { | |||
*/ | |||
#include "fips202.h" | |||
#define inner_shake256_context shake256incctx | |||
#define inner_shake256_init(sc) shake256_inc_init(sc) | |||
@@ -438,7 +438,6 @@ int PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h, | |||
* fpr fpr_mtwo63m1 -(2^63-1) | |||
* fpr fpr_ptwo63 2^63 | |||
*/ | |||
#include "fpr.h" | |||
/* ==================================================================== */ | |||
/* | |||
@@ -514,10 +513,6 @@ prng_get_u64(prng *p) { | |||
} | |||
p->ptr = u + 8; | |||
/* | |||
* On systems that use little-endian encoding and allow | |||
* unaligned accesses, we can simply read the data where it is. | |||
*/ | |||
return (uint64_t)p->buf.d[u + 0] | |||
| ((uint64_t)p->buf.d[u + 1] << 8) | |||
| ((uint64_t)p->buf.d[u + 2] << 16) | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon key pair generation. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
#define MKN(logn) ((size_t)1 << (logn)) | |||
@@ -2207,7 +2208,6 @@ get_rng_u64(inner_shake256_context *rng) { | |||
| ((uint64_t)tmp[7] << 56); | |||
} | |||
/* | |||
* Table below incarnates a discrete Gaussian distribution: | |||
* D(x) = exp(-(x^2)/(2*sigma^2)) | |||
@@ -1,16 +1,16 @@ | |||
#include "api.h" | |||
#include "inner.h" | |||
#include "randombytes.h" | |||
#include <stddef.h> | |||
#include <string.h> | |||
/* | |||
* Wrapper for implementing the PQClean API. | |||
*/ | |||
#include <stddef.h> | |||
#include <string.h> | |||
#include "api.h" | |||
#include "inner.h" | |||
#define NONCELEN 40 | |||
#include "randombytes.h" | |||
#define SEEDLEN 48 | |||
/* | |||
* Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024) | |||
@@ -41,8 +41,7 @@ | |||
/* see api.h */ | |||
int | |||
PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair( | |||
uint8_t *pk, uint8_t *sk) { | |||
PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { | |||
union { | |||
uint8_t b[FALCON_KEYGEN_TEMP_9]; | |||
uint64_t dummy_u64; | |||
@@ -50,7 +49,7 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair( | |||
} tmp; | |||
int8_t f[512], g[512], F[512]; | |||
uint16_t h[512]; | |||
unsigned char seed[48]; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context rng; | |||
size_t u, v; | |||
@@ -135,7 +134,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, | |||
int16_t sig[512]; | |||
uint16_t hm[512]; | |||
} r; | |||
unsigned char seed[48]; | |||
unsigned char seed[SEEDLEN]; | |||
inner_shake256_context sc; | |||
size_t u, v; | |||
@@ -279,11 +278,11 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_signature( | |||
const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
/* | |||
* The PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES constant is used for | |||
* the signed message object (as produced by crypto_sign()) | |||
* the signed message object (as produced by PQCLEAN_FALCON512_CLEAN_crypto_sign()) | |||
* and includes a two-byte length value, so we take care here | |||
* to only generate signatures that are two bytes shorter than | |||
* the maximum. This is done to ensure that crypto_sign() | |||
* and crypto_sign_signature() produce the exact same signature | |||
* the maximum. This is done to ensure that PQCLEAN_FALCON512_CLEAN_crypto_sign() | |||
* and PQCLEAN_FALCON512_CLEAN_crypto_sign_signature() produce the exact same signature | |||
* value, if used on the same message, with the same private key, | |||
* and using the same output from randombytes() (this is for | |||
* reproducibility of tests). | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
#include <assert.h> | |||
/* | |||
* PRNG and interface to the system RNG. | |||
* | |||
@@ -29,10 +31,22 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include <assert.h> | |||
#include "inner.h" | |||
/* | |||
* Include relevant system header files. For Win32, this will also need | |||
* linking with advapi32.dll, which we trigger with an appropriate #pragma. | |||
*/ | |||
/*
 * Seed retrieval stub (contract documented in inner.h).
 *
 * This implementation never writes to the seed buffer: it returns 1
 * only for a zero-length request and 0 for any other length.
 */
int
PQCLEAN_FALCON512_CLEAN_get_seed(void *seed, size_t len) {
    (void)seed; /* the buffer is intentionally left untouched */
    return len == 0;
}
/* see inner.h */ | |||
void | |||
@@ -46,9 +60,6 @@ PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src) { | |||
uint64_t th, tl; | |||
int i; | |||
uint32_t *d32 = (uint32_t *) p->state.d; | |||
uint64_t *d64 = (uint64_t *) p->state.d; | |||
inner_shake256_extract(src, tmp, 56); | |||
for (i = 0; i < 14; i ++) { | |||
uint32_t w; | |||
@@ -57,11 +68,11 @@ PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src) { | |||
| ((uint32_t)tmp[(i << 2) + 1] << 8) | |||
| ((uint32_t)tmp[(i << 2) + 2] << 16) | |||
| ((uint32_t)tmp[(i << 2) + 3] << 24); | |||
d32[i] = w; | |||
*(uint32_t *)(p->state.d + (i << 2)) = w; | |||
} | |||
tl = d32[48 / sizeof(uint32_t)]; | |||
th = d32[52 / sizeof(uint32_t)]; | |||
d64[48 / sizeof(uint64_t)] = tl + (th << 32); | |||
tl = *(uint32_t *)(p->state.d + 48); | |||
th = *(uint32_t *)(p->state.d + 52); | |||
*(uint64_t *)(p->state.d + 48) = tl + (th << 32); | |||
PQCLEAN_FALCON512_CLEAN_prng_refill(p); | |||
} | |||
@@ -88,14 +99,12 @@ PQCLEAN_FALCON512_CLEAN_prng_refill(prng *p) { | |||
uint64_t cc; | |||
size_t u; | |||
uint32_t *d32 = (uint32_t *) p->state.d; | |||
uint64_t *d64 = (uint64_t *) p->state.d; | |||
/* | |||
* State uses local endianness. Only the output bytes must be | |||
* converted to little endian (if used on a big-endian machine). | |||
*/ | |||
cc = d64[48 / sizeof(uint64_t)]; | |||
cc = *(uint64_t *)(p->state.d + 48); | |||
for (u = 0; u < 8; u ++) { | |||
uint32_t state[16]; | |||
size_t v; | |||
@@ -139,10 +148,12 @@ PQCLEAN_FALCON512_CLEAN_prng_refill(prng *p) { | |||
state[v] += CW[v]; | |||
} | |||
for (v = 4; v < 14; v ++) { | |||
state[v] += d32[v - 4]; | |||
state[v] += ((uint32_t *)p->state.d)[v - 4]; | |||
} | |||
state[14] += d32[10] ^ (uint32_t)cc; | |||
state[15] += d32[11] ^ (uint32_t)(cc >> 32); | |||
state[14] += ((uint32_t *)p->state.d)[10] | |||
^ (uint32_t)cc; | |||
state[15] += ((uint32_t *)p->state.d)[11] | |||
^ (uint32_t)(cc >> 32); | |||
cc ++; | |||
/* | |||
@@ -160,7 +171,7 @@ PQCLEAN_FALCON512_CLEAN_prng_refill(prng *p) { | |||
(uint8_t)(state[v] >> 24); | |||
} | |||
} | |||
d64[48 / sizeof(uint64_t)] = cc; | |||
*(uint64_t *)(p->state.d + 48) = cc; | |||
p->ptr = 0; | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon signature generation. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* =================================================================== */ | |||
@@ -1081,8 +1082,8 @@ BerExp(prng *p, fpr x, fpr ccs) { | |||
int | |||
PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) { | |||
sampler_context *spc; | |||
int s; | |||
fpr r, dss, ccs; | |||
int s, z0, z, b; | |||
fpr r, dss, ccs, x; | |||
spc = ctx; | |||
@@ -1107,9 +1108,6 @@ PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) { | |||
* We now need to sample on center r. | |||
*/ | |||
for (;;) { | |||
int z0, z, b; | |||
fpr x; | |||
/* | |||
* Sample z for a Gaussian distribution. Then get a | |||
* random bit b to turn the sampling into a bimodal | |||
@@ -1,3 +1,5 @@ | |||
#include "inner.h" | |||
/* | |||
* Falcon signature verification. | |||
* | |||
@@ -29,7 +31,6 @@ | |||
* @author Thomas Pornin <thomas.pornin@nccgroup.com> | |||
*/ | |||
#include "inner.h" | |||
/* ===================================================================== */ | |||
/* | |||
@@ -0,0 +1,33 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon-512 | |||
implementation: clean | |||
files: | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-512 | |||
implementation: avx2 | |||
files: | |||
- fpr.h | |||
- inner.h | |||
- codec.c | |||
- common.c | |||
- fft.c | |||
- fpr.c | |||
- keygen.c | |||
- rng.c | |||
- sign.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-1024 | |||
implementation: clean | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c |
@@ -0,0 +1,32 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon-512 | |||
implementation: clean | |||
files: | |||
- fpr.h | |||
- codec.c | |||
- common.c | |||
- fft.c | |||
- fpr.c | |||
- keygen.c | |||
- rng.c | |||
- sign.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-512 | |||
implementation: avx2 | |||
files: | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-1024 | |||
implementation: avx2 | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c |
@@ -0,0 +1,33 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon-512 | |||
implementation: clean | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-1024 | |||
implementation: clean | |||
files: | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-1024 | |||
implementation: avx2 | |||
files: | |||
- fpr.h | |||
- inner.h | |||
- codec.c | |||
- common.c | |||
- fft.c | |||
- fpr.c | |||
- keygen.c | |||
- rng.c | |||
- sign.c | |||
- vrfy.c |
@@ -0,0 +1,32 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon-512 | |||
implementation: avx2 | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-1024 | |||
implementation: clean | |||
files: | |||
- fpr.h | |||
- codec.c | |||
- common.c | |||
- fft.c | |||
- fpr.c | |||
- keygen.c | |||
- rng.c | |||
- sign.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon-1024 | |||
implementation: avx2 | |||
files: | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- vrfy.c |
@@ -0,0 +1,11 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon1024 | |||
implementation: clean | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c |
@@ -0,0 +1,11 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon1024 | |||
implementation: avx2 | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c |
@@ -0,0 +1,33 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon512 | |||
implementation: clean | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon1024 | |||
implementation: clean | |||
files: | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon1024 | |||
implementation: avx2 | |||
files: | |||
- fpr.h | |||
- inner.h | |||
- codec.c | |||
- common.c | |||
- fft.c | |||
- fpr.c | |||
- keygen.c | |||
- rng.c | |||
- sign.c | |||
- vrfy.c |
@@ -0,0 +1,32 @@ | |||
consistency_checks: | |||
- source: | |||
scheme: falcon512 | |||
implementation: avx2 | |||
files: | |||
- api.h | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- pqclean.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon1024 | |||
implementation: clean | |||
files: | |||
- fpr.h | |||
- codec.c | |||
- common.c | |||
- fft.c | |||
- fpr.c | |||
- keygen.c | |||
- rng.c | |||
- sign.c | |||
- vrfy.c | |||
- source: | |||
scheme: falcon1024 | |||
implementation: avx2 | |||
files: | |||
- codec.c | |||
- common.c | |||
- keygen.c | |||
- vrfy.c |