@@ -0,0 +1,31 @@ | |||
name: Dilithium2 | |||
type: signature | |||
claimed-nist-level: 2 | |||
length-public-key: 1312 | |||
length-secret-key: 2544 | |||
length-signature: 2420 | |||
nistkat-sha256: 9c636528bf81c03df6ad8f9471cb1b4d9097d66af825d4f60b7ff0d941ca4d37 | |||
testvectors-sha256: 166fc2481358d5a1b7a528b30af36ad069b049b5755cf63b843ce0f25f35aeb6 | |||
principal-submitters: | |||
- Vadim Lyubashevsky | |||
auxiliary-submitters: | |||
- Léo Ducas | |||
- Eike Kiltz | |||
- Tancrède Lepoint | |||
- Peter Schwabe | |||
- Gregor Seiler | |||
- Damien Stehlé | |||
implementations: | |||
- name: clean | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
- name: avx2 | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- aes | |||
- avx2 | |||
- popcnt |
@@ -0,0 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -0,0 +1,19 @@ | |||
/* Helper macros that declare an anonymous union overlaying a scalar array
 * ("coeffs") with an array of __m256i vectors ("vec").  Because a union takes
 * the strictest alignment of its members, the scalar view is aligned like
 * __m256i (32 bytes with the usual <immintrin.h> definition).
 * Each macro expands to a type specifier, e.g. `typedef ALIGNED_INT32(624) t;`. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGN_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_ALIGN_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/* N bytes viewed as uint8_t, backed by ceil(N/32) vectors (32 bytes each).
 * When N is not a multiple of 32 the union is larger than N bytes. */
#define ALIGNED_UINT8(N) \ | |||
union { \ | |||
uint8_t coeffs[N]; \ | |||
__m256i vec[((N)+31)/32]; \ | |||
} | |||
/* N 32-bit integers viewed as int32_t, backed by ceil(N/8) vectors
 * (8 lanes of 32 bits each). */
#define ALIGNED_INT32(N) \ | |||
union { \ | |||
int32_t coeffs[N]; \ | |||
__m256i vec[((N)+7)/8]; \ | |||
} | |||
#endif |
@@ -0,0 +1,31 @@ | |||
/* Public interface of the AVX2 build of the Dilithium2 signature scheme.
 * Only declarations live here; the implementations are in other files.
 * NOTE(review): the return-value convention is not visible from this header —
 * presumably 0 on success and non-zero on failure; confirm against the
 * implementation before relying on it. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_API_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_API_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* Sizes in bytes of the public key, the secret key and a signature. */
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1312 | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2544 | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2420 | |||
/* Human-readable algorithm name. */
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2" | |||
/* Key generation.  pk and sk are output buffers — presumably of
 * CRYPTO_PUBLICKEYBYTES and CRYPTO_SECRETKEYBYTES bytes; confirm. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); | |||
/* Detached signature over the mlen-byte message m using secret key sk.
 * sig and siglen are outputs — presumably sig needs room for CRYPTO_BYTES
 * bytes and *siglen receives the number written; confirm. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* Verification of a detached signature (sig, siglen) over message m under
 * public key pk.  All pointer arguments are read-only. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk); | |||
/* Combined "signed message" form: sm and smlen are outputs, m/mlen/sk inputs.
 * NOTE(review): presumably sm holds the signature together with the message
 * and so needs mlen + CRYPTO_BYTES bytes — confirm. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* Counterpart of crypto_sign: takes a signed message sm/smlen and public key
 * pk, with m and mlen as outputs (presumably the recovered message; confirm). */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk); | |||
#endif |
@@ -0,0 +1,24 @@ | |||
/* Definitions shared between the C and assembly sources: element offsets into
 * the constant table and helpers for spelling global symbol names. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_H | |||
/* Offsets of the sub-tables inside PQCLEAN_DILITHIUM2_AVX2_qdata, counted in
 * 32-bit elements (not bytes).  They mirror the marker comments in the table's
 * initialiser: four 8-entry rows of replicated constants, then two zeta tables
 * of 296 entries each (328 - 32 = 296). */
#define _8XQ 0 | |||
#define _8XQINV 8 | |||
#define _8XDIV_QINV 16 | |||
#define _8XDIV 24 | |||
#define _ZETAS_QINV 32 | |||
#define _ZETAS 328 | |||
/* The C ABI on MacOS exports all symbols with a leading | |||
* underscore. This means that any symbols we refer to from | |||
* C files (functions) can't be found, and all symbols we | |||
* refer to from ASM also can't be found (nttconsts.c). | |||
* | |||
* This define helps us get around this | |||
*/ | |||
/* NOTE(review): the comment above names nttconsts.c; that file name may be
 * stale — confirm which source actually defines the table. */
/* _cdecl(s) token-pastes a leading underscore onto s; cdecl(s) yields s
 * unchanged.  Assembly sources emit a label under both spellings so the symbol
 * resolves with either naming convention. */
#define _cdecl(s) _##s | |||
#define cdecl(s) s | |||
#endif |
@@ -0,0 +1,101 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
/* Scalar constants that are replicated into the first rows of the qdata table.
 * The modulus Q itself comes from params.h.  DIV_QINV is DIV * QINV reduced
 * modulo 2^32 and written as a signed 32-bit value.
 * NOTE(review): MONT is not referenced in the visible part of this file. */
#define QINV 58728449 // q^(-1) mod 2^32 | |||
#define MONT (-4186625) // 2^32 mod q | |||
#define DIV 41978 // mont^2/256 | |||
#define DIV_QINV (-8395782) | |||
/* Constant table of 624 int32_t entries (the size given to ALIGNED_INT32 in
 * consts.h).  Layout, in 32-bit elements; the "//#define" markers below repeat
 * the offset macros from cdecl.h:
 *   [  0,   8)  Q        replicated 8 times
 *   [  8,  16)  QINV     replicated 8 times
 *   [ 16,  24)  DIV_QINV replicated 8 times
 *   [ 24,  32)  DIV      replicated 8 times
 *   [ 32, 328)  296 zeta values pre-multiplied by QINV modulo 2^32
 *   [328, 624)  296 zeta values, in the same order as the block above
 * The outer braces initialise the union, the inner braces its first member
 * (the int32_t coeffs array).
 * NOTE(review): the zeta*QINV relation was spot-checked on the first two
 * entries only; the repetition pattern of the early rows (x4, x2, x1) is
 * presumably dictated by the vectorised NTT — confirm against that code. */
const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata = {{ | |||
//#define _8XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, | |||
//#define _8XQINV 8 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
//#define _8XDIV_QINV 16 | |||
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, | |||
//#define _8XDIV 24 | |||
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, | |||
//#define _ZETAS_QINV 32 | |||
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, | |||
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, | |||
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, | |||
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, | |||
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, | |||
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904, | |||
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, | |||
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, | |||
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, | |||
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, | |||
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, | |||
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, | |||
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, | |||
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, | |||
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, | |||
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, | |||
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, | |||
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, | |||
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, | |||
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, | |||
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, | |||
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, | |||
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, | |||
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, | |||
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, | |||
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722, | |||
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, | |||
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, | |||
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, | |||
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, | |||
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, | |||
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, | |||
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, | |||
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, | |||
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, | |||
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, | |||
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, | |||
//#define _ZETAS 328 | |||
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, | |||
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, | |||
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, | |||
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, | |||
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, | |||
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, | |||
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, | |||
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, | |||
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, | |||
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, | |||
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, | |||
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, | |||
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, | |||
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, | |||
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, | |||
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, | |||
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, | |||
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, | |||
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, | |||
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, | |||
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, | |||
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, | |||
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, | |||
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, | |||
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, | |||
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, | |||
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, | |||
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, | |||
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, | |||
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, | |||
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, | |||
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, | |||
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, | |||
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, | |||
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, | |||
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, | |||
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, | |||
} | |||
}; |
@@ -0,0 +1,10 @@ | |||
/* Declares the type and the external symbol of the shared constant table. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_CONSTS_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_CONSTS_H | |||
#include "align.h" | |||
#include "cdecl.h" | |||
/* Union of 624 int32_t coefficients overlaid with __m256i vectors (see
 * ALIGNED_INT32 in align.h).  624 = 4 rows of 8 replicated constants plus two
 * tables of 296 entries, matching the offsets defined in cdecl.h. */
typedef ALIGNED_INT32(624) qdata_t; | |||
/* The table itself; declared here, defined once in a C source file. */
extern const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata; | |||
#endif |
@@ -0,0 +1,909 @@ | |||
/* Taken from Bas Westerbaan's new 4-way SHAKE implementation | |||
* for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/), | |||
* but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */ | |||
#include "cdecl.h" | |||
/* Byte-shuffle control masks for vpshufb, aligned to 32 bytes (.p2align 5) so
 * that rho8 can be loaded with the alignment-requiring vmovdqa.
 * vpshufb picks, for every destination byte, the source byte whose index is
 * stored in the mask; the same 16-byte pattern is given for both 128-bit
 * halves of the register.
 *   rho8 : byte i of each 64-bit lane comes from byte (i-1) mod 8, i.e. the
 *          lane is rotated left by 8 bits.  It stands in for the commented-out
 *          vpsllq $8 / vpsrlq $56 / vpor sequence in the round code.
 *   rho56: byte i comes from byte (i+1) mod 8, i.e. the lane is rotated left
 *          by 56 bits, standing in for vpsllq $56 / vpsrlq $8 / vpor. */
.data | |||
.p2align 5 | |||
rho8: | |||
.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14 | |||
rho56: | |||
.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8 | |||
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4): | |||
vmovdqa rho8(%rip), %ymm0 | |||
movq $6, %rax | |||
looptop: | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 192(%rdi), %ymm4, %ymm9 | |||
vpxor 384(%rdi), %ymm3, %ymm10 | |||
vpxor 576(%rdi), %ymm2, %ymm11 | |||
vpxor 768(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 0(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 96(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 320(%rdi), %ymm5, %ymm10 | |||
vpxor 512(%rdi), %ymm4, %ymm11 | |||
vpxor 704(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 32(%rdi), %ymm4, %ymm8 | |||
vpxor 224(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 608(%rdi), %ymm1, %ymm11 | |||
vpxor 640(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 128(%rdi), %ymm1, %ymm8 | |||
vpxor 160(%rdi), %ymm5, %ymm9 | |||
vpxor 352(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 736(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 64(%rdi), %ymm3, %ymm8 | |||
vpxor 256(%rdi), %ymm2, %ymm9 | |||
vpxor 448(%rdi), %ymm1, %ymm10 | |||
vpxor 480(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 512(%rdi), %ymm4, %ymm9 | |||
vpxor 224(%rdi), %ymm3, %ymm10 | |||
vpxor 736(%rdi), %ymm2, %ymm11 | |||
vpxor 448(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 8(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 576(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 640(%rdi), %ymm5, %ymm10 | |||
vpxor 352(%rdi), %ymm4, %ymm11 | |||
vpxor 64(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 192(%rdi), %ymm4, %ymm8 | |||
vpxor 704(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 128(%rdi), %ymm1, %ymm11 | |||
vpxor 480(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 768(%rdi), %ymm1, %ymm8 | |||
vpxor 320(%rdi), %ymm5, %ymm9 | |||
vpxor 32(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 256(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 384(%rdi), %ymm3, %ymm8 | |||
vpxor 96(%rdi), %ymm2, %ymm9 | |||
vpxor 608(%rdi), %ymm1, %ymm10 | |||
vpxor 160(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 352(%rdi), %ymm4, %ymm9 | |||
vpxor 704(%rdi), %ymm3, %ymm10 | |||
vpxor 256(%rdi), %ymm2, %ymm11 | |||
vpxor 608(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 16(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 736(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 480(%rdi), %ymm5, %ymm10 | |||
vpxor 32(%rdi), %ymm4, %ymm11 | |||
vpxor 384(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 512(%rdi), %ymm4, %ymm8 | |||
vpxor 64(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 768(%rdi), %ymm1, %ymm11 | |||
vpxor 160(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 448(%rdi), %ymm1, %ymm8 | |||
vpxor 640(%rdi), %ymm5, %ymm9 | |||
vpxor 192(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 96(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 224(%rdi), %ymm3, %ymm8 | |||
vpxor 576(%rdi), %ymm2, %ymm9 | |||
vpxor 128(%rdi), %ymm1, %ymm10 | |||
vpxor 320(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 32(%rdi), %ymm4, %ymm9 | |||
vpxor 64(%rdi), %ymm3, %ymm10 | |||
vpxor 96(%rdi), %ymm2, %ymm11 | |||
vpxor 128(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 24(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 256(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 160(%rdi), %ymm5, %ymm10 | |||
vpxor 192(%rdi), %ymm4, %ymm11 | |||
vpxor 224(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 352(%rdi), %ymm4, %ymm8 | |||
vpxor 384(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 448(%rdi), %ymm1, %ymm11 | |||
vpxor 320(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 608(%rdi), %ymm1, %ymm8 | |||
vpxor 480(%rdi), %ymm5, %ymm9 | |||
vpxor 512(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 576(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 704(%rdi), %ymm3, %ymm8 | |||
vpxor 736(%rdi), %ymm2, %ymm9 | |||
vpxor 768(%rdi), %ymm1, %ymm10 | |||
vpxor 640(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
addq $32, %rsi | |||
subq $1, %rax | |||
jnz looptop | |||
ret |
@@ -0,0 +1,219 @@ | |||
#include "fips202.h" | |||
#include "fips202x4.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
#define NROUNDS 24

/*
 * Keccak round constants, one 64-bit word per round, in round order.
 * The table is handed to PQCLEAN_DILITHIUM2_AVX2_f1600x4 by the absorb
 * and squeeze routines in this file.
 */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
    UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
    UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000),
    UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
    UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088),
    UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a),
    UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b),
    UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
    UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
    UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
    UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};
static void keccakx4_absorb_once(__m256i s[25], | |||
unsigned int r, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen, | |||
uint8_t p) { | |||
size_t i; | |||
uint64_t pos = 0; | |||
__m256i t, idx; | |||
for (i = 0; i < 25; ++i) { | |||
s[i] = _mm256_setzero_si256(); | |||
} | |||
idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); | |||
while (inlen >= r) { | |||
for (i = 0; i < r / 8; ++i) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
inlen -= r; | |||
PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); | |||
} | |||
for (i = 0; i < inlen / 8; ++i) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
inlen -= 8 * i; | |||
if (inlen) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); | |||
t = _mm256_and_si256(t, idx); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
} | |||
t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
t = _mm256_set1_epi64x((long long)(1ULL << 63)); | |||
s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); | |||
} | |||
static void keccakx4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
unsigned int r, | |||
__m256i s[25]) { | |||
unsigned int i; | |||
__m128d t; | |||
while (nblocks > 0) { | |||
PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); | |||
for (i = 0; i < r / 8; ++i) { | |||
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | |||
_mm_storel_pd((double *)&out0[8 * i], t); | |||
_mm_storeh_pd((double *)&out1[8 * i], t); | |||
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | |||
_mm_storel_pd((double *)&out2[8 * i], t); | |||
_mm_storeh_pd((double *)&out3[8 * i], t); | |||
} | |||
out0 += r; | |||
out1 += r; | |||
out2 += r; | |||
out3 += r; | |||
--nblocks; | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once
*
* Description: Initialise a 4-way state and absorb four messages of equal
*              length at the SHAKE128 rate, including the padding.
*
* Arguments:   - keccakx4_state *state: output state (overwritten)
*              - const uint8_t *in0..in3: the four input messages
*              - size_t inlen: length in bytes of each message
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    /* Padding byte handed to the generic absorb routine. */
    const uint8_t pad = 0x1F;
    __m256i *lanes = state->s;

    keccakx4_absorb_once(lanes, SHAKE128_RATE, in0, in1, in2, in3, inlen, pad);
}
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once
*
* Description: Initialise a 4-way state and absorb four messages of equal
*              length at the SHAKE256 rate, including the padding.
*
* Arguments:   - keccakx4_state *state: output state (overwritten)
*              - const uint8_t *in0..in3: the four input messages
*              - size_t inlen: length in bytes of each message
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    /* Padding byte handed to the generic absorb routine. */
    const uint8_t pad = 0x1F;
    __m256i *lanes = state->s;

    keccakx4_absorb_once(lanes, SHAKE256_RATE, in0, in1, in2, in3, inlen, pad);
}
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
unsigned int i; | |||
size_t nblocks = outlen / SHAKE128_RATE; | |||
uint8_t t[4][SHAKE128_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE128_RATE; | |||
out1 += nblocks * SHAKE128_RATE; | |||
out2 += nblocks * SHAKE128_RATE; | |||
out3 += nblocks * SHAKE128_RATE; | |||
outlen -= nblocks * SHAKE128_RATE; | |||
if (outlen) { | |||
PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
unsigned int i; | |||
size_t nblocks = outlen / SHAKE256_RATE; | |||
uint8_t t[4][SHAKE256_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE256_RATE; | |||
out1 += nblocks * SHAKE256_RATE; | |||
out2 += nblocks * SHAKE256_RATE; | |||
out3 += nblocks * SHAKE256_RATE; | |||
outlen -= nblocks * SHAKE256_RATE; | |||
if (outlen) { | |||
PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
} |
@@ -0,0 +1,64 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/*
 * State for four Keccak instances processed in parallel: 25 vector words,
 * where each 64-bit lane of a word belongs to one of the four instances.
 */
typedef struct { | |||
__m256i s[25]; | |||
} keccakx4_state; | |||
/*
 * 4-way permutation on the interleaved state s; rc points to the table of
 * 64-bit round constants. NOTE(review): implemented outside the C sources
 * (assembly) -- presumably Keccak-f[1600]; confirm against that file.
 */
void PQCLEAN_DILITHIUM2_AVX2_f1600x4(__m256i *s, const uint64_t *rc); | |||
/*
 * Initialise *state and absorb four messages of inlen bytes each at the
 * SHAKE128 rate, including padding.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
/*
 * Squeeze nblocks blocks of SHAKE128_RATE bytes into each of out0..out3,
 * updating *state.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state); | |||
/*
 * Initialise *state and absorb four messages of inlen bytes each at the
 * SHAKE256 rate, including padding.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
/*
 * Squeeze nblocks blocks of SHAKE256_RATE bytes into each of out0..out3,
 * updating *state.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state); | |||
/*
 * One-shot 4-way SHAKE128: writes outlen bytes to each of out0..out3 from
 * the four inlen-byte inputs in0..in3.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
/*
 * One-shot 4-way SHAKE256: writes outlen bytes to each of out0..out3 from
 * the four inlen-byte inputs in0..in3.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
#endif |
@@ -0,0 +1,240 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
/*
 * butterfly l,h[,zl0,zl1,zh0,zh1] -- inverse-transform butterfly on two
 * vectors of eight 32-bit coefficients held in ymm(l) and ymm(h).
 *
 *   l <- l + h
 *   h <- hi32((h - l) * zh) - hi32(lo32((h - l) * zl) * ymm0)
 *
 * (h - l) is formed from the incoming values, before l is updated.
 * zl0/zh0 are applied to the even dwords, zl1/zh1 to the odd dwords; they
 * default to ymm1 / ymm2. ymm0 must hold the vector loaded from _8XQ.
 * NOTE(review): with zl = zh * q^-1 mod 2^32, as the _ZETAS_QINV name
 * suggests, this is a Montgomery multiplication by zh -- confirm against
 * the constant table.
 * Clobbers ymm12, ymm13, ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
/* ymm12 = h - l (old l), then l = l + h */
vpsubd %ymm\l,%ymm\h,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
/* 64-bit products of the even dwords with zl0; odd dwords are first moved
   down into the even positions and multiplied with zl1 */
vpmuldq %ymm\zl0,%ymm12,%ymm13 | |||
vmovshdup %ymm12,%ymm\h | |||
vpmuldq %ymm\zl1,%ymm\h,%ymm14 | |||
/* 64-bit products with the zh factors (even dwords in ymm12, odd in h) */
vpmuldq %ymm\zh0,%ymm12,%ymm12 | |||
vpmuldq %ymm\zh1,%ymm\h,%ymm\h | |||
/* multiply the low dwords of the zl products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
/* subtract; only the high dword of each 64-bit lane is kept below */
vpsubd %ymm13,%ymm12,%ymm12 | |||
vpsubd %ymm14,%ymm\h,%ymm\h | |||
/* recombine: even results come from ymm12 (high dwords moved down),
   odd results are already in the odd dwords of h */
vmovshdup %ymm12,%ymm12 | |||
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h | |||
.endm | |||
/*
 * levels0t5 off -- run butterfly levels 0..5 on the 256-byte chunk of the
 * coefficient array starting at byte offset 256*off from rdi (eight ymm
 * vectors, 64 dwords). Twiddle vectors are read from the table at rsi using
 * the _ZETAS / _ZETAS_QINV dword offsets; ymm0 must already hold the _8XQ
 * vector. Results are written back to the same 256 bytes.
 * Uses ymm1-ymm15 as scratch (including the registers clobbered by the
 * butterfly macro).
 */
.macro levels0t5 off | |||
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
/* level 0 */ | |||
/* Each twiddle vector is loaded with its four qwords in reversed order
   (vpermq 0x1B); vmovshdup then provides a copy with the odd dwords moved
   into the even positions, so the butterfly gets separate factors for even
   and odd coefficients. A fresh pair is loaded per butterfly at this level. */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,5,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 6,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,9,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 10,11,1,3,2,15 | |||
/* level 1 */ | |||
/* one twiddle pair is shared by two butterflies */
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,6,1,3,2,15 | |||
butterfly 5,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,10,1,3,2,15 | |||
butterfly 9,11,1,3,2,15 | |||
/* level 2 */ | |||
/* one twiddle pair is shared by all four butterflies */
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,8,1,3,2,15 | |||
butterfly 5,9,1,3,2,15 | |||
butterfly 6,10,1,3,2,15 | |||
butterfly 7,11,1,3,2,15 | |||
/* level 3 */ | |||
/* shuffle2/shuffle4/shuffle8 come from shuffle.inc (not shown here);
   NOTE(review): presumably they regroup coefficients between register
   pairs so that the following butterflies work on whole registers with the
   default ymm1/ymm2 factors -- confirm in shuffle.inc. */
shuffle2 4,5,3,5 | |||
shuffle2 6,7,4,7 | |||
shuffle2 8,9,6,9 | |||
shuffle2 10,11,8,11 | |||
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 3,5 | |||
butterfly 4,7 | |||
butterfly 6,9 | |||
butterfly 8,11 | |||
/* level 4 */ | |||
shuffle4 3,4,10,4 | |||
shuffle4 6,8,3,8 | |||
shuffle4 5,7,6,7 | |||
shuffle4 9,11,5,11 | |||
vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 10,4 | |||
butterfly 3,8 | |||
butterfly 6,7 | |||
butterfly 5,11 | |||
/* level 5 */ | |||
shuffle8 10,3,9,3 | |||
shuffle8 6,5,10,5 | |||
shuffle8 4,8,6,8 | |||
shuffle8 7,11,4,11 | |||
/* a single twiddle per chunk, broadcast to all eight dwords */
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 | |||
butterfly 9,3 | |||
butterfly 10,5 | |||
butterfly 6,8 | |||
butterfly 4,11 | |||
/* write back; the register order reflects the renaming done by the
   shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm10,256*\off+ 32(%rdi) | |||
vmovdqa %ymm6,256*\off+ 64(%rdi) | |||
vmovdqa %ymm4,256*\off+ 96(%rdi) | |||
vmovdqa %ymm3,256*\off+128(%rdi) | |||
vmovdqa %ymm5,256*\off+160(%rdi) | |||
vmovdqa %ymm8,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
/*
 * levels6t7 off -- run butterfly levels 6 and 7 on eight vectors taken at a
 * 128-byte stride starting at byte offset 32*off from rdi, then multiply
 * the four lower-half vectors (ymm4-ymm7) by the constant pair
 * _8XDIV / _8XDIV_QINV using the same multiply-and-reduce sequence as the
 * butterfly. ymm0 must already hold the _8XQ vector.
 * NOTE(review): _8XDIV is presumably the final scaling factor of the
 * inverse transform; why only the lower half needs it is not visible in
 * this file -- confirm against the constant table.
 */
.macro levels6t7 off | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
/* level 6 */ | |||
/* twiddle index 3 for the lower four vectors, index 2 for the upper four */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
/* level 7 */ | |||
/* twiddle index 0 for all four butterflies */
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* the upper four vectors are final; store them now so that ymm8/ymm9 can
   be reused as temporaries below */
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
/* scale ymm4 and ymm5 by the _8XDIV constant (ymm1 = _8XDIV_QINV,
   ymm2 = _8XDIV): even and odd dwords are multiplied separately and the
   high dwords of the reduced products are recombined */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 | |||
vmovdqa (_8XDIV)*4(%rsi),%ymm2 | |||
vpmuldq %ymm1,%ymm4,%ymm12 | |||
vpmuldq %ymm1,%ymm5,%ymm13 | |||
vmovshdup %ymm4,%ymm8 | |||
vmovshdup %ymm5,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm4,%ymm4 | |||
vpmuldq %ymm2,%ymm5,%ymm5 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm4,%ymm4 | |||
vpsubd %ymm13,%ymm5,%ymm5 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm4,%ymm4 | |||
vmovshdup %ymm5,%ymm5 | |||
vpblendd $0xAA,%ymm8,%ymm4,%ymm4 | |||
vpblendd $0xAA,%ymm9,%ymm5,%ymm5 | |||
/* same scaling for ymm6 and ymm7 */
vpmuldq %ymm1,%ymm6,%ymm12 | |||
vpmuldq %ymm1,%ymm7,%ymm13 | |||
vmovshdup %ymm6,%ymm8 | |||
vmovshdup %ymm7,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm6,%ymm6 | |||
vpmuldq %ymm2,%ymm7,%ymm7 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm6,%ymm6 | |||
vpsubd %ymm13,%ymm7,%ymm7 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm6,%ymm6 | |||
vmovshdup %ymm7,%ymm7 | |||
vpblendd $0xAA,%ymm8,%ymm6,%ymm6 | |||
vpblendd $0xAA,%ymm9,%ymm7,%ymm7 | |||
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
.endm | |||
/*
 * PQCLEAN_DILITHIUM2_AVX2_invntt_avx
 *
 * In-place transform of the 1024-byte coefficient array at rdi (accessed
 * with aligned vector loads/stores), using the constant table at rsi.
 * Levels 0-5 are applied to each 256-byte quarter of the array, then levels
 * 6-7 (plus the final scaling) to each 32-byte column across the eight
 * 128-byte rows. Both the plain and the underscore-prefixed symbol are
 * exported.
 */
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx): | |||
/* ymm0 = _8XQ vector, kept live for every butterfly below */
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
levels0t5 0 | |||
levels0t5 1 | |||
levels0t5 2 | |||
levels0t5 3 | |||
levels6t7 0 | |||
levels6t7 1 | |||
levels6t7 2 | |||
levels6t7 3 | |||
ret |
@@ -0,0 +1,199 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
/*
 * butterfly l,h[,zl0,zl1,zh0,zh1] -- forward-transform butterfly on two
 * vectors of eight 32-bit coefficients held in ymm(l) and ymm(h).
 *
 *   t  = hi32(h * zh) - hi32(lo32(h * zl) * ymm0)
 *   l <- l + t
 *   h <- l - t        (using the incoming l)
 *
 * zl0/zh0 are applied to the even dwords, zl1/zh1 to the odd dwords; they
 * default to ymm1 / ymm2. ymm0 must hold the vector loaded from _8XQ.
 * NOTE(review): with zl = zh * q^-1 mod 2^32, as the _ZETAS_QINV name
 * suggests, t is the Montgomery product of h and zh -- confirm against the
 * constant table.
 * Clobbers ymm12, ymm13, ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
/* 64-bit products with the zl factors: even dwords directly, odd dwords
   after moving them down into the even positions (ymm12) */
vpmuldq %ymm\zl0,%ymm\h,%ymm13 | |||
vmovshdup %ymm\h,%ymm12 | |||
vpmuldq %ymm\zl1,%ymm12,%ymm14 | |||
/* 64-bit products with the zh factors */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h | |||
vpmuldq %ymm\zh1,%ymm12,%ymm12 | |||
/* multiply the low dwords of the zl products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
/* h = high dwords of the zh products, all eight positions */
vmovshdup %ymm\h,%ymm\h | |||
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h | |||
/* ymm12 = l - h, l = l + h */
vpsubd %ymm\h,%ymm\l,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
/* ymm13 = high dwords of the ymm0 products, all eight positions */
vmovshdup %ymm13,%ymm13 | |||
vpblendd $0xAA,%ymm14,%ymm13,%ymm13 | |||
/* apply the correction term with opposite signs to the two outputs */
vpaddd %ymm13,%ymm12,%ymm\h | |||
vpsubd %ymm13,%ymm\l,%ymm\l | |||
.endm | |||
/*
 * levels0t1 off -- run butterfly levels 0 and 1 on eight vectors taken at a
 * 128-byte stride starting at byte offset 32*off from rdi, and write them
 * back in place. Twiddles are single dwords from the table at rsi,
 * broadcast to all lanes. ymm0 must already hold the _8XQ vector.
 */
.macro levels0t1 off | |||
/* level 0 */ | |||
/* one twiddle (index 1) for all four butterflies */
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* level 1 */ | |||
/* twiddle index 2 for the lower four vectors, index 3 for the upper four */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
.endm | |||
/*
 * levels2t7 off -- run butterfly levels 2..7 on the 256-byte chunk of the
 * coefficient array starting at byte offset 256*off from rdi (eight ymm
 * vectors, 64 dwords), writing the result back to the same 256 bytes.
 * Twiddles come from the table at rsi; ymm0 must already hold the _8XQ
 * vector. Uses ymm1-ymm15 as scratch (including the registers clobbered by
 * the butterfly macro).
 */
.macro levels2t7 off | |||
/* level 2 */ | |||
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
/* a single twiddle per chunk, broadcast to all eight dwords */
vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* shuffle8/shuffle4/shuffle2 come from shuffle.inc (not shown here);
   NOTE(review): presumably they regroup coefficients between register
   pairs so that the next level again pairs whole registers -- confirm in
   shuffle.inc. */
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
/* level 3 */ | |||
/* from here on the twiddles are full vectors (one factor per dword) */
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 | |||
butterfly 3,5 | |||
butterfly 8,10 | |||
butterfly 4,6 | |||
butterfly 9,11 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
/* level 4 */ | |||
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 | |||
butterfly 7,8 | |||
butterfly 5,6 | |||
butterfly 3,4 | |||
butterfly 10,11 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
/* level 5 */ | |||
/* Levels 5-7 pass separate factors for even and odd dwords: ymm10 and
   ymm15 hold the odd dwords of ymm1 / ymm2 moved into the even positions
   (vpsrlq by 32 and vmovshdup respectively). */
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,5,1,10,2,15 | |||
butterfly 8,4,1,10,2,15 | |||
butterfly 7,3,1,10,2,15 | |||
butterfly 6,11,1,10,2,15 | |||
/* level 6 */ | |||
/* two twiddle vectors, each shared by two butterflies */
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,7,1,10,2,15 | |||
butterfly 8,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,3,1,10,2,15 | |||
butterfly 4,11,1,10,2,15 | |||
/* level 7 */ | |||
/* a separate twiddle vector for each of the four butterflies */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,8,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 7,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,4,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 3,11,1,10,2,15 | |||
/* write back; the register order reflects the renaming done by the
   shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm8,256*\off+ 32(%rdi) | |||
vmovdqa %ymm7,256*\off+ 64(%rdi) | |||
vmovdqa %ymm6,256*\off+ 96(%rdi) | |||
vmovdqa %ymm5,256*\off+128(%rdi) | |||
vmovdqa %ymm4,256*\off+160(%rdi) | |||
vmovdqa %ymm3,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
/*
 * PQCLEAN_DILITHIUM2_AVX2_ntt_avx
 *
 * In-place transform of the 1024-byte coefficient array at rdi (accessed
 * with aligned vector loads/stores), using the constant table at rsi.
 * Levels 0-1 are applied to each 32-byte column across the eight 128-byte
 * rows, then levels 2-7 to each 256-byte quarter of the array. Both the
 * plain and the underscore-prefixed symbol are exported.
 */
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx): | |||
/* ymm0 = _8XQ vector, kept live for every butterfly below */
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
levels0t1 0 | |||
levels0t1 1 | |||
levels0t1 2 | |||
levels0t1 3 | |||
levels2t7 0 | |||
levels2t7 1 | |||
levels2t7 2 | |||
levels2t7 3 | |||
ret | |||
@@ -0,0 +1,14 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTT_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_NTT_H | |||
#include <immintrin.h> | |||
/*
 * Forward transform, in place, of the coefficient vector a; the second
 * argument points to the table of precomputed constants. Implemented in
 * assembly.
 */
void PQCLEAN_DILITHIUM2_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
/*
 * Inverse transform, in place, of the coefficient vector a; the second
 * argument points to the table of precomputed constants. Implemented in
 * assembly.
 */
void PQCLEAN_DILITHIUM2_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
/* NOTE(review): defined elsewhere; presumably reorders a in place between
   the transform's internal coefficient order and the standard one -- confirm. */
void PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx(__m256i *a); | |||
/* NOTE(review): defined elsewhere; presumably c = a * b coefficient-wise
   in the transform domain -- confirm. */
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
/* NOTE(review): defined elsewhere; presumably an accumulating variant of
   the pointwise product over vectors of polynomials -- confirm. */
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
#endif |
@@ -0,0 +1,261 @@ | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_pk | |||
* | |||
* Description: Bit-pack public key pk = (rho, t1). | |||
* | |||
* Arguments: - uint8_t pk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const polyveck *t1: pointer to vector t1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
pk[i] = rho[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_pk | |||
* | |||
* Description: Unpack public key pk = (rho, t1). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const polyveck *t1: pointer to output vector t1 | |||
* - uint8_t pk[]: byte array containing bit-packed pk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = pk[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sk | |||
* | |||
* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - uint8_t sk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const uint8_t tr[]: byte array containing tr | |||
* - const uint8_t key[]: byte array containing key | |||
* - const polyveck *t0: pointer to vector t0 | |||
* - const polyvecl *s1: pointer to vector s1 | |||
* - const polyveck *s2: pointer to vector s2 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = rho[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = key[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
sk[i] = tr[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sk | |||
* | |||
* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const uint8_t tr[]: output byte array for tr | |||
* - const uint8_t key[]: output byte array for key | |||
* - const polyveck *t0: pointer to output vector t0 | |||
* - const polyvecl *s1: pointer to output vector s1 | |||
* - const polyveck *s2: pointer to output vector s2 | |||
* - uint8_t sk[]: byte array containing bit-packed sk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
key[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
tr[i] = sk[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sig | |||
* | |||
* Description: Bit-pack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t sig[]: output byte array | |||
* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM2_AVX2_challenge hash length SEEDBYTES | |||
* - const polyvecl *z: pointer to vector z | |||
* - const polyveck *h: pointer to hint vector h | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES], | |||
const uint8_t c[SEEDBYTES], | |||
const polyvecl *z, | |||
const polyveck *h) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sig[i] = c[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Encode h */ | |||
for (i = 0; i < OMEGA + K; ++i) { | |||
sig[i] = 0; | |||
} | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
if (h->vec[i].coeffs[j] != 0) { | |||
sig[k++] = (uint8_t) j; | |||
} | |||
} | |||
sig[OMEGA + i] = (uint8_t) k; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sig | |||
* | |||
* Description: Unpack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM2_AVX2_challenge hash | |||
* - polyvecl *z: pointer to output vector z | |||
* - polyveck *h: pointer to output hint vector h | |||
* - const uint8_t sig[]: byte array containing | |||
* bit-packed signature | |||
* | |||
* Returns 1 in case of malformed signature; otherwise 0. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES], | |||
polyvecl *z, | |||
polyveck *h, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
c[i] = sig[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Decode h */ | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
h->vec[i].coeffs[j] = 0; | |||
} | |||
if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { | |||
return 1; | |||
} | |||
for (j = k; j < sig[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > k && sig[j] <= sig[j - 1]) { | |||
return 1; | |||
} | |||
h->vec[i].coeffs[sig[j]] = 1; | |||
} | |||
k = sig[OMEGA + i]; | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = k; j < OMEGA; ++j) { | |||
if (sig[j]) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,31 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM2_AVX2_PACKING_H

/*
 * Serialization of public keys, secret keys and signatures to and from
 * their byte-array representations.
 */

#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* Public key: pk = (rho, t1). */
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES],
                                     const uint8_t rho[SEEDBYTES],
                                     const polyveck *t1);

void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES],
                                       polyveck *t1,
                                       const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]);

/* Secret key: built from rho, tr, key, t0, s1 and s2. */
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
                                     const uint8_t rho[SEEDBYTES],
                                     const uint8_t tr[CRHBYTES],
                                     const uint8_t key[SEEDBYTES],
                                     const polyveck *t0,
                                     const polyvecl *s1,
                                     const polyveck *s2);

void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
                                       uint8_t tr[CRHBYTES],
                                       uint8_t key[SEEDBYTES],
                                       polyveck *t0,
                                       polyvecl *s1,
                                       polyveck *s2,
                                       const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]);

/* Signature: sig = (c, z, h). unpack_sig returns 1 for a malformed signature, 0 otherwise. */
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES],
                                      const uint8_t c[SEEDBYTES],
                                      const polyvecl *z,
                                      const polyveck *h);

int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES],
                                       polyvecl *z,
                                       polyveck *h,
                                       const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]);

#endif
@@ -0,0 +1,41 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM2_AVX2_PARAMS_H

/* Compile-time parameters of the Dilithium2 parameter set. */

/* Byte lengths of seeds (rho, key, challenge hash) and of tr. */
#define SEEDBYTES 32
#define CRHBYTES 48

/* Polynomial ring: N coefficients per polynomial, arithmetic modulo Q. */
#define N 256
#define Q 8380417
/* Exponent of the power of two used by power2round / shiftl. */
#define D 13
#define ROOT_OF_UNITY 1753

/* Vector lengths: polyveck holds K polynomials, polyvecl holds L. */
#define K 4
#define L 4

/* Remaining scheme parameters of this parameter set. */
#define ETA 2
#define TAU 39
#define BETA 78
#define GAMMA1 (1 << 17)
#define GAMMA2 ((Q-1)/88)
/* Size of the index area in the signature's hint encoding. */
#define OMEGA 80

#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2"

/* Byte lengths of the packed encodings. */
#define POLYT1_PACKEDBYTES  320
#define POLYT0_PACKEDBYTES  416
#define POLYVECH_PACKEDBYTES (OMEGA + K)
#define POLYZ_PACKEDBYTES   576
#define POLYW1_PACKEDBYTES  192
#define POLYETA_PACKEDBYTES 96

/* Key and signature sizes; with the values above: 1312, 2544 and 2420 bytes. */
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES \
    (2*SEEDBYTES + CRHBYTES + L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif
@@ -0,0 +1,199 @@ | |||
#include "params.h" | |||
#include "cdecl.h" | |||
.text | |||
/*
 * PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(c, a, b, qdata)
 *
 * Coefficient-wise product of two polynomials, each product followed by a
 * reduction step (multiply by the _8XQINV constant, multiply by the _8XQ
 * constant, subtract, keep the upper 32 bits of the 64-bit difference).
 * NOTE(review): this is the shape of a Montgomery reduction, assuming the
 * two table entries hold Q^-1 mod 2^32 and Q; the table itself is defined
 * elsewhere.
 *
 * Arguments, following the C prototype and the System V AMD64 argument order:
 *   rdi = c (output), rsi = a, rdx = b, rcx = qdata (constant table).
 *
 * Processes 10 * 96 + 64 = 1024 bytes per operand, i.e. 256 32-bit
 * coefficients. Every memory access uses vmovdqa, so all four pointers
 * must be 32-byte aligned.
 *
 * Clobbers ymm0-ymm15 and eax; rdi, rsi and rdx are advanced.
 *
 * Both a cdecl(...) and a _cdecl(...) symbol are emitted for the same
 * entry point; the macros come from cdecl.h (not shown here).
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): | |||
#consts | |||
/* ymm0 = reduction multiplier (_8XQINV entry), ymm1 = modulus (_8XQ entry) */
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
xor %eax,%eax | |||
/* Main loop: 10 iterations, three 32-byte vectors (24 coefficients) each. */
_looptop1: | |||
#load | |||
vmovdqa (%rsi),%ymm2 | |||
vmovdqa 32(%rsi),%ymm4 | |||
vmovdqa 64(%rsi),%ymm6 | |||
vmovdqa (%rdx),%ymm10 | |||
vmovdqa 32(%rdx),%ymm12 | |||
vmovdqa 64(%rdx),%ymm14 | |||
/*
 * vpmuldq only multiplies the low dword of every qword, so the odd
 * dwords are moved into the low position of a second register
 * (vpsrlq by 32 or vmovshdup both achieve that).
 */
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vmovshdup %ymm6,%ymm7 | |||
vpsrlq $32,%ymm10,%ymm11 | |||
vpsrlq $32,%ymm12,%ymm13 | |||
vmovshdup %ymm14,%ymm15 | |||
#mul | |||
/* 64-bit products: even coefficients in ymm2/4/6, odd ones in ymm3/5/7 */
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
vpmuldq %ymm6,%ymm14,%ymm6 | |||
vpmuldq %ymm7,%ymm15,%ymm7 | |||
#reduce | |||
/* t = low32(product) * ymm0 */
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm0,%ymm6,%ymm14 | |||
vpmuldq %ymm0,%ymm7,%ymm15 | |||
/* t = low32(t) * ymm1 */
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpmuldq %ymm1,%ymm14,%ymm14 | |||
vpmuldq %ymm1,%ymm15,%ymm15 | |||
/* product - t; the result is the upper dword of each qword */
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsubq %ymm14,%ymm6,%ymm6 | |||
vpsubq %ymm15,%ymm7,%ymm7 | |||
/* move the even results from the upper to the lower dword */
vpsrlq $32,%ymm2,%ymm2 | |||
vpsrlq $32,%ymm4,%ymm4 | |||
vmovshdup %ymm6,%ymm6 | |||
#store | |||
/* mask 0xAA: odd dwords from the odd-result register, even dwords kept */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
vpblendd $0xAA,%ymm5,%ymm4,%ymm4 | |||
vpblendd $0xAA,%ymm7,%ymm6,%ymm6 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
add $96,%rdi | |||
add $96,%rsi | |||
add $96,%rdx | |||
add $1,%eax | |||
cmp $10,%eax | |||
jb _looptop1 | |||
/* Tail: the remaining two vectors (64 bytes, 16 coefficients). */
vmovdqa (%rsi),%ymm2 | |||
vmovdqa 32(%rsi),%ymm4 | |||
vmovdqa (%rdx),%ymm10 | |||
vmovdqa 32(%rdx),%ymm12 | |||
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
#reduce | |||
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
/* mask 0x55 with swapped operands selects the same dwords as 0xAA above */
vpblendd $0x55,%ymm2,%ymm3,%ymm2 | |||
vpblendd $0x55,%ymm4,%ymm5,%ymm4 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
ret | |||
/*
 * pointwise off
 *
 * Loads two 32-byte vectors from rsi+off and two from rdx+off and leaves
 * the 64-bit products of corresponding 32-bit coefficients in
 *   ymm6 / ymm8: products of the even-indexed dwords,
 *   ymm7 / ymm9: products of the odd-indexed dwords.
 * No reduction is performed. Clobbers ymm6-ymm13.
 */
.macro pointwise off | |||
#load | |||
vmovdqa \off(%rsi),%ymm6 | |||
vmovdqa \off+32(%rsi),%ymm8 | |||
vmovdqa \off(%rdx),%ymm10 | |||
vmovdqa \off+32(%rdx),%ymm12 | |||
/* bring the odd dwords into the low half of each qword for vpmuldq */
vpsrlq $32,%ymm6,%ymm7 | |||
vpsrlq $32,%ymm8,%ymm9 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuldq %ymm6,%ymm10,%ymm6 | |||
vpmuldq %ymm7,%ymm11,%ymm7 | |||
vpmuldq %ymm8,%ymm12,%ymm8 | |||
vpmuldq %ymm9,%ymm13,%ymm9 | |||
.endm | |||
/*
 * acc
 *
 * Adds the four 64-bit product vectors ymm6-ymm9 (as left by the
 * pointwise macro) onto the running sums in ymm2-ymm5, qword by qword.
 */
.macro acc | |||
vpaddq %ymm6,%ymm2,%ymm2 | |||
vpaddq %ymm7,%ymm3,%ymm3 | |||
vpaddq %ymm8,%ymm4,%ymm4 | |||
vpaddq %ymm9,%ymm5,%ymm5 | |||
.endm | |||
/*
 * PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(c, a, b, qdata)
 *
 * Multiply-accumulate over four polynomial pairs: for every coefficient
 * position the 64-bit products taken at byte offsets 0, 1024, 2048 and
 * 3072 from a and b are summed, and the sum goes through one reduction
 * step (same sequence as in pointwise_avx) before being stored to c.
 * a and b therefore have to point at four consecutive 1024-byte
 * polynomials; c receives a single 1024-byte polynomial.
 *
 * Arguments, following the C prototype and the System V AMD64 argument order:
 *   rdi = c (output), rsi = a, rdx = b, rcx = qdata (constant table).
 *
 * 16 iterations of 64 bytes each. All accesses use vmovdqa, so the
 * pointers must be 32-byte aligned.
 *
 * Clobbers ymm0-ymm13 and eax; rdi, rsi and rdx are advanced.
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): | |||
#consts | |||
/* ymm0 = reduction multiplier (_8XQINV entry), ymm1 = modulus (_8XQ entry) */
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
xor %eax,%eax | |||
_looptop2: | |||
/* first pair initialises the accumulators ymm2-ymm5 */
pointwise 0 | |||
#mov | |||
vmovdqa %ymm6,%ymm2 | |||
vmovdqa %ymm7,%ymm3 | |||
vmovdqa %ymm8,%ymm4 | |||
vmovdqa %ymm9,%ymm5 | |||
/* remaining three pairs, one polynomial (1024 bytes) apart */
pointwise 1024 | |||
acc | |||
pointwise 2048 | |||
acc | |||
pointwise 3072 | |||
acc | |||
#reduce | |||
/* single reduction of the accumulated 64-bit sums */
vpmuldq %ymm0,%ymm2,%ymm6 | |||
vpmuldq %ymm0,%ymm3,%ymm7 | |||
vpmuldq %ymm0,%ymm4,%ymm8 | |||
vpmuldq %ymm0,%ymm5,%ymm9 | |||
vpmuldq %ymm1,%ymm6,%ymm6 | |||
vpmuldq %ymm1,%ymm7,%ymm7 | |||
vpmuldq %ymm1,%ymm8,%ymm8 | |||
vpmuldq %ymm1,%ymm9,%ymm9 | |||
vpsubq %ymm6,%ymm2,%ymm2 | |||
vpsubq %ymm7,%ymm3,%ymm3 | |||
vpsubq %ymm8,%ymm4,%ymm4 | |||
vpsubq %ymm9,%ymm5,%ymm5 | |||
/* move the even results from the upper to the lower dword */
vpsrlq $32,%ymm2,%ymm2 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
/* interleave: even dwords from ymm2/ymm4, odd dwords from ymm3/ymm5 */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
vpblendd $0xAA,%ymm5,%ymm4,%ymm4 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
add $64,%rsi | |||
add $64,%rdx | |||
add $64,%rdi | |||
add $1,%eax | |||
cmp $16,%eax | |||
jb _looptop2 | |||
ret |
@@ -0,0 +1,79 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLY_H
#define PQCLEAN_DILITHIUM2_AVX2_POLY_H

/*
 * Polynomial type and the operations on single polynomials.
 */

#include "align.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* A polynomial: N int32_t coefficients, overlaid with __m256i vectors. */
typedef ALIGNED_INT32(N) poly;

/* Coefficient reduction. */
void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a);

/* Coefficient-wise arithmetic. */
void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a);

/* NTT domain. */
void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

/* Rounding and hints. */
void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h);

/* Norm check. */
int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, int32_t B);

/* Sampling of a single polynomial. */
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(poly *a,
        const uint8_t seed[CRHBYTES],
        uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Sampling of four polynomials in one call, one nonce per polynomial. */
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0, poly *a1, poly *a2, poly *a3,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0, poly *a1, poly *a2, poly *a3,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(poly *a0, poly *a1, poly *a2, poly *a3,
        const uint8_t seed[CRHBYTES],
        uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3);

/*
 * Bit-packing. Note the declared buffer sizes of polyz_unpack
 * (POLYZ_PACKEDBYTES + 14) and polyw1_pack (POLYW1_PACKEDBYTES + 8):
 * they are larger than the nominal packed length.
 * NOTE(review): presumably these routines touch bytes past the nominal
 * end of the encoding - confirm against the implementation.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]);

void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a);

#endif
@@ -0,0 +1,474 @@ | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/* Mark a parameter or variable as intentionally unused. The argument is
 * parenthesized so that the cast applies to the whole expression passed in. */
#define UNUSED(x) ((void)(x))
/************************************************* | |||
* Name: expand_mat | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|j|i) | |||
* or AES256CTR(rho,j|i). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(&mat[0], NULL, rho); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(&mat[1], NULL, rho); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(&mat[2], NULL, rho); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(&mat[3], NULL, rho); | |||
} | |||
/*
 * Expand row 0 of the matrix: sample the four entries of rowa from rho with
 * nonces 0..3 (nonce = (row << 8) + column), then run nttunpack on each of
 * them. rowb is not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;

    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3,
                                            rho, 0x000, 0x001, 0x002, 0x003);
    /* the 4x sampler fills exactly four polynomials */
    for (col = 0; col < 4; col++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}
/*
 * Expand row 1 of the matrix: sample the four entries of rowa from rho with
 * nonces 256..259 (nonce = (row << 8) + column), then run nttunpack on each
 * of them. rowb is not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;

    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3,
                                            rho, 0x100, 0x101, 0x102, 0x103);
    /* the 4x sampler fills exactly four polynomials */
    for (col = 0; col < 4; col++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}
/*
 * Expand row 2 of the matrix: sample the four entries of rowa from rho with
 * nonces 512..515 (nonce = (row << 8) + column), then run nttunpack on each
 * of them. rowb is not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;

    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3,
                                            rho, 0x200, 0x201, 0x202, 0x203);
    /* the 4x sampler fills exactly four polynomials */
    for (col = 0; col < 4; col++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}
/*
 * Expand row 3 of the matrix: sample the four entries of rowa from rho with
 * nonces 768..771 (nonce = (row << 8) + column), then run nttunpack on each
 * of them. rowb is not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;

    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3,
                                            rho, 0x300, 0x301, 0x302, 0x303);
    /* the 4x sampler fills exactly four polynomials */
    for (col = 0; col < 4; col++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}
/*
 * Matrix-vector product: entry i of t is the pointwise accumulated product
 * of matrix row mat[i] with the vector v, for each of the K rows.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    poly *dst = t->vec;
    const polyvecl *row = mat;
    const polyvecl *const stop = mat + K;

    while (row != stop) {
        PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(dst, row, v);
        dst++;
        row++;
    }
}
/**************************************************************/ | |||
/************ Vectors of polynomials of length L **************/ | |||
/**************************************************************/ | |||
/*
 * Sample all L polynomials of v with poly_uniform_eta from the same seed;
 * polynomial i uses nonce + i (reduced to 16 bits).
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < L; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(&v->vec[idx], seed, (uint16_t)(nonce + idx));
    }
}
/*
 * Sample all L polynomials of v with poly_uniform_gamma1 from the same
 * seed; polynomial i uses the 16-bit nonce L * nonce + i.
 *
 * NOTE(review): the parameter is declared seed[SEEDBYTES] here, but the
 * callee PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1 is declared with
 * seed[CRHBYTES]; callers presumably have to supply CRHBYTES bytes -
 * confirm and align the declared bounds (here and in the header) together.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int i;

    for (i = 0; i < L; ++i) {
        /* The sum is computed in unsigned int; narrow it explicitly to the
         * 16-bit nonce type instead of relying on an implicit conversion. */
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t)(L * nonce + i));
    }
}
/* Apply poly_reduce to each of the L polynomials of v, in place. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v) {
    poly *cur = v->vec;
    poly *const stop = cur + L;

    while (cur != stop) {
        PQCLEAN_DILITHIUM2_AVX2_poly_reduce(cur);
        cur++;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length L | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_add | |||
* | |||
* Description: Add vectors of polynomials of length L. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyvecl *w: pointer to output vector | |||
* - const polyvecl *u: pointer to first summand | |||
* - const polyvecl *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length L. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/* Apply poly_invntt_tomont to each of the L polynomials of v, in place. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    for (unsigned int idx = 0; idx < L; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(v->vec + idx);
    }
}
/*
 * Multiply every polynomial of v by the single polynomial a using
 * poly_pointwise_montgomery; entry i of the product is stored in r->vec[i].
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    poly *dst = r->vec;
    const poly *src = v->vec;
    const poly *const stop = src + L;

    while (src != stop) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(dst, a, src);
        dst++;
        src++;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply vectors of polynomials of length L, multiply | |||
* resulting vector by 2^{-32} and add (accumulate) polynomials | |||
* in it. Input/output vectors are in NTT domain representation. | |||
* | |||
* Arguments: - poly *w: output polynomial | |||
* - const polyvecl *u: pointer to first input vector | |||
* - const polyvecl *v: pointer to second input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { | |||
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM2_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length L. | |||
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(). | |||
* | |||
* Arguments: - const polyvecl *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/**************************************************************/ | |||
/************ Vectors of polynomials of length K **************/ | |||
/**************************************************************/ | |||
/*
 * Sample all K polynomials of v with poly_uniform_eta from the same seed;
 * polynomial i uses nonce + i (reduced to 16 bits).
 */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(&v->vec[idx], seed, (uint16_t)(nonce + idx));
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to representatives in [-6283009,6283007]. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq | |||
* | |||
* Description: For all coefficients of polynomials in vector of length K | |||
* add Q if coefficient is negative. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_add | |||
* | |||
* Description: Add vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first summand | |||
* - const polyveck *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_sub | |||
* | |||
* Description: Subtract vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first input vector | |||
* - const polyveck *v: pointer to second input vector to be | |||
* subtracted from first input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl | |||
* | |||
* Description: Multiply vector of polynomials of Length K by 2^D without modular | |||
* reduction. Assumes input coefficients to be less than 2^{31-D}. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length K. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by 2^{32} of polynomials | |||
* in vector of length K. Input coefficients need to be less | |||
* than 2*Q. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
/*
 * Multiply every polynomial of v by the single polynomial a using
 * poly_pointwise_montgomery; entry i of the product is stored in r->vec[i].
 */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    poly *dst = r->vec;
    const poly *src = v->vec;
    const poly *const stop = src + K;

    while (src != stop) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(dst, a, src);
        dst++;
        src++;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length K. | |||
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(). | |||
* | |||
* Arguments: - const polyveck *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 | |||
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 | |||
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we | |||
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint | |||
* | |||
* Description: Compute hint vector. | |||
* | |||
* Arguments: - uint8_t *hint: pointer to output hint array | |||
* - const polyveck *v0: pointer to low part of input vector | |||
* - const polyveck *v1: pointer to high part of input vector | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { | |||
unsigned int i, n = 0; | |||
for (i = 0; i < K; ++i) { | |||
n += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); | |||
} | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint | |||
* | |||
* Description: Use hint vector to correct the high bits of input vector. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector of polynomials with | |||
* corrected high bits | |||
* - const polyveck *u: pointer to input vector | |||
* - const polyveck *h: pointer to input hint vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); | |||
} | |||
} |
@@ -0,0 +1,72 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H
#define PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H
#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
    poly vec[L];
} polyvecl;

/* Sampling of length-L vectors from a seed and a starting nonce. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Element-wise operations on length-L vectors. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w,
        const polyvecl *u,
        const polyvecl *v);

/* Returns 0 if every coefficient norm is strictly below bound, 1 otherwise. */
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound);

/* Vectors of polynomials of length K */
typedef struct {
    poly vec[K];
} polyveck;

/* Sampling of a length-K vector from a seed and a starting nonce. */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Element-wise operations on length-K vectors. */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

/* Returns 0 if every coefficient norm is strictly below bound, 1 otherwise. */
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound);

/* Rounding, hint generation/usage and w1 packing for length-K vectors. */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

/* Expansion of the K x L matrix from the seed rho, whole or row by row. */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);

/* Matrix-vector product t = mat * v (pointwise, Montgomery domain). */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif
@@ -0,0 +1,408 @@ | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/*
 * Lane-compaction lookup table indexed by an 8-bit mask.
 *
 * Row m lists, in ascending order, the positions of the bits set in m,
 * padded with zeros to 8 entries. The rejection samplers use a row as a
 * permutation/shuffle control that moves the lanes selected by a movemask
 * result to the front of a vector; make_hint_avx uses a row as the list of
 * flagged lane offsets within a group of 8 coefficients.
 */
const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8] = {
    { 0, 0, 0, 0, 0, 0, 0, 0},
    { 0, 0, 0, 0, 0, 0, 0, 0},
    { 1, 0, 0, 0, 0, 0, 0, 0},
    { 0, 1, 0, 0, 0, 0, 0, 0},
    { 2, 0, 0, 0, 0, 0, 0, 0},
    { 0, 2, 0, 0, 0, 0, 0, 0},
    { 1, 2, 0, 0, 0, 0, 0, 0},
    { 0, 1, 2, 0, 0, 0, 0, 0},
    { 3, 0, 0, 0, 0, 0, 0, 0},
    { 0, 3, 0, 0, 0, 0, 0, 0},
    { 1, 3, 0, 0, 0, 0, 0, 0},
    { 0, 1, 3, 0, 0, 0, 0, 0},
    { 2, 3, 0, 0, 0, 0, 0, 0},
    { 0, 2, 3, 0, 0, 0, 0, 0},
    { 1, 2, 3, 0, 0, 0, 0, 0},
    { 0, 1, 2, 3, 0, 0, 0, 0},
    { 4, 0, 0, 0, 0, 0, 0, 0},
    { 0, 4, 0, 0, 0, 0, 0, 0},
    { 1, 4, 0, 0, 0, 0, 0, 0},
    { 0, 1, 4, 0, 0, 0, 0, 0},
    { 2, 4, 0, 0, 0, 0, 0, 0},
    { 0, 2, 4, 0, 0, 0, 0, 0},
    { 1, 2, 4, 0, 0, 0, 0, 0},
    { 0, 1, 2, 4, 0, 0, 0, 0},
    { 3, 4, 0, 0, 0, 0, 0, 0},
    { 0, 3, 4, 0, 0, 0, 0, 0},
    { 1, 3, 4, 0, 0, 0, 0, 0},
    { 0, 1, 3, 4, 0, 0, 0, 0},
    { 2, 3, 4, 0, 0, 0, 0, 0},
    { 0, 2, 3, 4, 0, 0, 0, 0},
    { 1, 2, 3, 4, 0, 0, 0, 0},
    { 0, 1, 2, 3, 4, 0, 0, 0},
    { 5, 0, 0, 0, 0, 0, 0, 0},
    { 0, 5, 0, 0, 0, 0, 0, 0},
    { 1, 5, 0, 0, 0, 0, 0, 0},
    { 0, 1, 5, 0, 0, 0, 0, 0},
    { 2, 5, 0, 0, 0, 0, 0, 0},
    { 0, 2, 5, 0, 0, 0, 0, 0},
    { 1, 2, 5, 0, 0, 0, 0, 0},
    { 0, 1, 2, 5, 0, 0, 0, 0},
    { 3, 5, 0, 0, 0, 0, 0, 0},
    { 0, 3, 5, 0, 0, 0, 0, 0},
    { 1, 3, 5, 0, 0, 0, 0, 0},
    { 0, 1, 3, 5, 0, 0, 0, 0},
    { 2, 3, 5, 0, 0, 0, 0, 0},
    { 0, 2, 3, 5, 0, 0, 0, 0},
    { 1, 2, 3, 5, 0, 0, 0, 0},
    { 0, 1, 2, 3, 5, 0, 0, 0},
    { 4, 5, 0, 0, 0, 0, 0, 0},
    { 0, 4, 5, 0, 0, 0, 0, 0},
    { 1, 4, 5, 0, 0, 0, 0, 0},
    { 0, 1, 4, 5, 0, 0, 0, 0},
    { 2, 4, 5, 0, 0, 0, 0, 0},
    { 0, 2, 4, 5, 0, 0, 0, 0},
    { 1, 2, 4, 5, 0, 0, 0, 0},
    { 0, 1, 2, 4, 5, 0, 0, 0},
    { 3, 4, 5, 0, 0, 0, 0, 0},
    { 0, 3, 4, 5, 0, 0, 0, 0},
    { 1, 3, 4, 5, 0, 0, 0, 0},
    { 0, 1, 3, 4, 5, 0, 0, 0},
    { 2, 3, 4, 5, 0, 0, 0, 0},
    { 0, 2, 3, 4, 5, 0, 0, 0},
    { 1, 2, 3, 4, 5, 0, 0, 0},
    { 0, 1, 2, 3, 4, 5, 0, 0},
    { 6, 0, 0, 0, 0, 0, 0, 0},
    { 0, 6, 0, 0, 0, 0, 0, 0},
    { 1, 6, 0, 0, 0, 0, 0, 0},
    { 0, 1, 6, 0, 0, 0, 0, 0},
    { 2, 6, 0, 0, 0, 0, 0, 0},
    { 0, 2, 6, 0, 0, 0, 0, 0},
    { 1, 2, 6, 0, 0, 0, 0, 0},
    { 0, 1, 2, 6, 0, 0, 0, 0},
    { 3, 6, 0, 0, 0, 0, 0, 0},
    { 0, 3, 6, 0, 0, 0, 0, 0},
    { 1, 3, 6, 0, 0, 0, 0, 0},
    { 0, 1, 3, 6, 0, 0, 0, 0},
    { 2, 3, 6, 0, 0, 0, 0, 0},
    { 0, 2, 3, 6, 0, 0, 0, 0},
    { 1, 2, 3, 6, 0, 0, 0, 0},
    { 0, 1, 2, 3, 6, 0, 0, 0},
    { 4, 6, 0, 0, 0, 0, 0, 0},
    { 0, 4, 6, 0, 0, 0, 0, 0},
    { 1, 4, 6, 0, 0, 0, 0, 0},
    { 0, 1, 4, 6, 0, 0, 0, 0},
    { 2, 4, 6, 0, 0, 0, 0, 0},
    { 0, 2, 4, 6, 0, 0, 0, 0},
    { 1, 2, 4, 6, 0, 0, 0, 0},
    { 0, 1, 2, 4, 6, 0, 0, 0},
    { 3, 4, 6, 0, 0, 0, 0, 0},
    { 0, 3, 4, 6, 0, 0, 0, 0},
    { 1, 3, 4, 6, 0, 0, 0, 0},
    { 0, 1, 3, 4, 6, 0, 0, 0},
    { 2, 3, 4, 6, 0, 0, 0, 0},
    { 0, 2, 3, 4, 6, 0, 0, 0},
    { 1, 2, 3, 4, 6, 0, 0, 0},
    { 0, 1, 2, 3, 4, 6, 0, 0},
    { 5, 6, 0, 0, 0, 0, 0, 0},
    { 0, 5, 6, 0, 0, 0, 0, 0},
    { 1, 5, 6, 0, 0, 0, 0, 0},
    { 0, 1, 5, 6, 0, 0, 0, 0},
    { 2, 5, 6, 0, 0, 0, 0, 0},
    { 0, 2, 5, 6, 0, 0, 0, 0},
    { 1, 2, 5, 6, 0, 0, 0, 0},
    { 0, 1, 2, 5, 6, 0, 0, 0},
    { 3, 5, 6, 0, 0, 0, 0, 0},
    { 0, 3, 5, 6, 0, 0, 0, 0},
    { 1, 3, 5, 6, 0, 0, 0, 0},
    { 0, 1, 3, 5, 6, 0, 0, 0},
    { 2, 3, 5, 6, 0, 0, 0, 0},
    { 0, 2, 3, 5, 6, 0, 0, 0},
    { 1, 2, 3, 5, 6, 0, 0, 0},
    { 0, 1, 2, 3, 5, 6, 0, 0},
    { 4, 5, 6, 0, 0, 0, 0, 0},
    { 0, 4, 5, 6, 0, 0, 0, 0},
    { 1, 4, 5, 6, 0, 0, 0, 0},
    { 0, 1, 4, 5, 6, 0, 0, 0},
    { 2, 4, 5, 6, 0, 0, 0, 0},
    { 0, 2, 4, 5, 6, 0, 0, 0},
    { 1, 2, 4, 5, 6, 0, 0, 0},
    { 0, 1, 2, 4, 5, 6, 0, 0},
    { 3, 4, 5, 6, 0, 0, 0, 0},
    { 0, 3, 4, 5, 6, 0, 0, 0},
    { 1, 3, 4, 5, 6, 0, 0, 0},
    { 0, 1, 3, 4, 5, 6, 0, 0},
    { 2, 3, 4, 5, 6, 0, 0, 0},
    { 0, 2, 3, 4, 5, 6, 0, 0},
    { 1, 2, 3, 4, 5, 6, 0, 0},
    { 0, 1, 2, 3, 4, 5, 6, 0},
    { 7, 0, 0, 0, 0, 0, 0, 0},
    { 0, 7, 0, 0, 0, 0, 0, 0},
    { 1, 7, 0, 0, 0, 0, 0, 0},
    { 0, 1, 7, 0, 0, 0, 0, 0},
    { 2, 7, 0, 0, 0, 0, 0, 0},
    { 0, 2, 7, 0, 0, 0, 0, 0},
    { 1, 2, 7, 0, 0, 0, 0, 0},
    { 0, 1, 2, 7, 0, 0, 0, 0},
    { 3, 7, 0, 0, 0, 0, 0, 0},
    { 0, 3, 7, 0, 0, 0, 0, 0},
    { 1, 3, 7, 0, 0, 0, 0, 0},
    { 0, 1, 3, 7, 0, 0, 0, 0},
    { 2, 3, 7, 0, 0, 0, 0, 0},
    { 0, 2, 3, 7, 0, 0, 0, 0},
    { 1, 2, 3, 7, 0, 0, 0, 0},
    { 0, 1, 2, 3, 7, 0, 0, 0},
    { 4, 7, 0, 0, 0, 0, 0, 0},
    { 0, 4, 7, 0, 0, 0, 0, 0},
    { 1, 4, 7, 0, 0, 0, 0, 0},
    { 0, 1, 4, 7, 0, 0, 0, 0},
    { 2, 4, 7, 0, 0, 0, 0, 0},
    { 0, 2, 4, 7, 0, 0, 0, 0},
    { 1, 2, 4, 7, 0, 0, 0, 0},
    { 0, 1, 2, 4, 7, 0, 0, 0},
    { 3, 4, 7, 0, 0, 0, 0, 0},
    { 0, 3, 4, 7, 0, 0, 0, 0},
    { 1, 3, 4, 7, 0, 0, 0, 0},
    { 0, 1, 3, 4, 7, 0, 0, 0},
    { 2, 3, 4, 7, 0, 0, 0, 0},
    { 0, 2, 3, 4, 7, 0, 0, 0},
    { 1, 2, 3, 4, 7, 0, 0, 0},
    { 0, 1, 2, 3, 4, 7, 0, 0},
    { 5, 7, 0, 0, 0, 0, 0, 0},
    { 0, 5, 7, 0, 0, 0, 0, 0},
    { 1, 5, 7, 0, 0, 0, 0, 0},
    { 0, 1, 5, 7, 0, 0, 0, 0},
    { 2, 5, 7, 0, 0, 0, 0, 0},
    { 0, 2, 5, 7, 0, 0, 0, 0},
    { 1, 2, 5, 7, 0, 0, 0, 0},
    { 0, 1, 2, 5, 7, 0, 0, 0},
    { 3, 5, 7, 0, 0, 0, 0, 0},
    { 0, 3, 5, 7, 0, 0, 0, 0},
    { 1, 3, 5, 7, 0, 0, 0, 0},
    { 0, 1, 3, 5, 7, 0, 0, 0},
    { 2, 3, 5, 7, 0, 0, 0, 0},
    { 0, 2, 3, 5, 7, 0, 0, 0},
    { 1, 2, 3, 5, 7, 0, 0, 0},
    { 0, 1, 2, 3, 5, 7, 0, 0},
    { 4, 5, 7, 0, 0, 0, 0, 0},
    { 0, 4, 5, 7, 0, 0, 0, 0},
    { 1, 4, 5, 7, 0, 0, 0, 0},
    { 0, 1, 4, 5, 7, 0, 0, 0},
    { 2, 4, 5, 7, 0, 0, 0, 0},
    { 0, 2, 4, 5, 7, 0, 0, 0},
    { 1, 2, 4, 5, 7, 0, 0, 0},
    { 0, 1, 2, 4, 5, 7, 0, 0},
    { 3, 4, 5, 7, 0, 0, 0, 0},
    { 0, 3, 4, 5, 7, 0, 0, 0},
    { 1, 3, 4, 5, 7, 0, 0, 0},
    { 0, 1, 3, 4, 5, 7, 0, 0},
    { 2, 3, 4, 5, 7, 0, 0, 0},
    { 0, 2, 3, 4, 5, 7, 0, 0},
    { 1, 2, 3, 4, 5, 7, 0, 0},
    { 0, 1, 2, 3, 4, 5, 7, 0},
    { 6, 7, 0, 0, 0, 0, 0, 0},
    { 0, 6, 7, 0, 0, 0, 0, 0},
    { 1, 6, 7, 0, 0, 0, 0, 0},
    { 0, 1, 6, 7, 0, 0, 0, 0},
    { 2, 6, 7, 0, 0, 0, 0, 0},
    { 0, 2, 6, 7, 0, 0, 0, 0},
    { 1, 2, 6, 7, 0, 0, 0, 0},
    { 0, 1, 2, 6, 7, 0, 0, 0},
    { 3, 6, 7, 0, 0, 0, 0, 0},
    { 0, 3, 6, 7, 0, 0, 0, 0},
    { 1, 3, 6, 7, 0, 0, 0, 0},
    { 0, 1, 3, 6, 7, 0, 0, 0},
    { 2, 3, 6, 7, 0, 0, 0, 0},
    { 0, 2, 3, 6, 7, 0, 0, 0},
    { 1, 2, 3, 6, 7, 0, 0, 0},
    { 0, 1, 2, 3, 6, 7, 0, 0},
    { 4, 6, 7, 0, 0, 0, 0, 0},
    { 0, 4, 6, 7, 0, 0, 0, 0},
    { 1, 4, 6, 7, 0, 0, 0, 0},
    { 0, 1, 4, 6, 7, 0, 0, 0},
    { 2, 4, 6, 7, 0, 0, 0, 0},
    { 0, 2, 4, 6, 7, 0, 0, 0},
    { 1, 2, 4, 6, 7, 0, 0, 0},
    { 0, 1, 2, 4, 6, 7, 0, 0},
    { 3, 4, 6, 7, 0, 0, 0, 0},
    { 0, 3, 4, 6, 7, 0, 0, 0},
    { 1, 3, 4, 6, 7, 0, 0, 0},
    { 0, 1, 3, 4, 6, 7, 0, 0},
    { 2, 3, 4, 6, 7, 0, 0, 0},
    { 0, 2, 3, 4, 6, 7, 0, 0},
    { 1, 2, 3, 4, 6, 7, 0, 0},
    { 0, 1, 2, 3, 4, 6, 7, 0},
    { 5, 6, 7, 0, 0, 0, 0, 0},
    { 0, 5, 6, 7, 0, 0, 0, 0},
    { 1, 5, 6, 7, 0, 0, 0, 0},
    { 0, 1, 5, 6, 7, 0, 0, 0},
    { 2, 5, 6, 7, 0, 0, 0, 0},
    { 0, 2, 5, 6, 7, 0, 0, 0},
    { 1, 2, 5, 6, 7, 0, 0, 0},
    { 0, 1, 2, 5, 6, 7, 0, 0},
    { 3, 5, 6, 7, 0, 0, 0, 0},
    { 0, 3, 5, 6, 7, 0, 0, 0},
    { 1, 3, 5, 6, 7, 0, 0, 0},
    { 0, 1, 3, 5, 6, 7, 0, 0},
    { 2, 3, 5, 6, 7, 0, 0, 0},
    { 0, 2, 3, 5, 6, 7, 0, 0},
    { 1, 2, 3, 5, 6, 7, 0, 0},
    { 0, 1, 2, 3, 5, 6, 7, 0},
    { 4, 5, 6, 7, 0, 0, 0, 0},
    { 0, 4, 5, 6, 7, 0, 0, 0},
    { 1, 4, 5, 6, 7, 0, 0, 0},
    { 0, 1, 4, 5, 6, 7, 0, 0},
    { 2, 4, 5, 6, 7, 0, 0, 0},
    { 0, 2, 4, 5, 6, 7, 0, 0},
    { 1, 2, 4, 5, 6, 7, 0, 0},
    { 0, 1, 2, 4, 5, 6, 7, 0},
    { 3, 4, 5, 6, 7, 0, 0, 0},
    { 0, 3, 4, 5, 6, 7, 0, 0},
    { 1, 3, 4, 5, 6, 7, 0, 0},
    { 0, 1, 3, 4, 5, 6, 7, 0},
    { 2, 3, 4, 5, 6, 7, 0, 0},
    { 0, 2, 3, 4, 5, 6, 7, 0},
    { 1, 2, 3, 4, 5, 6, 7, 0},
    { 0, 1, 2, 3, 4, 5, 6, 7}
};
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
__m256i d, tmp; | |||
const __m256i bound = _mm256_set1_epi32(Q); | |||
const __m256i mask = _mm256_set1_epi32(0x7FFFFF); | |||
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, | |||
-1, 9, 8, 7, -1, 6, 5, 4, | |||
-1, 11, 10, 9, -1, 8, 7, 6, | |||
-1, 5, 4, 3, -1, 2, 1, 0); | |||
ctr = pos = 0; | |||
while (pos <= REJ_UNIFORM_BUFLEN - 24) { | |||
d = _mm256_loadu_si256((__m256i *)&buf[pos]); | |||
d = _mm256_permute4x64_epi64(d, 0x94); | |||
d = _mm256_shuffle_epi8(d, idx8); | |||
d = _mm256_and_si256(d, mask); | |||
pos += 24; | |||
tmp = _mm256_sub_epi32(d, bound); | |||
good = _mm256_movemask_ps((__m256)tmp); | |||
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good])); | |||
d = _mm256_permutevar8x32_epi32(d, tmp); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], d); | |||
ctr += _mm_popcnt_u32(good); | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
} | |||
uint32_t t; | |||
while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) { | |||
t = buf[pos++]; | |||
t |= (uint32_t)buf[pos++] << 8; | |||
t |= (uint32_t)buf[pos++] << 16; | |||
t &= 0x7FFFFF; | |||
if (t < Q) { | |||
r[ctr++] = t; | |||
} | |||
} | |||
return ctr; | |||
} | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
__m256i f0, f1, f2; | |||
__m128i g0, g1; | |||
const __m256i mask = _mm256_set1_epi8(15); | |||
const __m256i eta = _mm256_set1_epi8(ETA); | |||
const __m256i bound = mask; | |||
const __m256i v = _mm256_set1_epi32(-6560); | |||
const __m256i p = _mm256_set1_epi32(5); | |||
ctr = pos = 0; | |||
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { | |||
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); | |||
f1 = _mm256_slli_epi16(f0, 4); | |||
f0 = _mm256_or_si256(f0, f1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f1 = _mm256_sub_epi8(f0, bound); | |||
f0 = _mm256_sub_epi8(eta, f0); | |||
good = _mm256_movemask_epi8(f1); | |||
g0 = _mm256_castsi256_si128(f0); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm256_extracti128_si256(f0, 1); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 4; | |||
} | |||
uint32_t t0, t1; | |||
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { | |||
t0 = buf[pos] & 0x0F; | |||
t1 = buf[pos++] >> 4; | |||
if (t0 < 15) { | |||
t0 = t0 - (205 * t0 >> 10) * 5; | |||
r[ctr++] = 2 - t0; | |||
} | |||
if (t1 < 15 && ctr < N) { | |||
t1 = t1 - (205 * t1 >> 10) * 5; | |||
r[ctr++] = 2 - t1; | |||
} | |||
} | |||
return ctr; | |||
} |
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H
#define PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* Input buffer size for rej_uniform_avx: 768 bytes rounded up to whole stream blocks. */
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)

/* Input buffer size for rej_eta_avx: 137 bytes rounded up to whole stream blocks. */
#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)

/* Row m lists the positions of the bits set in the 8-bit mask m, zero padded. */
extern const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8];

/* buf carries 8 bytes of slack behind REJ_UNIFORM_BUFLEN for the last vector load. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);

/* The array bound matches the definition in rejsample.c (REJ_UNIFORM_ETA_BUFLEN). */
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]);

#endif
@@ -0,0 +1,157 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "rounding.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
/*
 * Per-32-bit-lane select built on the float blend: the result lane comes
 * from b where the sign bit of the matching mask lane is set, else from a.
 */
#define _mm256_blendv_epi32(a,b,mask) \
    _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _mm256_castsi256_ps(mask)))
/************************************************* | |||
* Name: power2round | |||
* | |||
* Description: For finite field elements a, compute a0, a1 such that | |||
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be positive standard representative. | |||
* | |||
* Arguments: - __m256i *a1: output array of length N/8 with high bits | |||
* - __m256i *a0: output array of length N/8 with low bits a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1; | |||
const __m256i mask = _mm256_set1_epi32(-(1 << D)); | |||
const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); | |||
for (i = 0; i < N / 8; ++i) { | |||
f = _mm256_load_si256(&a[i]); | |||
f1 = _mm256_add_epi32(f, half); | |||
f0 = _mm256_and_si256(f1, mask); | |||
f1 = _mm256_srli_epi32(f1, D); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: decompose | |||
* | |||
* Description: For finite field element a, compute high and low parts a0, a1 such | |||
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and | |||
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard | |||
* representative. | |||
* | |||
* Arguments: - __m256i *a1: output array of length N/8 with high parts | |||
* - __m256i *a0: output array of length N/8 with low parts a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1, t; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2_AVX2_qdata.vec[_8XQ / 8]); | |||
const __m256i hq = _mm256_srli_epi32(q, 1); | |||
const __m256i v = _mm256_set1_epi32(11275); | |||
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); | |||
const __m256i off = _mm256_set1_epi32(127); | |||
const __m256i shift = _mm256_set1_epi32(128); | |||
const __m256i max = _mm256_set1_epi32(43); | |||
const __m256i zero = _mm256_setzero_si256(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a[i]); | |||
f1 = _mm256_add_epi32(f, off); | |||
f1 = _mm256_srli_epi32(f1, 7); | |||
f1 = _mm256_mulhi_epu16(f1, v); | |||
f1 = _mm256_mulhrs_epi16(f1, shift); | |||
t = _mm256_sub_epi32(max, f1); | |||
f1 = _mm256_blendv_epi32(f1, zero, t); | |||
f0 = _mm256_mullo_epi32(f1, alpha); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
f = _mm256_cmpgt_epi32(f0, hq); | |||
f = _mm256_and_si256(f, q); | |||
f0 = _mm256_sub_epi32(f0, f); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: make_hint | |||
* | |||
* Description: Compute indices of polynomial coefficients whose low bits | |||
* overflow into the high bits. | |||
* | |||
* Arguments: - uint8_t *hint: hint array | |||
* - const __m256i *a0: low bits of input elements | |||
* - const __m256i *a1: high bits of input elements | |||
* | |||
* Returns number of overflowing low bits | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { | |||
unsigned int i, n = 0; | |||
__m256i f0, f1, g0, g1; | |||
uint32_t bad; | |||
uint64_t idx; | |||
const __m256i low = _mm256_set1_epi32(-GAMMA2); | |||
const __m256i high = _mm256_set1_epi32(GAMMA2); | |||
for (i = 0; i < N / 8; ++i) { | |||
f0 = _mm256_load_si256(&a0[i]); | |||
f1 = _mm256_load_si256(&a1[i]); | |||
g0 = _mm256_abs_epi32(f0); | |||
g0 = _mm256_cmpgt_epi32(g0, high); | |||
g1 = _mm256_cmpeq_epi32(f0, low); | |||
g1 = _mm256_sign_epi32(g1, f1); | |||
g0 = _mm256_or_si256(g0, g1); | |||
bad = _mm256_movemask_ps((__m256)g0); | |||
memcpy(&idx, PQCLEAN_DILITHIUM2_AVX2_idxlut[bad], 8); | |||
idx += (uint64_t)0x0808080808080808 * i; | |||
memcpy(&hint[n], &idx, 8); | |||
n += _mm_popcnt_u32(bad); | |||
} | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: use_hint | |||
* | |||
* Description: Correct high parts according to hint. | |||
* | |||
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts | |||
* - const __m256i *a: input array of length N/8 | |||
* - const __m256i *a: input array of length N/8 with hint bits | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { | |||
unsigned int i; | |||
__m256i a0[N / 8]; | |||
__m256i f, g, h, t; | |||
const __m256i zero = _mm256_setzero_si256(); | |||
const __m256i max = _mm256_set1_epi32(43); | |||
PQCLEAN_DILITHIUM2_AVX2_decompose_avx(b, a0, a); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a0[i]); | |||
g = _mm256_load_si256(&b[i]); | |||
h = _mm256_load_si256(&hint[i]); | |||
t = _mm256_blendv_epi32(zero, h, f); | |||
t = _mm256_slli_epi32(t, 1); | |||
h = _mm256_sub_epi32(h, t); | |||
g = _mm256_add_epi32(g, h); | |||
g = _mm256_blendv_epi32(g, max, g); | |||
f = _mm256_cmpgt_epi32(g, max); | |||
g = _mm256_blendv_epi32(g, zero, f); | |||
_mm256_store_si256(&b[i], g); | |||
} | |||
} |
@@ -0,0 +1,12 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H
#define PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H
#include "params.h"
#include <immintrin.h>
#include <stdint.h>

/* Vectorized rounding helpers; every __m256i array holds N/8 vectors of 8 coefficients. */

/* Split a into a1*2^D + a0. */
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(
    __m256i *a1,
    __m256i *a0,
    const __m256i *a);

/* Split a into high part a1 and low part a0 with respect to ALPHA. */
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(
    __m256i *a1,
    __m256i *a0,
    const __m256i *a);

/* Write the indices of overflowing coefficients to hint and return their count. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(
    uint8_t hint[N],
    const __m256i *a0,
    const __m256i *a1);

/* Correct the high parts of a according to hint and store them in b. */
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(
    __m256i *b,
    const __m256i *a,
    const __m256i *hint);

#endif
@@ -0,0 +1,54 @@ | |||
#include "cdecl.h"
.include "shuffle.inc"
.text
/*
 * nttunpack128_avx (file-local helper)
 *
 * In:  rdi = address of a 32-byte aligned block of 256 bytes (vmovdqa is
 *      used for every access).
 * Out: the same 256 bytes, rearranged in place.
 *
 * Eight 32-byte vectors are loaded into ymm4..ymm11, passed through the
 * shuffle8, shuffle4 and shuffle2 macros from shuffle.inc, which exchange
 * 128-bit, 64-bit and 32-bit groups between register pairs, and stored
 * back. NOTE(review): the three levels together look like an 8x8 transpose
 * of 32-bit words; confirm against the layout the NTT code expects.
 *
 * Clobbers ymm3..ymm11. The register numbers handed to the macros are
 * order sensitive because every macro overwrites some of its inputs.
 */
nttunpack128_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11
/* 128-bit level: pair vector i with vector i+4 */
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11
/* 64-bit level */
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11
/* 32-bit level */
shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11
#store
/* The results sit in ymm9,8,7,6,5,4,3,11 in output order. */
vmovdqa %ymm9,(%rdi)
vmovdqa %ymm8,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm3,192(%rdi)
vmovdqa %ymm11,224(%rdi)
ret
/*
 * PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx
 *
 * In:  rdi = address of a 32-byte aligned block of 1024 bytes.
 *
 * Applies nttunpack128_avx to four consecutive 256-byte blocks. rdi is
 * advanced by 768 in total and is not restored. The symbol is exported
 * through both the cdecl() and _cdecl() macros from cdecl.h, presumably
 * to cover platforms with and without a leading underscore on C symbols;
 * confirm against cdecl.h.
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx):
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
ret
@@ -0,0 +1,25 @@ | |||
# Interleave helpers for pairs of ymm registers. All arguments are ymm
# register numbers: r0 and r1 are the inputs, r2 and r3 the outputs.
# In every macro r2 is written before the inputs are read for the last
# time, so r2 must differ from r0 and r1; r3 may reuse an input register.

# shuffle8: 128-bit granularity.
#   r2 = low half of r0, low half of r1
#   r3 = high half of r0, high half of r1
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm
# shuffle4: 64-bit granularity, separately inside each 128-bit lane.
#   r2 = even quadwords of r0 interleaved with even quadwords of r1
#   r3 = odd quadwords of r0 interleaved with odd quadwords of r1
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm
# shuffle2: 32-bit granularity.
#   r2 = even dwords of r0 interleaved with even dwords of r1
#   r3 = odd dwords of r0 interleaved with odd dwords of r1
# r0 is overwritten (shifted right by 32 bits per quadword).
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
# shuffle1: 16-bit granularity.
#   r2 = even words of r0 interleaved with even words of r1
#   r3 = odd words of r0 interleaved with odd words of r1
# r0 is overwritten (shifted right by 16 bits per dword).
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
@@ -0,0 +1,415 @@ | |||
#include "align.h" | |||
#include "fips202.h" | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "sign.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
/*
 * Expand matrix row i (0..3) from the seed rho and make *row point at it.
 *
 * buf provides two polyvecl slots. Even rows are requested with the slots in
 * the order (buf[0], buf[1]) and are returned in buf[0]; odd rows are
 * requested with the order (buf[1], buf[0]) and are returned in buf[1].
 * What the row expanders store in their second argument is defined by those
 * functions and is not visible here.
 *
 * For any other value of i neither buf nor *row is touched.
 */
static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) {
    polyvecl *const slot0 = &buf[0];
    polyvecl *const slot1 = &buf[1];

    if (i == 0) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(slot0, slot1, rho);
        *row = slot0;
    } else if (i == 1) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(slot1, slot0, rho);
        *row = slot1;
    } else if (i == 2) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(slot0, slot1, rho);
        *row = slot0;
    } else if (i == 3) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(slot1, slot0, rho);
        *row = slot1;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair | |||
* | |||
* Description: Generates public and private key. | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (allocated | |||
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (allocated | |||
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
unsigned int i; | |||
uint8_t seedbuf[3 * SEEDBYTES]; | |||
const uint8_t *rho, *rhoprime, *key; | |||
polyvecl rowbuf[2]; | |||
polyvecl s1, *row = rowbuf; | |||
polyveck s2; | |||
poly t1, t0; | |||
/* Get randomness for rho, rhoprime and key */ | |||
randombytes(seedbuf, SEEDBYTES); | |||
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); | |||
rho = seedbuf; | |||
rhoprime = seedbuf + SEEDBYTES; | |||
key = seedbuf + 2 * SEEDBYTES; | |||
/* Store rho, key */ | |||
memcpy(pk, rho, SEEDBYTES); | |||
memcpy(sk, rho, SEEDBYTES); | |||
memcpy(sk + SEEDBYTES, key, SEEDBYTES); | |||
/* Sample short vectors s1 and s2 */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime, 4, 5, 6, 7); | |||
/* Pack secret vectors */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); | |||
} | |||
for (i = 0; i < K; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); | |||
} | |||
/* Transform s1 */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1); | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
polyvec_matrix_expand_row(&row, rowbuf, rho, i); | |||
/* Compute inner-product */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&t1); | |||
/* Add error polynomial */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&t1, &t1, &s2.vec[i]); | |||
/* Round t and pack t1, t0 */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&t1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&t1, &t0, &t1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); | |||
} | |||
/* Compute CRH(rho, t1) and store in secret key */ | |||
crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature | |||
* | |||
* Description: Computes signature. Candidates are produced in a loop (label | |||
* rej) and discarded whenever one of the norm checks or the | |||
* hint-count check fails; every attempt consumes four nonces. | |||
* The buffer sig is also used as scratch space for the packed | |||
* w1 before its final contents are written. | |||
* | |||
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) | |||
* - size_t *siglen: pointer to output length of signature | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
unsigned int i, n, pos; | |||
/* Layout: rho | tr | key | mu | rhoprime (see the pointer setup below) */
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; | |||
uint8_t *rho, *tr, *key, *mu, *rhoprime; | |||
uint8_t hintbuf[N]; | |||
/* Hint section of the signature: follows the challenge seed and the L packed z polynomials */
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
uint64_t nonce = 0; | |||
polyvecl mat[K], s1, z; | |||
polyveck t0, s2, w1; | |||
poly c, tmp; | |||
/* y is only needed until w1 has been computed; w0 then reuses the same storage */
union { | |||
polyvecl y; | |||
polyveck w0; | |||
} tmpv; | |||
shake256incctx state; | |||
rho = seedbuf; | |||
tr = rho + SEEDBYTES; | |||
key = tr + CRHBYTES; | |||
mu = key + SEEDBYTES; | |||
rhoprime = mu + CRHBYTES; | |||
PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); | |||
/* Compute CRH(tr, msg) */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, tr, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* key and mu are adjacent in seedbuf, so this hashes key || mu into rhoprime */
crh(rhoprime, key, SEEDBYTES + CRHBYTES); | |||
/* Expand matrix and transform vectors */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&s2); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t0); | |||
/* Every rejection below jumps back here and restarts with fresh nonces */
rej: | |||
/* Sample intermediate vector y */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3], | |||
rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); | |||
nonce += 4; | |||
/* Matrix-vector product */ | |||
/* z keeps the untransformed y; the copy in tmpv.y is moved to the NTT domain */
tmpv.y = z; | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&tmpv.y); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(&w1); | |||
/* Decompose w and call the random oracle */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(&w1); | |||
/* From here on tmpv holds w0; tmpv.y is overwritten */
PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); | |||
/* sig is used as scratch space for the packed w1 ... */
PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(sig, &w1); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
/* ... and its first SEEDBYTES bytes are then replaced by the challenge seed */
shake256_inc_squeeze(sig, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c); | |||
/* Compute z, reject if it reveals secret */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&z.vec[i]); | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { | |||
goto rej; | |||
} | |||
} | |||
/* Zero hint vector in signature */ | |||
/* pos counts the hint positions written so far, across all K polynomials */
pos = 0; | |||
memset(hint, 0, OMEGA); | |||
for (i = 0; i < K; i++) { | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmpv.w0.vec[i]); | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
/* Compute hints */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmp); | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmp, GAMMA2)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
/* n is the number of entries poly_make_hint placed in hintbuf for this polynomial */
n = PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); | |||
/* At most OMEGA hint positions fit into the signature */
if (pos + n > OMEGA) { | |||
goto rej; | |||
} | |||
/* Store hints in signature */ | |||
memcpy(&hint[pos], hintbuf, n); | |||
/* hint[OMEGA + i] records the running total after polynomial i */
hint[OMEGA + i] = pos = pos + n; | |||
} | |||
/* Pack z into signature */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); | |||
} | |||
*siglen = PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign | |||
* | |||
* Description: Compute signed message. | |||
* | |||
* Arguments: - uint8_t *sm: pointer to output signed message (allocated | |||
* array with PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen bytes), | |||
* can be equal to m | |||
* - size_t *smlen: pointer to output length of signed | |||
* message | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
size_t i; | |||
for (i = 0; i < mlen; ++i) { | |||
sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, mlen, sk); | |||
*smlen += mlen; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify | |||
* | |||
* Description: Verifies signature. | |||
* | |||
* Arguments: - uint8_t *m: pointer to input signature | |||
* - size_t siglen: length of signature | |||
* - const uint8_t *m: pointer to message | |||
* - size_t mlen: length of message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signature could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
unsigned int i, j, pos = 0; | |||
/* PQCLEAN_DILITHIUM2_AVX2_polyw1_pack writes additional 14 bytes */ | |||
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; | |||
uint8_t mu[CRHBYTES]; | |||
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
polyvecl rowbuf[2]; | |||
polyvecl *row = rowbuf; | |||
polyvecl z; | |||
poly c, w1, h; | |||
shake256incctx state; | |||
if (siglen != PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
/* Compute CRH(CRH(rho, t1), msg) */ | |||
crh(mu, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* Expand PQCLEAN_DILITHIUM2_AVX2_challenge */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c); | |||
/* Unpack z; shortness follows from unpacking */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&z.vec[i]); | |||
} | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
polyvec_matrix_expand_row(&row, rowbuf, pk, i); | |||
/* Compute i-th row of Az - c2^Dt1 */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&h, &c, &h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&w1); | |||
/* Get hint polynomial and reconstruct w1 */ | |||
memset(h.vec, 0, sizeof(poly)); | |||
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { | |||
return -1; | |||
} | |||
for (j = pos; j < hint[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > pos && hint[j] <= hint[j - 1]) { | |||
return -1; | |||
} | |||
h.coeffs[hint[j]] = 1; | |||
} | |||
pos = hint[OMEGA + i]; | |||
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&w1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = pos; j < OMEGA; ++j) { | |||
if (hint[j]) { | |||
return -1; | |||
} | |||
} | |||
/* Call random oracle and verify PQCLEAN_DILITHIUM2_AVX2_challenge */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
if (buf.coeffs[i] != sig[i]) { | |||
return -1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open | |||
* | |||
* Description: Verify signed message. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output message (allocated | |||
* array with smlen bytes), can be equal to sm | |||
* - size_t *mlen: pointer to output length of message | |||
* - const uint8_t *sm: pointer to signed message | |||
* - size_t smlen: length of signed message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { | |||
size_t i; | |||
if (smlen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { | |||
goto badsig; | |||
} | |||
*mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) { | |||
goto badsig; | |||
} else { | |||
/* All good, copy msg, return 0 */ | |||
for (i = 0; i < *mlen; ++i) { | |||
m[i] = sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + i]; | |||
} | |||
return 0; | |||
} | |||
badsig: | |||
/* Signature verification failed */ | |||
*mlen = -1; | |||
for (i = 0; i < smlen; ++i) { | |||
m[i] = 0; | |||
} | |||
return -1; | |||
} |
@@ -0,0 +1,29 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_SIGN_H
#define PQCLEAN_DILITHIUM2_AVX2_SIGN_H

#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* Challenge expansion from a SEEDBYTES seed.
 * NOTE(review): sign.c calls PQCLEAN_DILITHIUM2_AVX2_poly_challenge instead;
 * confirm that this declaration still has a definition. */
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Generate a key pair into pk / sk. Returns 0. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Write a detached signature of m[0..mlen) to sig and its length to *siglen. Returns 0. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(
    uint8_t *sig, size_t *siglen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk);

/* Write signature || message to sm and the total length to *smlen. Returns 0. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(
    uint8_t *sm, size_t *smlen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk);

/* Check a detached signature. Returns 0 if valid, -1 otherwise. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
    const uint8_t *sig, size_t siglen,
    const uint8_t *m, size_t mlen,
    const uint8_t *pk);

/* Check a signed message and recover the message into m. Returns 0 if valid, -1 otherwise. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(
    uint8_t *m, size_t *mlen,
    const uint8_t *sm, size_t smlen,
    const uint8_t *pk);

#endif
@@ -0,0 +1,26 @@ | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/*
 * Initialise an incremental SHAKE128 state, absorb seed (SEEDBYTES bytes)
 * followed by the 16-bit nonce encoded low byte first, and finalize the
 * absorb phase.
 */
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = {
        (uint8_t) nonce,
        (uint8_t) (nonce >> 8)
    };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, 2);
    shake128_inc_finalize(state);
}
/*
 * Initialise an incremental SHAKE256 state, absorb seed (CRHBYTES bytes)
 * followed by the 16-bit nonce encoded low byte first, and finalize the
 * absorb phase.
 */
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = {
        (uint8_t) nonce,
        (uint8_t) (nonce >> 8)
    };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, 2);
    shake256_inc_finalize(state);
}
@@ -0,0 +1,36 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H

#include "fips202.h"
#include "params.h"
#include <stdint.h>

/* Stream state types: thin aliases of the incremental SHAKE contexts */
typedef shake128incctx stream128_state;
typedef shake256incctx stream256_state;

/* Bytes produced per squeezed block */
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

/* Seed-and-nonce initialisers; the states come back finalized */
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(
    shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(
    shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce);

/* CRHBYTES-byte SHAKE256 digest of IN[0..INBYTES) */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)

/* SHAKE128-backed stream */
#define stream128_init(STATE, SEED, NONCE) \
    PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
    shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE)
#define stream128_release(STATE) shake128_inc_ctx_release(STATE)

/* SHAKE256-backed stream */
#define stream256_init(STATE, SEED, NONCE) \
    PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
    shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE)
#define stream256_release(STATE) shake256_inc_ctx_release(STATE)

#endif
@@ -0,0 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -0,0 +1,23 @@ | |||
# This Makefile can be used with Microsoft Visual Studio's nmake using the command: | |||
# nmake /f Makefile.Microsoft_nmake | |||
# Static library produced by the default target.
LIBRARY=libdilithium2_clean.lib | |||
# One object per source file of the clean implementation.
OBJECTS=ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj | |||
# Warning C4146 is raised when a unary minus operator is applied to an | |||
# unsigned type; this has nonetheless been standard and portable for as | |||
# long as there has been a C standard, and we need it for constant-time | |||
# computations. Thus, we disable that spurious warning. | |||
# /W4 /WX: highest warning level with warnings treated as errors; shared
# headers are picked up from ..\..\..\common.
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 | |||
all: $(LIBRARY) | |||
# Make sure objects are recompiled if headers change. | |||
$(OBJECTS): *.h | |||
# Archive the objects; $@ is the target and $** is every dependent.
$(LIBRARY): $(OBJECTS) | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
# The leading '-' lets nmake continue when a file to delete does not exist.
clean: | |||
-DEL $(OBJECTS) | |||
-DEL $(LIBRARY) |
@@ -0,0 +1,31 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_API_H
#define PQCLEAN_DILITHIUM2_CLEAN_API_H

#include <stddef.h>
#include <stdint.h>

/* Sizes in bytes of the public key, secret key and signature */
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1312
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2544
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2420
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2"

/* Key generation */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature: create and check */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen,
        const uint8_t *m, size_t mlen,
        const uint8_t *sk);
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen,
        const uint8_t *m, size_t mlen,
        const uint8_t *pk);

/* Signed message: create and open */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
        const uint8_t *m, size_t mlen,
        const uint8_t *sk);
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
        const uint8_t *sm, size_t smlen,
        const uint8_t *pk);

#endif
@@ -0,0 +1,98 @@ | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/* Constant table used by the two transforms in this file:
 * PQCLEAN_DILITHIUM2_CLEAN_ntt reads entries 1, 2, ..., 255 in increasing
 * order, PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont reads entries 255, ..., 1 in
 * decreasing order and negates each one. Entry 0 is never read by either.
 * NOTE(review): presumably precomputed powers of the root of unity in
 * Montgomery form - confirm against the generator before editing. */
static const int32_t zetas[N] = { | |||
0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, | |||
1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103, | |||
2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549, | |||
-2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005, | |||
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, | |||
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, | |||
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, | |||
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, | |||
-3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221, | |||
-1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922, | |||
3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047, | |||
-671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430, | |||
-3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618, | |||
-3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, | |||
189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330, | |||
1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961, | |||
2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462, | |||
266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378, | |||
900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500, | |||
-655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838, | |||
342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, | |||
2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974, | |||
-3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970, | |||
-1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642, | |||
-1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031, | |||
-542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993, | |||
-2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385, | |||
-3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, | |||
-3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078, | |||
-426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893, | |||
-2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687, | |||
-554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782 | |||
}; | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_ntt | |||
* | |||
* Description: Forward NTT, in-place. No modular reduction is performed after | |||
* additions or subtractions. Output vector is in bitreversed order. | |||
* | |||
* Arguments: - uint32_t p[N]: input/output coefficient array | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]) { | |||
unsigned int len, start, j, k; | |||
int32_t zeta, t; | |||
k = 0; | |||
for (len = 128; len > 0; len >>= 1) { | |||
for (start = 0; start < N; start = j + len) { | |||
zeta = zetas[++k]; | |||
for (j = start; j < start + len; ++j) { | |||
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); | |||
a[j + len] = a[j] - t; | |||
a[j] = a[j] + t; | |||
} | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by Montgomery factor 2^32. | |||
* In-place. No modular reductions after additions or | |||
* subtractions; input coefficients need to be smaller than | |||
* Q in absolute value. Output coefficient are smaller than Q in | |||
* absolute value. | |||
* | |||
* Arguments: - uint32_t p[N]: input/output coefficient array | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]) { | |||
unsigned int start, len, j, k; | |||
int32_t t, zeta; | |||
const int32_t f = 41978; // mont^2/256 | |||
k = 256; | |||
for (len = 1; len < N; len <<= 1) { | |||
for (start = 0; start < N; start = j + len) { | |||
zeta = -zetas[--k]; | |||
for (j = start; j < start + len; ++j) { | |||
t = a[j]; | |||
a[j] = t + a[j + len]; | |||
a[j + len] = t - a[j + len]; | |||
a[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); | |||
} | |||
} | |||
} | |||
for (j = 0; j < N; ++j) { | |||
a[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)f * a[j]); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_NTT_H
#define PQCLEAN_DILITHIUM2_CLEAN_NTT_H

#include "params.h"
#include <stdint.h>

/* In-place forward transform of the N coefficients in a */
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]);

/* In-place inverse transform of a, including the Montgomery factor */
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]);

#endif
@@ -0,0 +1,261 @@ | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_pk | |||
* | |||
* Description: Bit-pack public key pk = (rho, t1). | |||
* | |||
* Arguments: - uint8_t pk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const polyveck *t1: pointer to vector t1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
pk[i] = rho[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_pk | |||
* | |||
* Description: Unpack public key pk = (rho, t1). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const polyveck *t1: pointer to output vector t1 | |||
* - uint8_t pk[]: byte array containing bit-packed pk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = pk[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sk | |||
* | |||
* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - uint8_t sk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const uint8_t tr[]: byte array containing tr | |||
* - const uint8_t key[]: byte array containing key | |||
* - const polyveck *t0: pointer to vector t0 | |||
* - const polyvecl *s1: pointer to vector s1 | |||
* - const polyveck *s2: pointer to vector s2 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = rho[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = key[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
sk[i] = tr[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sk | |||
* | |||
* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const uint8_t tr[]: output byte array for tr | |||
* - const uint8_t key[]: output byte array for key | |||
* - const polyveck *t0: pointer to output vector t0 | |||
* - const polyvecl *s1: pointer to output vector s1 | |||
* - const polyveck *s2: pointer to output vector s2 | |||
* - uint8_t sk[]: byte array containing bit-packed sk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
key[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
tr[i] = sk[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sig | |||
* | |||
* Description: Bit-pack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t sig[]: output byte array | |||
* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM2_CLEAN_challenge hash length SEEDBYTES | |||
* - const polyvecl *z: pointer to vector z | |||
* - const polyveck *h: pointer to hint vector h | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES], | |||
const uint8_t c[SEEDBYTES], | |||
const polyvecl *z, | |||
const polyveck *h) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sig[i] = c[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Encode h */ | |||
for (i = 0; i < OMEGA + K; ++i) { | |||
sig[i] = 0; | |||
} | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
if (h->vec[i].coeffs[j] != 0) { | |||
sig[k++] = (uint8_t) j; | |||
} | |||
} | |||
sig[OMEGA + i] = (uint8_t) k; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sig | |||
* | |||
* Description: Unpack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM2_CLEAN_challenge hash | |||
* - polyvecl *z: pointer to output vector z | |||
* - polyveck *h: pointer to output hint vector h | |||
* - const uint8_t sig[]: byte array containing | |||
* bit-packed signature | |||
* | |||
* Returns 1 in case of malformed signature; otherwise 0. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], | |||
polyvecl *z, | |||
polyveck *h, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
c[i] = sig[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Decode h */ | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
h->vec[i].coeffs[j] = 0; | |||
} | |||
if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { | |||
return 1; | |||
} | |||
for (j = k; j < sig[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > k && sig[j] <= sig[j - 1]) { | |||
return 1; | |||
} | |||
h->vec[i].coeffs[sig[j]] = 1; | |||
} | |||
k = sig[OMEGA + i]; | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = k; j < OMEGA; ++j) { | |||
if (sig[j]) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,31 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PACKING_H
#define PQCLEAN_DILITHIUM2_CLEAN_PACKING_H
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* Packing routines: write key / signature components into byte arrays. */

/* pk <- (rho, t1) */
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(
    uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES],
    const uint8_t rho[SEEDBYTES],
    const polyveck *t1);

/* sk <- (rho, tr, key, t0, s1, s2) */
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(
    uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
    const uint8_t rho[SEEDBYTES],
    const uint8_t tr[CRHBYTES],
    const uint8_t key[SEEDBYTES],
    const polyveck *t0,
    const polyvecl *s1,
    const polyveck *s2);

/* sig <- (c, z, h) */
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(
    uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES],
    const uint8_t c[SEEDBYTES],
    const polyvecl *z,
    const polyveck *h);

/* Unpacking routines: recover the components from byte arrays. */

/* (rho, t1) <- pk */
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(
    uint8_t rho[SEEDBYTES],
    polyveck *t1,
    const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]);

/* (rho, tr, key, t0, s1, s2) <- sk */
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(
    uint8_t rho[SEEDBYTES],
    uint8_t tr[CRHBYTES],
    uint8_t key[SEEDBYTES],
    polyveck *t0,
    polyvecl *s1,
    polyveck *s2,
    const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]);

/* (c, z, h) <- sig; returns 1 if the signature is malformed, 0 otherwise */
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(
    uint8_t c[SEEDBYTES],
    polyvecl *z,
    polyveck *h,
    const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]);

#endif
@@ -0,0 +1,41 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H
#define PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H

/* Byte lengths of seeds / challenge hashes and of the longer hash values */
#define SEEDBYTES 32
#define CRHBYTES 48

/* Polynomial ring: N coefficients per polynomial, arithmetic modulo Q */
#define N 256
#define Q 8380417
#define D 13               /* low bits split off by power2round */
#define ROOT_OF_UNITY 1753

/* Dilithium2 parameter set */
#define K 4                /* number of polynomials in a polyveck */
#define L 4                /* number of polynomials in a polyvecl */
#define ETA 2              /* eta-packed coefficients lie in [-ETA, ETA] */
#define TAU 39             /* nonzero coefficients in the challenge polynomial */
#define BETA 78
#define GAMMA1 (1 << 17)   /* z coefficients lie in [-(GAMMA1 - 1), GAMMA1] */
#define GAMMA2 ((Q-1)/88)
#define OMEGA 80           /* maximum number of hint positions in a signature */

#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2"

/* Packed sizes in bytes of a single polynomial (or of the hint vector) */
#define POLYT1_PACKEDBYTES  320   /* 10 bits per coefficient */
#define POLYT0_PACKEDBYTES  416   /* 13 bits per coefficient */
#define POLYVECH_PACKEDBYTES (OMEGA + K)
#define POLYZ_PACKEDBYTES   576   /* 18 bits per coefficient */
#define POLYW1_PACKEDBYTES  192
#define POLYETA_PACKEDBYTES  96   /* 3 bits per coefficient */

/* Total sizes of public key, secret key and signature */
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES \
    (2*SEEDBYTES + CRHBYTES + L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif
@@ -0,0 +1,867 @@ | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "reduce.h" | |||
#include "rounding.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/* Benchmark hooks, compiled out here: both macros expand to nothing, so the
 * timer argument handed to DBENCH_STOP is never evaluated. */
#define DBENCH_START()
#define DBENCH_STOP(timer)
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_reduce | |||
* | |||
* Description: Inplace reduction of all coefficients of polynomial to | |||
* representative in [-6283009,6283007]. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_caddq | |||
* | |||
* Description: For all coefficients of in/out polynomial add Q if | |||
* coefficient is negative. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_caddq(a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_freeze | |||
* | |||
* Description: Inplace reduction of all coefficients of polynomial to | |||
* standard representatives. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_freeze(a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_add | |||
* | |||
* Description: Add polynomials. No modular reduction is performed. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first summand | |||
* - const poly *b: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; | |||
} | |||
DBENCH_STOP(*tadd); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_sub | |||
* | |||
* Description: Subtract polynomials. No modular reduction is | |||
* performed. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial to be | |||
* subtraced from first input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; | |||
} | |||
DBENCH_STOP(*tadd); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl | |||
* | |||
* Description: Multiply polynomial by 2^D without modular reduction. Assumes | |||
* input coefficients to be less than 2^{31-D} in absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] <<= D; | |||
} | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_ntt | |||
* | |||
* Description: Inplace forward NTT. Coefficients can grow by | |||
* 8*Q in absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2_CLEAN_ntt(a->coeffs); | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont | |||
* | |||
* Description: Inplace inverse NTT and multiplication by 2^{32}. | |||
* Input coefficients need to be less than Q in absolute | |||
* value and output coefficients are again bounded by Q. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(a->coeffs); | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery | |||
* | |||
* Description: Pointwise multiplication of polynomials in NTT domain | |||
* representation and multiplication of resulting polynomial | |||
* by 2^{-32}. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_power2round | |||
* | |||
* Description: For all coefficients c of the input polynomial, | |||
* compute c0, c1 such that c mod Q = c1*2^D + c0 | |||
* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 | |||
* - poly *a0: pointer to output polynomial with coefficients c0 | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_decompose | |||
* | |||
* Description: For all coefficients c of the input polynomial, | |||
* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 | |||
* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we | |||
* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
* | |||
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 | |||
* - poly *a0: pointer to output polynomial with coefficients c0 | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint | |||
* | |||
* Description: Compute hint polynomial. The coefficients of which indicate | |||
* whether the low bits of the corresponding coefficient of | |||
* the input polynomial overflow into the high bits. | |||
* | |||
* Arguments: - poly *h: pointer to output hint polynomial | |||
* - const poly *a0: pointer to low part of input polynomial | |||
* - const poly *a1: pointer to high part of input polynomial | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { | |||
unsigned int i, s = 0; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
h->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); | |||
s += h->coeffs[i]; | |||
} | |||
DBENCH_STOP(*tround); | |||
return s; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint | |||
* | |||
* Description: Use hint polynomial to correct the high bits of a polynomial. | |||
* | |||
* Arguments: - poly *b: pointer to output polynomial with corrected high bits | |||
* - const poly *a: pointer to input polynomial | |||
* - const poly *h: pointer to input hint polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
b->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm | |||
* | |||
* Description: Check infinity norm of polynomial against given bound. | |||
* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM2_CLEAN_reduce32(). | |||
* | |||
* Arguments: - const poly *a: pointer to polynomial | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B) { | |||
unsigned int i; | |||
int32_t t; | |||
DBENCH_START(); | |||
if (B > (Q - 1) / 8) { | |||
return 1; | |||
} | |||
/* It is ok to leak which coefficient violates the bound since | |||
the probability for each coefficient is independent of secret | |||
data but we must not leak the sign of the centralized representative. */ | |||
for (i = 0; i < N; ++i) { | |||
/* Absolute value */ | |||
t = a->coeffs[i] >> 31; | |||
t = a->coeffs[i] - (t & 2 * a->coeffs[i]); | |||
if (t >= B) { | |||
DBENCH_STOP(*tsample); | |||
return 1; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: rej_uniform | |||
* | |||
* Description: Sample uniformly random coefficients in [0, Q-1] by | |||
* performing rejection sampling on array of random bytes. | |||
* | |||
* Arguments: - int32_t *a: pointer to output array (allocated) | |||
* - unsigned int len: number of coefficients to be sampled | |||
* - const uint8_t *buf: array of random bytes | |||
* - unsigned int buflen: length of array of random bytes | |||
* | |||
* Returns number of sampled coefficients. Can be smaller than len if not enough | |||
* random bytes were given. | |||
**************************************************/ | |||
static unsigned int rej_uniform(int32_t *a, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint32_t t; | |||
DBENCH_START(); | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 3 <= buflen) { | |||
t = buf[pos++]; | |||
t |= (uint32_t)buf[pos++] << 8; | |||
t |= (uint32_t)buf[pos++] << 16; | |||
t &= 0x7FFFFF; | |||
if (t < Q) { | |||
a[ctr++] = t; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return ctr; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_uniform | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [0,Q-1] by performing rejection sampling on the | |||
* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES | |||
* - uint16_t nonce: 2-byte nonce | |||
**************************************************/ | |||
#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce) { | |||
unsigned int i, ctr, off; | |||
unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; | |||
uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; | |||
stream128_state state; | |||
stream128_init(&state, seed, nonce); | |||
stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); | |||
ctr = rej_uniform(a->coeffs, N, buf, buflen); | |||
while (ctr < N) { | |||
off = buflen % 3; | |||
for (i = 0; i < off; ++i) { | |||
buf[i] = buf[buflen - off + i]; | |||
} | |||
stream128_squeezeblocks(buf + off, 1, &state); | |||
buflen = STREAM128_BLOCKBYTES + off; | |||
ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); | |||
} | |||
stream128_release(&state); | |||
} | |||
/************************************************* | |||
* Name: rej_eta | |||
* | |||
* Description: Sample uniformly random coefficients in [-ETA, ETA] by | |||
* performing rejection sampling on array of random bytes. | |||
* | |||
* Arguments: - int32_t *a: pointer to output array (allocated) | |||
* - unsigned int len: number of coefficients to be sampled | |||
* - const uint8_t *buf: array of random bytes | |||
* - unsigned int buflen: length of array of random bytes | |||
* | |||
* Returns number of sampled coefficients. Can be smaller than len if not enough | |||
* random bytes were given. | |||
**************************************************/ | |||
static unsigned int rej_eta(int32_t *a, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint32_t t0, t1; | |||
DBENCH_START(); | |||
ctr = pos = 0; | |||
while (ctr < len && pos < buflen) { | |||
t0 = buf[pos] & 0x0F; | |||
t1 = buf[pos++] >> 4; | |||
if (t0 < 15) { | |||
t0 = t0 - (205 * t0 >> 10) * 5; | |||
a[ctr++] = 2 - t0; | |||
} | |||
if (t1 < 15 && ctr < len) { | |||
t1 = t1 - (205 * t1 >> 10) * 5; | |||
a[ctr++] = 2 - t1; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return ctr; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [-ETA,ETA] by performing rejection sampling on the | |||
* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES | |||
* - uint16_t nonce: 2-byte nonce | |||
**************************************************/ | |||
#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce) { | |||
unsigned int ctr; | |||
unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; | |||
uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; | |||
stream128_state state; | |||
stream128_init(&state, seed, nonce); | |||
stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); | |||
ctr = rej_eta(a->coeffs, N, buf, buflen); | |||
while (ctr < N) { | |||
stream128_squeezeblocks(buf, 1, &state); | |||
ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); | |||
} | |||
stream128_release(&state); | |||
} | |||
/************************************************* | |||
* Name: poly_uniform_gamma1m1 | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream | |||
* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length CRHBYTES | |||
* - uint16_t nonce: 16-bit nonce | |||
**************************************************/ | |||
#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce) { | |||
uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES]; | |||
stream256_state state; | |||
stream256_init(&state, seed, nonce); | |||
stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); | |||
stream256_release(&state); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(a, buf); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_challenge | |||
* | |||
* Description: Implementation of H. Samples polynomial with TAU nonzero | |||
* coefficients in {-1,1} using the output stream of | |||
* SHAKE256(seed). | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const uint8_t mu[]: byte array containing seed of length SEEDBYTES | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { | |||
unsigned int i, b, pos; | |||
uint64_t signs; | |||
uint8_t buf[SHAKE256_RATE]; | |||
shake256incctx state; | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, seed, SEEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(buf, sizeof buf, &state); | |||
signs = 0; | |||
for (i = 0; i < 8; ++i) { | |||
signs |= (uint64_t)buf[i] << 8 * i; | |||
} | |||
pos = 8; | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = 0; | |||
} | |||
for (i = N - TAU; i < N; ++i) { | |||
do { | |||
if (pos >= SHAKE256_RATE) { | |||
shake256_inc_squeeze(buf, sizeof buf, &state); | |||
pos = 0; | |||
} | |||
b = buf[pos++]; | |||
} while (b > i); | |||
c->coeffs[i] = c->coeffs[b]; | |||
c->coeffs[b] = 1 - 2 * (signs & 1); | |||
signs >>= 1; | |||
} | |||
shake256_inc_ctx_release(&state); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack | |||
* | |||
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYETA_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
uint8_t t[8]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
t[0] = (uint8_t) (ETA - a->coeffs[8 * i + 0]); | |||
t[1] = (uint8_t) (ETA - a->coeffs[8 * i + 1]); | |||
t[2] = (uint8_t) (ETA - a->coeffs[8 * i + 2]); | |||
t[3] = (uint8_t) (ETA - a->coeffs[8 * i + 3]); | |||
t[4] = (uint8_t) (ETA - a->coeffs[8 * i + 4]); | |||
t[5] = (uint8_t) (ETA - a->coeffs[8 * i + 5]); | |||
t[6] = (uint8_t) (ETA - a->coeffs[8 * i + 6]); | |||
t[7] = (uint8_t) (ETA - a->coeffs[8 * i + 7]); | |||
r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); | |||
r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); | |||
r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack | |||
* | |||
* Description: Unpack polynomial with coefficients in [-ETA,ETA]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; | |||
r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; | |||
r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; | |||
r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; | |||
r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; | |||
r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; | |||
r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; | |||
r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; | |||
r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; | |||
r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; | |||
r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; | |||
r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; | |||
r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; | |||
r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; | |||
r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; | |||
r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack | |||
* | |||
* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. | |||
* Input coefficients are assumed to be standard representatives. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYT1_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); | |||
r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); | |||
r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); | |||
r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); | |||
r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack | |||
* | |||
* Description: Unpack polynomial t1 with 10-bit coefficients. | |||
* Output coefficients are standard representatives. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; | |||
r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; | |||
r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; | |||
r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack | |||
* | |||
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYT0_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
uint32_t t[8]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; | |||
t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; | |||
t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; | |||
t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; | |||
t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; | |||
t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; | |||
t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; | |||
t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; | |||
r[13 * i + 0] = (uint8_t) t[0]; | |||
r[13 * i + 1] = (uint8_t) (t[0] >> 8); | |||
r[13 * i + 1] |= (uint8_t) (t[1] << 5); | |||
r[13 * i + 2] = (uint8_t) (t[1] >> 3); | |||
r[13 * i + 3] = (uint8_t) (t[1] >> 11); | |||
r[13 * i + 3] |= (uint8_t) (t[2] << 2); | |||
r[13 * i + 4] = (uint8_t) (t[2] >> 6); | |||
r[13 * i + 4] |= (uint8_t) (t[3] << 7); | |||
r[13 * i + 5] = (uint8_t) (t[3] >> 1); | |||
r[13 * i + 6] = (uint8_t) (t[3] >> 9); | |||
r[13 * i + 6] |= (uint8_t) (t[4] << 4); | |||
r[13 * i + 7] = (uint8_t) (t[4] >> 4); | |||
r[13 * i + 8] = (uint8_t) (t[4] >> 12); | |||
r[13 * i + 8] |= (uint8_t) (t[5] << 1); | |||
r[13 * i + 9] = (uint8_t) (t[5] >> 7); | |||
r[13 * i + 9] |= (uint8_t) (t[6] << 6); | |||
r[13 * i + 10] = (uint8_t) (t[6] >> 2); | |||
r[13 * i + 11] = (uint8_t) (t[6] >> 10); | |||
r[13 * i + 11] |= (uint8_t) (t[7] << 3); | |||
r[13 * i + 12] = (uint8_t) (t[7] >> 5); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack | |||
* | |||
* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
r->coeffs[8 * i + 0] = a[13 * i + 0]; | |||
r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; | |||
r->coeffs[8 * i + 0] &= 0x1FFF; | |||
r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; | |||
r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; | |||
r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; | |||
r->coeffs[8 * i + 1] &= 0x1FFF; | |||
r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; | |||
r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; | |||
r->coeffs[8 * i + 2] &= 0x1FFF; | |||
r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; | |||
r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; | |||
r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; | |||
r->coeffs[8 * i + 3] &= 0x1FFF; | |||
r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; | |||
r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; | |||
r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; | |||
r->coeffs[8 * i + 4] &= 0x1FFF; | |||
r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; | |||
r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; | |||
r->coeffs[8 * i + 5] &= 0x1FFF; | |||
r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; | |||
r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; | |||
r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; | |||
r->coeffs[8 * i + 6] &= 0x1FFF; | |||
r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; | |||
r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; | |||
r->coeffs[8 * i + 7] &= 0x1FFF; | |||
r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; | |||
r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; | |||
r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; | |||
r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; | |||
r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; | |||
r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; | |||
r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; | |||
r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyz_pack | |||
* | |||
* Description: Bit-pack polynomial with coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYZ_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
uint32_t t[4]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
t[0] = GAMMA1 - a->coeffs[4 * i + 0]; | |||
t[1] = GAMMA1 - a->coeffs[4 * i + 1]; | |||
t[2] = GAMMA1 - a->coeffs[4 * i + 2]; | |||
t[3] = GAMMA1 - a->coeffs[4 * i + 3]; | |||
r[9 * i + 0] = (uint8_t) t[0]; | |||
r[9 * i + 1] = (uint8_t) (t[0] >> 8); | |||
r[9 * i + 2] = (uint8_t) (t[0] >> 16); | |||
r[9 * i + 2] |= (uint8_t) (t[1] << 2); | |||
r[9 * i + 3] = (uint8_t) (t[1] >> 6); | |||
r[9 * i + 4] = (uint8_t) (t[1] >> 14); | |||
r[9 * i + 4] |= (uint8_t) (t[2] << 4); | |||
r[9 * i + 5] = (uint8_t) (t[2] >> 4); | |||
r[9 * i + 6] = (uint8_t) (t[2] >> 12); | |||
r[9 * i + 6] |= (uint8_t) (t[3] << 6); | |||
r[9 * i + 7] = (uint8_t) (t[3] >> 2); | |||
r[9 * i + 8] = (uint8_t) (t[3] >> 10); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack | |||
* | |||
* Description: Unpack polynomial z with coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r->coeffs[4 * i + 0] = a[9 * i + 0]; | |||
r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; | |||
r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; | |||
r->coeffs[4 * i + 0] &= 0x3FFFF; | |||
r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; | |||
r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; | |||
r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; | |||
r->coeffs[4 * i + 1] &= 0x3FFFF; | |||
r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; | |||
r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; | |||
r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; | |||
r->coeffs[4 * i + 2] &= 0x3FFFF; | |||
r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; | |||
r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; | |||
r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; | |||
r->coeffs[4 * i + 3] &= 0x3FFFF; | |||
r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0]; | |||
r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1]; | |||
r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2]; | |||
r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack | |||
* | |||
* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. | |||
* Input coefficients are assumed to be standard representatives. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYW1_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r[3 * i + 0] = (uint8_t) a->coeffs[4 * i + 0]; | |||
r[3 * i + 0] |= (uint8_t) (a->coeffs[4 * i + 1] << 6); | |||
r[3 * i + 1] = (uint8_t) (a->coeffs[4 * i + 1] >> 2); | |||
r[3 * i + 1] |= (uint8_t) (a->coeffs[4 * i + 2] << 4); | |||
r[3 * i + 2] = (uint8_t) (a->coeffs[4 * i + 2] >> 4); | |||
r[3 * i + 2] |= (uint8_t) (a->coeffs[4 * i + 3] << 2); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} |
@@ -0,0 +1,53 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLY_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLY_H
#include "params.h"
#include <stdint.h>

/* Polynomial represented by N signed 32-bit coefficients. */
typedef struct {
    int32_t coeffs[N];
} poly;

/* Coefficient reduction. */
void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a);

/* Coefficient-wise arithmetic. */
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a);

/* NTT-domain operations. */
void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

/* Rounding and hints. */
void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h);

/* Norm check. */
int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B);

/* Sampling from a seed (and, where present, a 16-bit nonce). */
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Bit-packing: *_pack writes bytes to r, *_unpack fills the polynomial r. */
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a);

#endif
@@ -0,0 +1,448 @@ | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: expand_mat | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|j|i) | |||
* or AES256CTR(rho,j|i). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { | |||
unsigned int i, j; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < L; ++j) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); | |||
} | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery
*
* Description: Matrix-vector product: entry i of t is the result of
*              PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery
*              applied to row i of mat and the vector v.
*
* Arguments:   - polyveck *t: pointer to output vector
*              - const polyvecl mat[K]: input matrix (K rows of length L)
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    for (unsigned int row = 0; row < K; ++row) {
        poly *dst = &t->vec[row];
        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(dst, &mat[row], v);
    }
}
/**************************************************************/ | |||
/************ Vectors of polynomials of length L **************/ | |||
/**************************************************************/ | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta
*
* Description: Sample every polynomial of a length-L vector with
*              PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta. Polynomial k uses
*              the nonce (nonce + k) reduced to 16 bits.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: byte array containing the seed
*              - uint16_t nonce: nonce of the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < L; ++k) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[k], seed, (uint16_t) (nonce + k));
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1
*
* Description: Sample every polynomial of a length-L vector with
*              PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1. Polynomial k
*              uses the nonce (L * nonce + k) reduced to 16 bits.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: byte array containing the seed
*              - uint16_t nonce: vector-level nonce
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < L; ++k) {
        const uint16_t poly_nonce = (uint16_t) (L * nonce + k);
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(&v->vec[k], seed, poly_nonce);
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_poly_reduce to each of the
*              L polynomials of the vector.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v) {
    for (unsigned int k = 0; k < L; ++k) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(v->vec + k);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length L | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add | |||
* | |||
* Description: Add vectors of polynomials of length L. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyvecl *w: pointer to output vector | |||
* - const polyvecl *u: pointer to first summand | |||
* - const polyvecl *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length L. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont in place
*              to each of the L polynomials of the vector.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v) {
    for (unsigned int k = 0; k < L; ++k) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(v->vec + k);
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery
*
* Description: Multiply each of the L polynomials of v by the single
*              polynomial a using
*              PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery.
*
* Arguments:   - polyvecl *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    for (unsigned int k = 0; k < L; ++k) {
        poly *dst = &r->vec[k];
        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(dst, a, &v->vec[k]);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply vectors of polynomials of length L, multiply | |||
* resulting vector by 2^{-32} and add (accumulate) polynomials | |||
* in it. Input/output vectors are in NTT domain representation. | |||
* | |||
* Arguments: - poly *w: output polynomial | |||
* - const polyvecl *u: pointer to first input vector | |||
* - const polyvecl *v: pointer to second input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v) { | |||
unsigned int i; | |||
poly t; | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); | |||
for (i = 1; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_add(w, w, &t); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length L. | |||
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(). | |||
* | |||
* Arguments: - const polyvecl *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/**************************************************************/ | |||
/************ Vectors of polynomials of length K **************/ | |||
/**************************************************************/ | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta
*
* Description: Sample every polynomial of a length-K vector with
*              PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta. Polynomial k uses
*              the nonce (nonce + k) reduced to 16 bits.
*
* Arguments:   - polyveck *v: pointer to output vector
*              - const uint8_t seed[]: byte array containing the seed
*              - uint16_t nonce: nonce of the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < K; ++k) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[k], seed, (uint16_t) (nonce + k));
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to representatives in [-6283009,6283007]. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq | |||
* | |||
* Description: For all coefficients of polynomials in vector of length K | |||
* add Q if coefficient is negative. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_add | |||
* | |||
* Description: Add vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first summand | |||
* - const polyveck *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub | |||
* | |||
* Description: Subtract vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first input vector | |||
* - const polyveck *v: pointer to second input vector to be | |||
* subtracted from first input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl | |||
* | |||
* Description: Multiply vector of polynomials of Length K by 2^D without modular | |||
* reduction. Assumes input coefficients to be less than 2^{31-D}. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length K. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by 2^{32} of polynomials | |||
* in vector of length K. Input coefficients need to be less | |||
* than 2*Q. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery
*
* Description: Multiply each of the K polynomials of v by the single
*              polynomial a using
*              PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery.
*
* Arguments:   - polyveck *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    for (unsigned int k = 0; k < K; ++k) {
        poly *dst = &r->vec[k];
        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(dst, a, &v->vec[k]);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length K. | |||
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(). | |||
* | |||
* Arguments: - const polyveck *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 | |||
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 | |||
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we | |||
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint | |||
* | |||
* Description: Compute hint vector. | |||
* | |||
* Arguments: - polyveck *h: pointer to output vector | |||
* - const polyveck *v0: pointer to low part of input vector | |||
* - const polyveck *v1: pointer to high part of input vector | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1) { | |||
unsigned int i, s = 0; | |||
for (i = 0; i < K; ++i) { | |||
s += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); | |||
} | |||
return s; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint | |||
* | |||
* Description: Use hint vector to correct the high bits of input vector. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector of polynomials with | |||
* corrected high bits | |||
* - const polyveck *u: pointer to input vector | |||
* - const polyveck *h: pointer to input hint vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); | |||
} | |||
} |
@@ -0,0 +1,68 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H
#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
    poly vec[L];
} polyvecl;

/* Sampling: one polynomial per entry, each with its own 16-bit nonce. */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);

/* In-place coefficient reduction of every polynomial. */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v);

/* w = u + v, no modular reduction. */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

/* NTT-domain operations. */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
        const polyvecl *u,
        const polyvecl *v);

/* Returns 1 as soon as one polynomial fails the norm check, 0 otherwise.
 * The parameter is named `bound` to match the definition. */
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound);

/* Vectors of polynomials of length K */
typedef struct {
    poly vec[K];
} polyveck;

/* Sampling: one polynomial per entry, each with its own 16-bit nonce. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* In-place coefficient reduction of every polynomial. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v);

/* Component-wise arithmetic, no modular reduction. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v);

/* NTT-domain operations. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

/* Returns 1 as soon as one polynomial fails the norm check, 0 otherwise.
 * The parameter is named `bound` to match the definition. */
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound);

/* Rounding and hints, applied polynomial by polynomial. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h,
        const polyveck *v0,
        const polyveck *v1);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

/* Serializes the K polynomials of w1 back to back into r. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

/* Matrix expansion from seed rho and matrix-vector product. */
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif
@@ -0,0 +1,69 @@ | |||
#include "params.h" | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce | |||
* | |||
* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, | |||
* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. | |||
* | |||
* Arguments: - int64_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a) { | |||
int32_t t; | |||
t = (int32_t)((uint64_t)a * (uint64_t)QINV); | |||
t = (a - (int64_t)t * Q) >> 32; | |||
return t; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_reduce32 | |||
* | |||
* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, | |||
* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. | |||
* | |||
* Arguments: - int32_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a) { | |||
int32_t t; | |||
t = (a + (1 << 22)) >> 23; | |||
t = a - t * Q; | |||
return t; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_caddq | |||
* | |||
* Description: Add Q if input coefficient is negative. | |||
* | |||
* Arguments: - int32_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a) { | |||
a += (a >> 31) & Q; | |||
return a; | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_freeze
*
* Description: For finite field element a, compute standard
*              representative r = a mod^+ Q. This is
*              PQCLEAN_DILITHIUM2_CLEAN_reduce32 followed by
*              PQCLEAN_DILITHIUM2_CLEAN_caddq.
*
* Arguments:   - int32_t: finite field element a
*
* Returns r.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a) {
    return PQCLEAN_DILITHIUM2_CLEAN_caddq(PQCLEAN_DILITHIUM2_CLEAN_reduce32(a));
}
@@ -0,0 +1,17 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H
#define PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H
#include "params.h"
#include <stdint.h>

/* 2^32 % Q */
#define MONT (-4186625)
/* q^(-1) mod 2^32 */
#define QINV 58728449

/* r = a * 2^{-32} (mod Q) with -Q < r < Q. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a);

/* r = a (mod Q) with -6283009 <= r <= 6283007. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a);

/* Adds Q if a is negative. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a);

/* Standard representative r = a mod^+ Q. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a);

#endif
@@ -0,0 +1,98 @@ | |||
#include "params.h" | |||
#include "rounding.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_power2round | |||
* | |||
* Description: For finite field element a, compute a0, a1 such that | |||
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be standard representative. | |||
* | |||
* Arguments: - int32_t a: input element | |||
* - int32_t *a0: pointer to output element a0 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a) { | |||
int32_t a1; | |||
a1 = (a + (1 << (D - 1)) - 1) >> D; | |||
*a0 = a - (a1 << D); | |||
return a1; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_decompose | |||
* | |||
* Description: For finite field element a, compute high and low bits a0, a1 such | |||
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and | |||
* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard | |||
* representative. | |||
* | |||
* Arguments: - int32_t a: input element | |||
* - int32_t *a0: pointer to output element a0 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a) { | |||
int32_t a1; | |||
a1 = (a + 127) >> 7; | |||
a1 = (a1 * 11275 + (1 << 23)) >> 24; | |||
a1 ^= ((43 - a1) >> 31) & a1; | |||
*a0 = a - a1 * 2 * GAMMA2; | |||
*a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; | |||
return a1; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_make_hint | |||
* | |||
* Description: Compute hint bit indicating whether the low bits of the | |||
* input element overflow into the high bits. | |||
* | |||
* Arguments: - int32_t a0: low bits of input element | |||
* - int32_t a1: high bits of input element | |||
* | |||
* Returns 1 if overflow. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1) { | |||
if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) { | |||
return 1; | |||
} | |||
return 0; | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_use_hint
*
* Description: Correct high bits according to hint. With a zero hint the
*              high bits from PQCLEAN_DILITHIUM2_CLEAN_decompose are
*              returned unchanged; otherwise they are moved up by one when
*              the low part is positive and down by one when it is not,
*              wrapping around within the range 0..43.
*
* Arguments:   - int32_t a: input element
*              - unsigned int hint: hint bit
*
* Returns corrected high bits.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint) {
    int32_t low, high;

    high = PQCLEAN_DILITHIUM2_CLEAN_decompose(&low, a);
    if (hint != 0) {
        if (low > 0) {
            high = (high == 43) ? 0 : high + 1;
        } else {
            high = (high == 0) ? 43 : high - 1;
        }
    }

    return high;
}
@@ -0,0 +1,14 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H
#define PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H
#include "params.h"
#include <stdint.h>

/* Splits a into a1*2^D + a0; stores a0 through the pointer, returns a1. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a);

/* Splits a into high and low bits; stores a0 through the pointer, returns a1. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a);

/* Returns 1 if the low bits a0 overflow into the high bits a1, else 0. */
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1);

/* Returns the high bits of a, corrected according to the hint bit. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint);

#endif
@@ -0,0 +1,343 @@ | |||
#include "fips202.h" | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "sign.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair | |||
* | |||
* Description: Generates public and private key. | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (allocated | |||
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (allocated | |||
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
uint8_t seedbuf[3 * SEEDBYTES]; | |||
uint8_t tr[CRHBYTES]; | |||
const uint8_t *rho, *rhoprime, *key; | |||
polyvecl mat[K]; | |||
polyvecl s1, s1hat; | |||
polyveck s2, t1, t0; | |||
/* Get randomness for rho, rhoprime and key */ | |||
randombytes(seedbuf, SEEDBYTES); | |||
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); | |||
rho = seedbuf; | |||
rhoprime = seedbuf + SEEDBYTES; | |||
key = seedbuf + 2 * SEEDBYTES; | |||
/* Expand matrix */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); | |||
/* Sample short vectors s1 and s2 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L); | |||
/* Matrix-vector multiplication */ | |||
s1hat = s1; | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1hat); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&t1); | |||
/* Add error vector s2 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t1, &t1, &s2); | |||
/* Extract t1 and write public key */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_pk(pk, rho, &t1); | |||
/* Compute CRH(rho, t1) and write secret key */ | |||
crh(tr, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature | |||
* | |||
* Description: Computes signature. | |||
* | |||
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) | |||
* - size_t *siglen: pointer to output length of signature | |||
* - uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, | |||
size_t *siglen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *sk) { | |||
unsigned int n; | |||
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; | |||
uint8_t *rho, *tr, *key, *mu, *rhoprime; | |||
uint16_t nonce = 0; | |||
polyvecl mat[K], s1, y, z; | |||
polyveck t0, s2, w1, w0, h; | |||
poly cp; | |||
shake256incctx state; | |||
rho = seedbuf; | |||
tr = rho + SEEDBYTES; | |||
key = tr + CRHBYTES; | |||
mu = key + SEEDBYTES; | |||
rhoprime = mu + CRHBYTES; | |||
PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); | |||
/* Compute CRH(tr, msg) */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, tr, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
crh(rhoprime, key, SEEDBYTES + CRHBYTES); | |||
/* Expand matrix and transform vectors */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&s2); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t0); | |||
rej: | |||
/* Sample intermediate vector y */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); | |||
/* Matrix-vector multiplication */ | |||
z = y; | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1); | |||
/* Decompose w and call the random oracle */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(sig, &w1); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(sig, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, sig); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp); | |||
/* Compute z, reject if it reveals secret */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(&z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(&z, &z, &y); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(&z); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
goto rej; | |||
} | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w0); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
/* Compute hints for w1 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&h); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&h, GAMMA2)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &h); | |||
n = PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(&h, &w0, &w1); | |||
if (n > OMEGA) { | |||
goto rej; | |||
} | |||
/* Write signature */ | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, sig, &z, &h); | |||
*siglen = PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign | |||
* | |||
* Description: Compute signed message. | |||
* | |||
* Arguments: - uint8_t *sm: pointer to output signed message (allocated | |||
* array with PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen bytes), | |||
* can be equal to m | |||
* - size_t *smlen: pointer to output length of signed | |||
* message | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, | |||
size_t *smlen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *sk) { | |||
size_t i; | |||
for (i = 0; i < mlen; ++i) { | |||
sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, mlen, sk); | |||
*smlen += mlen; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify | |||
* | |||
* Description: Verifies signature. | |||
* | |||
* Arguments: - uint8_t *m: pointer to input signature | |||
* - size_t siglen: length of signature | |||
* - const uint8_t *m: pointer to message | |||
* - size_t mlen: length of message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signature could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, | |||
size_t siglen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *pk) { | |||
unsigned int i; | |||
uint8_t buf[K * POLYW1_PACKEDBYTES]; | |||
uint8_t rho[SEEDBYTES]; | |||
uint8_t mu[CRHBYTES]; | |||
uint8_t c[SEEDBYTES]; | |||
uint8_t c2[SEEDBYTES]; | |||
poly cp; | |||
polyvecl mat[K], z; | |||
polyveck t1, w1, h; | |||
shake256incctx state; | |||
if (siglen != PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(rho, &t1, pk); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(c, &z, &h, sig)) { | |||
return -1; | |||
} | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
return -1; | |||
} | |||
/* Compute CRH(CRH(rho, t1), msg) */ | |||
crh(mu, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* Matrix-vector multiplication; compute Az - c2^dt1 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, c); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(&t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w1, &w1, &t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1); | |||
/* Reconstruct w1 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(buf, &w1); | |||
/* Call random oracle and verify PQCLEAN_DILITHIUM2_CLEAN_challenge */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(c2, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
if (c[i] != c2[i]) { | |||
return -1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open | |||
* | |||
* Description: Verify signed message. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output message (allocated | |||
* array with smlen bytes), can be equal to sm | |||
* - size_t *mlen: pointer to output length of message | |||
* - const uint8_t *sm: pointer to signed message | |||
* - size_t smlen: length of signed message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, | |||
size_t *mlen, | |||
const uint8_t *sm, | |||
size_t smlen, | |||
const uint8_t *pk) { | |||
size_t i; | |||
if (smlen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { | |||
goto badsig; | |||
} | |||
*mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) { | |||
goto badsig; | |||
} else { | |||
/* All good, copy msg, return 0 */ | |||
for (i = 0; i < *mlen; ++i) { | |||
m[i] = sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + i]; | |||
} | |||
return 0; | |||
} | |||
badsig: | |||
/* Signature verification failed */ | |||
*mlen = (size_t) -1; | |||
for (i = 0; i < smlen; ++i) { | |||
m[i] = 0; | |||
} | |||
return -1; | |||
} |
@@ -0,0 +1,29 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SIGN_H
#define PQCLEAN_DILITHIUM2_CLEAN_SIGN_H

#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* NOTE(review): sign.c does not define this function; it calls
 * PQCLEAN_DILITHIUM2_CLEAN_poly_challenge instead. Confirm whether this
 * declaration is still needed. */
void PQCLEAN_DILITHIUM2_CLEAN_challenge(
    poly *c,
    const uint8_t seed[SEEDBYTES]);

/* Generates a key pair into pk and sk. Returns 0. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(
    uint8_t *pk,
    uint8_t *sk);

/* Writes a detached signature of m to sig and its length to *siglen.
 * Returns 0. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
    uint8_t *sig, size_t *siglen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk);

/* Writes signature followed by message to sm and the total length to
 * *smlen. Returns 0. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(
    uint8_t *sm, size_t *smlen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk);

/* Returns 0 if sig is a valid signature of m under pk, -1 otherwise. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
    const uint8_t *sig, size_t siglen,
    const uint8_t *m, size_t mlen,
    const uint8_t *pk);

/* Verifies the signed message sm; on success copies the message to m and
 * returns 0, otherwise returns -1. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(
    uint8_t *m, size_t *mlen,
    const uint8_t *sm, size_t smlen,
    const uint8_t *pk);

#endif
@@ -0,0 +1,26 @@ | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/* Starts a SHAKE128 stream: absorbs the SEEDBYTES-byte seed followed by the
 * 16-bit nonce (low byte first), then finalizes so the state can be squeezed. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, 2);
    shake128_inc_finalize(state);
}
/* Starts a SHAKE256 stream: absorbs the CRHBYTES-byte seed followed by the
 * 16-bit nonce (low byte first), then finalizes so the state can be squeezed. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, 2);
    shake256_inc_finalize(state);
}
@@ -0,0 +1,36 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H

#include "fips202.h"
#include "params.h"
#include <stdint.h>

/* Stream state types: the incremental SHAKE contexts are used directly. */
typedef shake128incctx stream128_state;
typedef shake256incctx stream256_state;

/* Absorb seed and nonce into a fresh SHAKE128 state and finalize it. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(
    shake128incctx *state,
    const uint8_t seed[SEEDBYTES],
    uint16_t nonce);

/* Absorb seed and nonce into a fresh SHAKE256 state and finalize it. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(
    shake256incctx *state,
    const uint8_t seed[CRHBYTES],
    uint16_t nonce);

/* Bytes produced per squeezed block of each stream. */
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

/* CRHBYTES-byte SHAKE256 hash of INBYTES bytes at IN, written to OUT. */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)

/* SHAKE128-based stream: init, squeeze whole blocks, release. */
#define stream128_init(STATE, SEED, NONCE) \
    PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
    shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE)
#define stream128_release(STATE) shake128_inc_ctx_release(STATE)

/* SHAKE256-based stream: init, squeeze whole blocks, release. */
#define stream256_init(STATE, SEED, NONCE) \
    PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
    shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE)
#define stream256_release(STATE) shake256_inc_ctx_release(STATE)

#endif
@@ -0,0 +1,31 @@ | |||
name: Dilithium3 | |||
type: signature | |||
claimed-nist-level: 3 | |||
length-public-key: 1952 | |||
length-secret-key: 4016 | |||
length-signature: 3293 | |||
nistkat-sha256: d0d4bb6945e14206d17b52f8a395d5a750ec8a73f2ea06b9f1cd226d225a9bfb | |||
testvectors-sha256: 531b85dbecaeaf135ad9004c8e2d5ce163b8e72d9c3a537e15bd383cf5f38aa4 | |||
principal-submitters: | |||
- Vadim Lyubashevsky | |||
auxiliary-submitters: | |||
- Léo Ducas | |||
- Eike Kiltz | |||
- Tancrède Lepoint | |||
- Peter Schwabe | |||
- Gregor Seiler | |||
- Damien Stehlé | |||
implementations: | |||
- name: clean | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
- name: avx2 | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- aes | |||
- avx2 | |||
- popcnt |
@@ -0,0 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_AVX2_ALIGN_H
#define PQCLEAN_DILITHIUM3_AVX2_ALIGN_H

#include <immintrin.h>
#include <stdint.h>

/* Anonymous union type holding N bytes in `coeffs`, overlaid with enough
 * __m256i vectors (N rounded up to a multiple of 32 bytes) in `vec`. The
 * __m256i member gives the union the alignment of an AVX2 vector. */
#define ALIGNED_UINT8(N)              \
    union {                           \
        uint8_t coeffs[N];            \
        __m256i vec[((N)+31)/32];     \
    }

/* Same as ALIGNED_UINT8, but for N int32_t values (8 per __m256i). */
#define ALIGNED_INT32(N)              \
    union {                           \
        int32_t coeffs[N];            \
        __m256i vec[((N)+7)/8];       \
    }

#endif
@@ -0,0 +1,32 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_AVX2_API_H
#define PQCLEAN_DILITHIUM3_AVX2_API_H

#include <stddef.h>
#include <stdint.h>

/* Sizes in bytes of the public key, the secret key and a signature. */
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES 1952
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES 4016
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES 3293

/* Human-readable algorithm name. */
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3"

/* Key generation: fills pk and sk. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature of the mlen-byte message m under sk. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen,
        const uint8_t *m, size_t mlen,
        const uint8_t *sk);

/* Verification of a detached signature under pk. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen,
        const uint8_t *m, size_t mlen,
        const uint8_t *pk);

/* Signed-message variant of signing: output goes to sm / *smlen. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(uint8_t *sm, size_t *smlen,
        const uint8_t *m, size_t mlen,
        const uint8_t *sk);

/* Signed-message variant of verification: output goes to m / *mlen. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen,
        const uint8_t *sm, size_t smlen,
        const uint8_t *pk);

#endif
@@ -0,0 +1,24 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL_H
#define PQCLEAN_DILITHIUM3_AVX2_CDECL_H

/* Start indices of the sections of the qdata constants table (consts.c). */
#define _8XQ          0
#define _8XQINV       8
#define _8XDIV_QINV  16
#define _8XDIV       24
#define _ZETAS_QINV  32
#define _ZETAS      328

/* Symbol-name helpers for the assembly files.
 *
 * The C ABI on MacOS exports all symbols with a leading underscore, so
 * symbols defined in assembly and referenced from C (functions), as well
 * as symbols defined in C and referenced from assembly (nttconsts.c),
 * would otherwise not be found. The assembly therefore uses both
 * spellings: _cdecl(s) yields the underscore-prefixed name, cdecl(s) the
 * plain one.
 */
#define _cdecl(s) _##s
#define cdecl(s) s

#endif
@@ -0,0 +1,101 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
/* Scalar constants that are replicated eight times each at the start of
 * the qdata table defined in this file. */
#define QINV 58728449 // q^(-1) mod 2^32 | |||
/* NOTE(review): MONT is not referenced by the qdata initializer in this file. */
#define MONT (-4186625) // 2^32 mod q | |||
#define DIV 41978 // mont^2/256 | |||
/* DIV * QINV mod 2^32, written as a signed 32-bit value. */
#define DIV_QINV (-8395782) | |||
/* Constants table, initialised through the coeffs member of qdata_t.
 *
 * Layout in int32_t indices; the section markers inside the initializer
 * repeat the offsets defined in cdecl.h:
 *   [  0..  7]  Q        repeated 8 times
 *   [  8.. 15]  QINV     repeated 8 times
 *   [ 16.. 23]  DIV_QINV repeated 8 times
 *   [ 24.. 31]  DIV      repeated 8 times
 *   [ 32..327]  296 entries of the _ZETAS_QINV section
 *   [328..623]  296 entries of the _ZETAS section
 * 624 entries in total, matching ALIGNED_INT32(624) in consts.h.
 *
 * NOTE(review): each _ZETAS_QINV entry is presumably the corresponding
 * _ZETAS entry multiplied by QINV mod 2^32 - not verified here.
 */
const qdata_t PQCLEAN_DILITHIUM3_AVX2_qdata = {{ | |||
//#define _8XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, | |||
//#define _8XQINV 8 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
//#define _8XDIV_QINV 16 | |||
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, | |||
//#define _8XDIV 24 | |||
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, | |||
//#define _ZETAS_QINV 32 | |||
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, | |||
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, | |||
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, | |||
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, | |||
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, | |||
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904, | |||
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, | |||
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, | |||
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, | |||
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, | |||
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, | |||
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, | |||
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, | |||
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, | |||
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, | |||
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, | |||
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, | |||
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, | |||
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, | |||
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, | |||
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, | |||
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, | |||
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, | |||
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, | |||
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, | |||
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722, | |||
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, | |||
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, | |||
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, | |||
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, | |||
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, | |||
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, | |||
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, | |||
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, | |||
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, | |||
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, | |||
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, | |||
//#define _ZETAS 328 | |||
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, | |||
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, | |||
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, | |||
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, | |||
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, | |||
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, | |||
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, | |||
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, | |||
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, | |||
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, | |||
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, | |||
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, | |||
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, | |||
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, | |||
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, | |||
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, | |||
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, | |||
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, | |||
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, | |||
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, | |||
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, | |||
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, | |||
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, | |||
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, | |||
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, | |||
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, | |||
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, | |||
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, | |||
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, | |||
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, | |||
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, | |||
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, | |||
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, | |||
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, | |||
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, | |||
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, | |||
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, | |||
} | |||
}; |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_AVX2_CONSTS_H
#define PQCLEAN_DILITHIUM3_AVX2_CONSTS_H

#include "align.h"
#include "cdecl.h"

/* Storage type of the constants table: 624 int32_t values in an
 * ALIGNED_INT32 union (see align.h). */
typedef ALIGNED_INT32(624) qdata_t;

/* The constants table itself, defined in consts.c; the offsets of its
 * sections are the _8X... / _ZETAS... macros in cdecl.h. */
extern const qdata_t PQCLEAN_DILITHIUM3_AVX2_qdata;

#endif
@@ -0,0 +1,909 @@ | |||
/* Taken from Bas Westerbaan's new 4-way SHAKE implementation | |||
* for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/), | |||
* but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */ | |||
#include "cdecl.h" | |||
/* Byte-shuffle masks for vpshufb, aligned to 32 bytes (2^5).
 * Each group of 8 indices permutes the bytes of one 64-bit lane; the
 * 16-byte pattern is repeated for the second half of the 32-byte vector.
 *   rho8:  result byte i takes source byte (i + 7) mod 8, i.e. each 64-bit
 *          lane is rotated left by 8 bits (stands in for the commented-out
 *          vpsllq $8 / vpsrlq $56 pair in the round code).
 *   rho56: result byte i takes source byte (i + 1) mod 8, i.e. each 64-bit
 *          lane is rotated left by 56 bits (stands in for the commented-out
 *          vpsllq $56 / vpsrlq $8 pair). */
.data | |||
.p2align 5 | |||
rho8: | |||
.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14 | |||
rho56: | |||
.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8 | |||
.text | |||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4) | |||
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4) | |||
cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4): | |||
_cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4): | |||
vmovdqa rho8(%rip), %ymm0 | |||
movq $6, %rax | |||
looptop: | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 192(%rdi), %ymm4, %ymm9 | |||
vpxor 384(%rdi), %ymm3, %ymm10 | |||
vpxor 576(%rdi), %ymm2, %ymm11 | |||
vpxor 768(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 0(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 96(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 320(%rdi), %ymm5, %ymm10 | |||
vpxor 512(%rdi), %ymm4, %ymm11 | |||
vpxor 704(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 32(%rdi), %ymm4, %ymm8 | |||
vpxor 224(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 608(%rdi), %ymm1, %ymm11 | |||
vpxor 640(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 128(%rdi), %ymm1, %ymm8 | |||
vpxor 160(%rdi), %ymm5, %ymm9 | |||
vpxor 352(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 736(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 64(%rdi), %ymm3, %ymm8 | |||
vpxor 256(%rdi), %ymm2, %ymm9 | |||
vpxor 448(%rdi), %ymm1, %ymm10 | |||
vpxor 480(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 512(%rdi), %ymm4, %ymm9 | |||
vpxor 224(%rdi), %ymm3, %ymm10 | |||
vpxor 736(%rdi), %ymm2, %ymm11 | |||
vpxor 448(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 8(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 576(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 640(%rdi), %ymm5, %ymm10 | |||
vpxor 352(%rdi), %ymm4, %ymm11 | |||
vpxor 64(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 192(%rdi), %ymm4, %ymm8 | |||
vpxor 704(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 128(%rdi), %ymm1, %ymm11 | |||
vpxor 480(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 768(%rdi), %ymm1, %ymm8 | |||
vpxor 320(%rdi), %ymm5, %ymm9 | |||
vpxor 32(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 256(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 384(%rdi), %ymm3, %ymm8 | |||
vpxor 96(%rdi), %ymm2, %ymm9 | |||
vpxor 608(%rdi), %ymm1, %ymm10 | |||
vpxor 160(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 352(%rdi), %ymm4, %ymm9 | |||
vpxor 704(%rdi), %ymm3, %ymm10 | |||
vpxor 256(%rdi), %ymm2, %ymm11 | |||
vpxor 608(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 16(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 736(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 480(%rdi), %ymm5, %ymm10 | |||
vpxor 32(%rdi), %ymm4, %ymm11 | |||
vpxor 384(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 512(%rdi), %ymm4, %ymm8 | |||
vpxor 64(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 768(%rdi), %ymm1, %ymm11 | |||
vpxor 160(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 448(%rdi), %ymm1, %ymm8 | |||
vpxor 640(%rdi), %ymm5, %ymm9 | |||
vpxor 192(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 96(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 224(%rdi), %ymm3, %ymm8 | |||
vpxor 576(%rdi), %ymm2, %ymm9 | |||
vpxor 128(%rdi), %ymm1, %ymm10 | |||
vpxor 320(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 32(%rdi), %ymm4, %ymm9 | |||
vpxor 64(%rdi), %ymm3, %ymm10 | |||
vpxor 96(%rdi), %ymm2, %ymm11 | |||
vpxor 128(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 24(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 256(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 160(%rdi), %ymm5, %ymm10 | |||
vpxor 192(%rdi), %ymm4, %ymm11 | |||
vpxor 224(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 352(%rdi), %ymm4, %ymm8 | |||
vpxor 384(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 448(%rdi), %ymm1, %ymm11 | |||
vpxor 320(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 608(%rdi), %ymm1, %ymm8 | |||
vpxor 480(%rdi), %ymm5, %ymm9 | |||
vpxor 512(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 576(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 704(%rdi), %ymm3, %ymm8 | |||
vpxor 736(%rdi), %ymm2, %ymm9 | |||
vpxor 768(%rdi), %ymm1, %ymm10 | |||
vpxor 640(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
addq $32, %rsi | |||
subq $1, %rax | |||
jnz looptop | |||
ret |
@@ -0,0 +1,219 @@ | |||
#include "fips202.h" | |||
#include "fips202x4.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* Number of entries in the round-constant table below. */
#define NROUNDS 24

/*
 * Keccak round constants, one 64-bit word per round, in round order.
 * The table is passed unchanged to the x4 permutation routine.
 */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
    UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
    UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000),
    UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
    UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088),
    UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a),
    UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b),
    UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
    UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
    UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
    UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};
static void keccakx4_absorb_once(__m256i s[25], | |||
unsigned int r, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen, | |||
uint8_t p) { | |||
size_t i; | |||
uint64_t pos = 0; | |||
__m256i t, idx; | |||
for (i = 0; i < 25; ++i) { | |||
s[i] = _mm256_setzero_si256(); | |||
} | |||
idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); | |||
while (inlen >= r) { | |||
for (i = 0; i < r / 8; ++i) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
inlen -= r; | |||
PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants); | |||
} | |||
for (i = 0; i < inlen / 8; ++i) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
inlen -= 8 * i; | |||
if (inlen) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); | |||
t = _mm256_and_si256(t, idx); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
} | |||
t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
t = _mm256_set1_epi64x((long long)(1ULL << 63)); | |||
s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); | |||
} | |||
static void keccakx4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
unsigned int r, | |||
__m256i s[25]) { | |||
unsigned int i; | |||
__m128d t; | |||
while (nblocks > 0) { | |||
PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants); | |||
for (i = 0; i < r / 8; ++i) { | |||
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | |||
_mm_storel_pd((double *)&out0[8 * i], t); | |||
_mm_storeh_pd((double *)&out1[8 * i], t); | |||
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | |||
_mm_storel_pd((double *)&out2[8 * i], t); | |||
_mm_storeh_pd((double *)&out3[8 * i], t); | |||
} | |||
out0 += r; | |||
out1 += r; | |||
out2 += r; | |||
out3 += r; | |||
--nblocks; | |||
} | |||
} | |||
/*
 * Initialise *state and absorb four inlen-byte inputs at the SHAKE128 rate,
 * using 0x1F as the padding byte. Any previous contents of *state are
 * discarded by the underlying absorb routine.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    __m256i *lanes = state->s;

    keccakx4_absorb_once(lanes, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}
void PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); | |||
} | |||
/*
 * Initialise *state and absorb four inlen-byte inputs at the SHAKE256 rate,
 * using 0x1F as the padding byte. Any previous contents of *state are
 * discarded by the underlying absorb routine.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    __m256i *lanes = state->s;

    keccakx4_absorb_once(lanes, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
}
void PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); | |||
} | |||
void PQCLEAN_DILITHIUM3_AVX2_shake128x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
unsigned int i; | |||
size_t nblocks = outlen / SHAKE128_RATE; | |||
uint8_t t[4][SHAKE128_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE128_RATE; | |||
out1 += nblocks * SHAKE128_RATE; | |||
out2 += nblocks * SHAKE128_RATE; | |||
out3 += nblocks * SHAKE128_RATE; | |||
outlen -= nblocks * SHAKE128_RATE; | |||
if (outlen) { | |||
PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM3_AVX2_shake256x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
unsigned int i; | |||
size_t nblocks = outlen / SHAKE256_RATE; | |||
uint8_t t[4][SHAKE256_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE256_RATE; | |||
out1 += nblocks * SHAKE256_RATE; | |||
out2 += nblocks * SHAKE256_RATE; | |||
out3 += nblocks * SHAKE256_RATE; | |||
outlen -= nblocks * SHAKE256_RATE; | |||
if (outlen) { | |||
PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
} |
@@ -0,0 +1,64 @@ | |||
/*
 * Four-way SHAKE128/SHAKE256 interface of the Dilithium3 AVX2 implementation.
 * Every routine takes four input and/or four output pointers and processes
 * them through one shared keccakx4_state.
 */
#ifndef PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/*
 * State shared by the x4 routines: 25 256-bit vector words.
 * NOTE(review): presumably each word packs the same 64-bit Keccak lane of
 * four independent sponge instances; confirm against the implementation.
 */
typedef struct { | |||
__m256i s[25]; | |||
} keccakx4_state; | |||
/* Permutation on the 25 state words; `rc` points to 64-bit constants
 * (presumably the Keccak round constants; defined outside this header). */
void PQCLEAN_DILITHIUM3_AVX2_f1600x4(__m256i *s, const uint64_t *rc); | |||
/* Absorb four inputs of common length `inlen` in one call (SHAKE128 rate). */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
/* Squeeze `nblocks` SHAKE128-rate blocks into each of the four outputs. */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state); | |||
/* Absorb four inputs of common length `inlen` in one call (SHAKE256 rate). */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
/* Squeeze `nblocks` SHAKE256-rate blocks into each of the four outputs. */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state); | |||
/* One-shot: absorb four `inlen`-byte inputs, write `outlen` bytes to each
 * output (SHAKE128 rate); `outlen` need not be a multiple of the rate. */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
/* One-shot: absorb four `inlen`-byte inputs, write `outlen` bytes to each
 * output (SHAKE256 rate); `outlen` need not be a multiple of the rate. */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
#endif |
@@ -0,0 +1,240 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
/*
 * butterfly l,h[,zl0,zl1,zh0,zh1]  (variant used by the inverse transform)
 *
 * Arguments are ymm register numbers. On exit:
 *   ymm l = l + h                      (lane-wise 32-bit add)
 *   ymm h = high dword, per lane, of
 *           (h - l) * zh  -  ymm0 * ((h - l) * zl)
 * where zl0/zh0 multiply the even dwords and zl1/zh1 the odd dwords.
 * Callers load the zl registers from _ZETAS_QINV and the zh registers from
 * _ZETAS; ymm0 holds the _8XQ constant loaded at the entry point.
 * NOTE(review): this has the shape of a Montgomery multiplication by the
 * twiddle factor; confirm against the reference C code.
 * Clobbers ymm12, ymm13, ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
/* ymm12 = h - l (difference), l = l + h (sum) */
vpsubd %ymm\l,%ymm\h,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
/* even dwords of the difference times zl0; odd dwords are first moved
   down into even positions so vpmuldq can multiply them as well */
vpmuldq %ymm\zl0,%ymm12,%ymm13 | |||
vmovshdup %ymm12,%ymm\h | |||
vpmuldq %ymm\zl1,%ymm\h,%ymm14 | |||
/* full 64-bit products with the zh operands */
vpmuldq %ymm\zh0,%ymm12,%ymm12 | |||
vpmuldq %ymm\zh1,%ymm\h,%ymm\h | |||
/* multiply the low dwords of the zl products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
/* subtract; the wanted result sits in the high dword of each qword */
vpsubd %ymm13,%ymm12,%ymm12 | |||
vpsubd %ymm14,%ymm\h,%ymm\h | |||
/* bring even-lane results down and merge with the odd-lane results */
vmovshdup %ymm12,%ymm12 | |||
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h | |||
.endm | |||
/*
 * levels0t5 off
 *
 * Processes one 256-byte slice of the coefficient array (eight ymm vectors
 * at byte offset 256*off from %rdi) through six butterfly levels, entirely
 * in registers, then stores the slice back to the same location.
 * Twiddle constants are read relative to %rsi from the _ZETAS_QINV and
 * _ZETAS tables; indices count down from the table end as `off` grows.
 * vpermq $0x1B reverses the four 64-bit quadwords of each loaded vector.
 * shuffle2/shuffle4/shuffle8 come from shuffle.inc (not visible here);
 * NOTE(review): presumably they interleave register pairs at 2/4/8-dword
 * granularity; confirm there.
 */
.macro levels0t5 off | |||
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
/* level 0: a separate twiddle vector per register pair; ymm1/ymm2 receive
   the odd dwords of ymm3/ymm15 so even and odd lanes use different values */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,5,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 6,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,9,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 10,11,1,3,2,15 | |||
/* level 1: one twiddle vector shared by two butterflies */
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,6,1,3,2,15 | |||
butterfly 5,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,10,1,3,2,15 | |||
butterfly 9,11,1,3,2,15 | |||
/* level 2: one twiddle vector shared by all four butterflies */
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,8,1,3,2,15 | |||
butterfly 5,9,1,3,2,15 | |||
butterfly 6,10,1,3,2,15 | |||
butterfly 7,11,1,3,2,15 | |||
/* level 3: regroup with shuffle2, then default-operand butterflies
   (same twiddle register for even and odd dwords) */
shuffle2 4,5,3,5 | |||
shuffle2 6,7,4,7 | |||
shuffle2 8,9,6,9 | |||
shuffle2 10,11,8,11 | |||
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 3,5 | |||
butterfly 4,7 | |||
butterfly 6,9 | |||
butterfly 8,11 | |||
/* level 4: regroup with shuffle4 */
shuffle4 3,4,10,4 | |||
shuffle4 6,8,3,8 | |||
shuffle4 5,7,6,7 | |||
shuffle4 9,11,5,11 | |||
vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 10,4 | |||
butterfly 3,8 | |||
butterfly 6,7 | |||
butterfly 5,11 | |||
/* level 5: regroup with shuffle8; a single twiddle value is broadcast
   to all lanes */
shuffle8 10,3,9,3 | |||
shuffle8 6,5,10,5 | |||
shuffle8 4,8,6,8 | |||
shuffle8 7,11,4,11 | |||
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 | |||
butterfly 9,3 | |||
butterfly 10,5 | |||
butterfly 6,8 | |||
butterfly 4,11 | |||
/* store: register numbering was permuted by the shuffles above, hence
   the non-sequential register order */
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm10,256*\off+ 32(%rdi) | |||
vmovdqa %ymm6,256*\off+ 64(%rdi) | |||
vmovdqa %ymm4,256*\off+ 96(%rdi) | |||
vmovdqa %ymm3,256*\off+128(%rdi) | |||
vmovdqa %ymm5,256*\off+160(%rdi) | |||
vmovdqa %ymm8,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
/*
 * levels6t7 off
 *
 * Loads eight vectors spaced 128 bytes apart (starting at byte offset
 * 32*off from %rdi), applies butterfly levels 6 and 7 with broadcast
 * twiddles, and writes everything back in place.
 * The four upper vectors (ymm8..ymm11) are stored right after level 7.
 * The four lower vectors (ymm4..ymm7) are additionally multiplied by the
 * _8XDIV / _8XDIV_QINV constants with the same multiply-and-reduce pattern
 * as in the butterfly before being stored.
 * NOTE(review): presumably _8XDIV carries the final scaling factor of the
 * inverse transform; confirm against the constants table.
 */
.macro levels6t7 off | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
/* level 6: table entry 3 for the lower half, entry 2 for the upper half */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
/* level 7: table entry 0 for all four butterflies */
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* upper half is final; storing it frees ymm8/ymm9 as scratch below */
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
/* lower half: multiply ymm4..ymm7 by the _8XDIV constant and reduce */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 | |||
vmovdqa (_8XDIV)*4(%rsi),%ymm2 | |||
/* first pair: ymm4, ymm5 (odd dwords handled via ymm8, ymm9) */
vpmuldq %ymm1,%ymm4,%ymm12 | |||
vpmuldq %ymm1,%ymm5,%ymm13 | |||
vmovshdup %ymm4,%ymm8 | |||
vmovshdup %ymm5,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm4,%ymm4 | |||
vpmuldq %ymm2,%ymm5,%ymm5 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm4,%ymm4 | |||
vpsubd %ymm13,%ymm5,%ymm5 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm4,%ymm4 | |||
vmovshdup %ymm5,%ymm5 | |||
vpblendd $0xAA,%ymm8,%ymm4,%ymm4 | |||
vpblendd $0xAA,%ymm9,%ymm5,%ymm5 | |||
/* second pair: ymm6, ymm7 (same sequence) */
vpmuldq %ymm1,%ymm6,%ymm12 | |||
vpmuldq %ymm1,%ymm7,%ymm13 | |||
vmovshdup %ymm6,%ymm8 | |||
vmovshdup %ymm7,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm6,%ymm6 | |||
vpmuldq %ymm2,%ymm7,%ymm7 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm6,%ymm6 | |||
vpsubd %ymm13,%ymm7,%ymm7 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm6,%ymm6 | |||
vmovshdup %ymm7,%ymm7 | |||
vpblendd $0xAA,%ymm8,%ymm6,%ymm6 | |||
vpblendd $0xAA,%ymm9,%ymm7,%ymm7 | |||
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
.endm | |||
/*
 * PQCLEAN_DILITHIUM3_AVX2_invntt_avx
 *
 * In-place transform of the 1024-byte coefficient array addressed by %rdi,
 * using the constants table addressed by %rsi (arguments `a` and the qdata
 * pointer of the C prototype, assuming the System V AMD64 convention).
 * ymm0 is loaded once with the _8XQ constant and kept for all reductions.
 * Levels 0-5 run on four independent 256-byte slices; levels 6-7 then run
 * on four column groups that span the whole array.
 * The symbol is exported both with and without a leading underscore.
 */
.text | |||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx): | |||
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
levels0t5 0 | |||
levels0t5 1 | |||
levels0t5 2 | |||
levels0t5 3 | |||
levels6t7 0 | |||
levels6t7 1 | |||
levels6t7 2 | |||
levels6t7 3 | |||
ret |
@@ -0,0 +1,199 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
/*
 * butterfly l,h[,zl0,zl1,zh0,zh1]  (variant used by the forward transform)
 *
 * Arguments are ymm register numbers. With
 *   t = high dword of (h * zh)  -  high dword of (ymm0 * (h * zl))
 * computed lane-wise, the macro leaves
 *   ymm l = l + t
 *   ymm h = l - t
 * zl0/zh0 apply to the even dwords, zl1/zh1 to the odd dwords. Callers load
 * the zl registers from _ZETAS_QINV and the zh registers from _ZETAS; ymm0
 * holds the _8XQ constant loaded at the entry point.
 * NOTE(review): t has the shape of a Montgomery product of h with the
 * twiddle factor; confirm against the reference C code.
 * Clobbers ymm12, ymm13, ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
/* products of h with the zl operands (even dwords, then odd dwords) */
vpmuldq %ymm\zl0,%ymm\h,%ymm13 | |||
vmovshdup %ymm\h,%ymm12 | |||
vpmuldq %ymm\zl1,%ymm12,%ymm14 | |||
/* products of h with the zh operands */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h | |||
vpmuldq %ymm\zh1,%ymm12,%ymm12 | |||
/* multiply the low dwords of the zl products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
/* gather the high dwords of the zh products into h */
vmovshdup %ymm\h,%ymm\h | |||
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h | |||
/* ymm12 = l - hi, l = l + hi */
vpsubd %ymm\h,%ymm\l,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
/* gather the high dwords of the ymm0 products and apply the correction
   with opposite signs to the two outputs */
vmovshdup %ymm13,%ymm13 | |||
vpblendd $0xAA,%ymm14,%ymm13,%ymm13 | |||
vpaddd %ymm13,%ymm12,%ymm\h | |||
vpsubd %ymm13,%ymm\l,%ymm\l | |||
.endm | |||
/*
 * levels0t1 off
 *
 * Loads eight vectors spaced 128 bytes apart (starting at byte offset
 * 32*off from %rdi), applies butterfly levels 0 and 1 with broadcast
 * twiddles read relative to %rsi, and stores all eight vectors back in
 * place.
 */
.macro levels0t1 off | |||
/* level 0: table entry 1 for all four butterflies */
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* level 1: table entry 2 for the lower half, entry 3 for the upper half */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
.endm | |||
/*
 * levels2t7 off
 *
 * Processes one 256-byte slice (eight ymm vectors at byte offset 256*off
 * from %rdi) through butterfly levels 2..7 in registers and stores the
 * slice back to the same location.
 * Twiddle constants are read relative to %rsi from _ZETAS_QINV / _ZETAS,
 * with table indices growing with `off`.
 * shuffle8/shuffle4/shuffle2 come from shuffle.inc (not visible here);
 * NOTE(review): presumably they interleave register pairs at 8/4/2-dword
 * granularity; confirm there.
 * From level 5 on, ymm10 and ymm15 receive the odd dwords of ymm1 and ymm2
 * so that even and odd lanes are multiplied by different table values.
 * ymm10 is free to act as scratch there because the preceding shuffle2
 * moved its data into ymm3 and ymm11.
 */
.macro levels2t7 off | |||
/* level 2: one broadcast twiddle per slice */
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
/* level 3: full twiddle vector, same register for even and odd dwords */
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 | |||
butterfly 3,5 | |||
butterfly 8,10 | |||
butterfly 4,6 | |||
butterfly 9,11 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
/* level 4 */
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 | |||
butterfly 7,8 | |||
butterfly 5,6 | |||
butterfly 3,4 | |||
butterfly 10,11 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
/* level 5: one twiddle vector for all four butterflies */
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,5,1,10,2,15 | |||
butterfly 8,4,1,10,2,15 | |||
butterfly 7,3,1,10,2,15 | |||
butterfly 6,11,1,10,2,15 | |||
/* level 6: two twiddle vectors, two butterflies each */
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,7,1,10,2,15 | |||
butterfly 8,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,3,1,10,2,15 | |||
butterfly 4,11,1,10,2,15 | |||
/* level 7: a separate twiddle vector for each butterfly */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,8,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 7,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,4,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 3,11,1,10,2,15 | |||
/* store: register numbering was permuted by the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm8,256*\off+ 32(%rdi) | |||
vmovdqa %ymm7,256*\off+ 64(%rdi) | |||
vmovdqa %ymm6,256*\off+ 96(%rdi) | |||
vmovdqa %ymm5,256*\off+128(%rdi) | |||
vmovdqa %ymm4,256*\off+160(%rdi) | |||
vmovdqa %ymm3,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
/*
 * PQCLEAN_DILITHIUM3_AVX2_ntt_avx
 *
 * In-place transform of the 1024-byte coefficient array addressed by %rdi,
 * using the constants table addressed by %rsi (arguments `a` and the qdata
 * pointer of the C prototype, assuming the System V AMD64 convention).
 * ymm0 is loaded once with the _8XQ constant and kept for all reductions.
 * Levels 0-1 run on four column groups that span the whole array; levels
 * 2-7 then run on four independent 256-byte slices.
 * The symbol is exported both with and without a leading underscore.
 */
.text | |||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx): | |||
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
levels0t1 0 | |||
levels0t1 1 | |||
levels0t1 2 | |||
levels0t1 3 | |||
levels2t7 0 | |||
levels2t7 1 | |||
levels2t7 2 | |||
levels2t7 3 | |||
ret | |||
@@ -0,0 +1,14 @@ | |||
/*
 * C prototypes for the vectorised transform and pointwise-multiplication
 * routines. All of them operate on arrays of __m256i. The routines that
 * take a qdata pointer read their constants (q, q^-1 and twiddle tables)
 * relative to it.
 */
#ifndef PQCLEAN_DILITHIUM3_AVX2_NTT_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_NTT_H | |||
#include <immintrin.h> | |||
/* In-place forward transform of `a` (one polynomial, 1024 bytes). */
void PQCLEAN_DILITHIUM3_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata); | |||
/* In-place inverse transform of `a` (one polynomial, 1024 bytes). */
void PQCLEAN_DILITHIUM3_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata); | |||
/* NOTE(review): implementation not visible here; presumably reorders `a`
 * into the coefficient layout the transform routines use; confirm. */
void PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx(__m256i *a); | |||
/* c = a * b coefficient-wise, each product reduced with q and q^-1. */
void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata); | |||
/* c = sum over five consecutive polynomials of a[i] * b[i], coefficient-wise,
 * with a single reduction after the accumulation. */
void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata); | |||
#endif |
@@ -0,0 +1,261 @@ | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_pack_pk | |||
* | |||
* Description: Bit-pack public key pk = (rho, t1). | |||
* | |||
* Arguments: - uint8_t pk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const polyveck *t1: pointer to vector t1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
pk[i] = rho[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_unpack_pk | |||
* | |||
* Description: Unpack public key pk = (rho, t1). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const polyveck *t1: pointer to output vector t1 | |||
* - uint8_t pk[]: byte array containing bit-packed pk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = pk[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_pack_sk | |||
* | |||
* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - uint8_t sk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const uint8_t tr[]: byte array containing tr | |||
* - const uint8_t key[]: byte array containing key | |||
* - const polyveck *t0: pointer to vector t0 | |||
* - const polyvecl *s1: pointer to vector s1 | |||
* - const polyveck *s2: pointer to vector s2 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = rho[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = key[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
sk[i] = tr[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_unpack_sk | |||
* | |||
* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const uint8_t tr[]: output byte array for tr | |||
* - const uint8_t key[]: output byte array for key | |||
* - const polyveck *t0: pointer to output vector t0 | |||
* - const polyvecl *s1: pointer to output vector s1 | |||
* - const polyveck *s2: pointer to output vector s2 | |||
* - uint8_t sk[]: byte array containing bit-packed sk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
key[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
tr[i] = sk[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_pack_sig | |||
* | |||
* Description: Bit-pack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t sig[]: output byte array | |||
* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM3_AVX2_challenge hash length SEEDBYTES | |||
* - const polyvecl *z: pointer to vector z | |||
* - const polyveck *h: pointer to hint vector h | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES], | |||
const uint8_t c[SEEDBYTES], | |||
const polyvecl *z, | |||
const polyveck *h) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sig[i] = c[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Encode h */ | |||
for (i = 0; i < OMEGA + K; ++i) { | |||
sig[i] = 0; | |||
} | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
if (h->vec[i].coeffs[j] != 0) { | |||
sig[k++] = (uint8_t) j; | |||
} | |||
} | |||
sig[OMEGA + i] = (uint8_t) k; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_unpack_sig | |||
* | |||
* Description: Unpack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM3_AVX2_challenge hash | |||
* - polyvecl *z: pointer to output vector z | |||
* - polyveck *h: pointer to output hint vector h | |||
* - const uint8_t sig[]: byte array containing | |||
* bit-packed signature | |||
* | |||
* Returns 1 in case of malformed signature; otherwise 0. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(uint8_t c[SEEDBYTES], | |||
polyvecl *z, | |||
polyveck *h, | |||
const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
c[i] = sig[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Decode h */ | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
h->vec[i].coeffs[j] = 0; | |||
} | |||
if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { | |||
return 1; | |||
} | |||
for (j = k; j < sig[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > k && sig[j] <= sig[j - 1]) { | |||
return 1; | |||
} | |||
h->vec[i].coeffs[sig[j]] = 1; | |||
} | |||
k = sig[OMEGA + i]; | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = k; j < OMEGA; ++j) { | |||
if (sig[j]) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,31 @@ | |||
/*
 * Serialization of public keys, secret keys and signatures to and from
 * fixed-size byte arrays. Array sizes come from params.h.
 */
#ifndef PQCLEAN_DILITHIUM3_AVX2_PACKING_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_PACKING_H | |||
#include "params.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/* Writes rho followed by the packed t1 vector into pk. */
void PQCLEAN_DILITHIUM3_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); | |||
/* Writes sk in the byte order rho, key, tr, s1, s2, t0. Note that this
 * differs from the parameter order. */
void PQCLEAN_DILITHIUM3_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2); | |||
/* Writes c, the packed z vector and the encoded hint vector h into sig. */
void PQCLEAN_DILITHIUM3_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); | |||
/* Inverse of pack_pk: rho and t1 are outputs. */
void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]); | |||
/* Inverse of pack_sk: every argument except sk is an output. */
void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES]); | |||
/* Inverse of pack_sig; returns 1 if the hint encoding is malformed, else 0. */
int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]); | |||
#endif |
@@ -0,0 +1,41 @@ | |||
/*
 * Compile-time parameters of the Dilithium3 AVX2 implementation.
 * Most macro names are short and unprefixed (N, Q, D, K, L, ...), so this
 * header should only be included by files of this implementation.
 */
#ifndef PQCLEAN_DILITHIUM3_AVX2_PARAMS_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_PARAMS_H | |||
/* Byte lengths: SEEDBYTES for rho, key and the challenge hash c;
 * CRHBYTES for tr (see the packing routines). */
#define SEEDBYTES 32 | |||
#define CRHBYTES 48 | |||
/* Number of coefficients per polynomial. */
#define N 256 | |||
/* NOTE(review): presumably the prime modulus q, the number of dropped low
 * bits d and a root of unity mod q as in the scheme specification; confirm. */
#define Q 8380417 | |||
#define D 13 | |||
#define ROOT_OF_UNITY 1753 | |||
/* Vector lengths: polyveck has K polynomials, polyvecl has L. */
#define K 6 | |||
#define L 5 | |||
/* NOTE(review): scheme-level bounds (secret coefficient range, challenge
 * weight, rejection bounds); meanings not derivable from this chunk. */
#define ETA 4 | |||
#define TAU 49 | |||
#define BETA 196 | |||
#define GAMMA1 (1 << 19) | |||
#define GAMMA2 ((Q-1)/32) | |||
/* Maximum total number of non-zero hint coefficients in a signature. */
#define OMEGA 55 | |||
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3" | |||
/* Packed sizes in bytes of a single polynomial of each kind; the hint
 * vector is packed as a whole into OMEGA + K bytes. */
#define POLYT1_PACKEDBYTES 320 | |||
#define POLYT0_PACKEDBYTES 416 | |||
#define POLYVECH_PACKEDBYTES (OMEGA + K) | |||
#define POLYZ_PACKEDBYTES 640 | |||
#define POLYW1_PACKEDBYTES 128 | |||
#define POLYETA_PACKEDBYTES 128 | |||
/* Public key: 32 + 6*320 = 1952 bytes. */
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) | |||
/* Secret key: 2*32 + 48 + 5*128 + 6*128 + 6*416 = 4016 bytes. */
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ | |||
+ L*POLYETA_PACKEDBYTES \ | |||
+ K*POLYETA_PACKEDBYTES \ | |||
+ K*POLYT0_PACKEDBYTES) | |||
/* Signature: 32 + 5*640 + (55 + 6) = 3293 bytes. */
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) | |||
#endif |
@@ -0,0 +1,201 @@ | |||
#include "params.h" | |||
#include "cdecl.h" | |||
/*
 * PQCLEAN_DILITHIUM3_AVX2_pointwise_avx
 *
 * Coefficient-wise product of the two 1024-byte arrays at %rsi and %rdx,
 * written to %rdi; constants are read relative to %rcx (arguments c, a, b
 * and the qdata pointer of the C prototype, assuming the System V AMD64
 * convention).
 * ymm0 = _8XQINV and ymm1 = _8XQ stay live for the whole routine.
 * Each 64-bit product p is reduced to the high dword of
 *   p - ymm1 * low32(ymm0 * p)
 * NOTE(review): this has the shape of a Montgomery reduction; confirm.
 * The main loop handles 96 bytes per iteration for 10 iterations
 * (960 bytes); the remaining 64 bytes are handled by the tail below it.
 * Clobbers eax, rdi, rsi, rdx and ymm0..ymm15.
 */
.text | |||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx) | |||
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx): | |||
_cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx): | |||
#consts | |||
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
/* eax = loop counter */
xor %eax,%eax | |||
_looptop1: | |||
#load | |||
vmovdqa (%rsi),%ymm2 | |||
vmovdqa 32(%rsi),%ymm4 | |||
vmovdqa 64(%rsi),%ymm6 | |||
vmovdqa (%rdx),%ymm10 | |||
vmovdqa 32(%rdx),%ymm12 | |||
vmovdqa 64(%rdx),%ymm14 | |||
/* odd dwords are moved to even positions (vpsrlq and vmovshdup both do
   this) because vpmuldq multiplies only the even dwords */
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vmovshdup %ymm6,%ymm7 | |||
vpsrlq $32,%ymm10,%ymm11 | |||
vpsrlq $32,%ymm12,%ymm13 | |||
vmovshdup %ymm14,%ymm15 | |||
#mul | |||
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
vpmuldq %ymm6,%ymm14,%ymm6 | |||
vpmuldq %ymm7,%ymm15,%ymm7 | |||
#reduce | |||
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm0,%ymm6,%ymm14 | |||
vpmuldq %ymm0,%ymm7,%ymm15 | |||
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpmuldq %ymm1,%ymm14,%ymm14 | |||
vpmuldq %ymm1,%ymm15,%ymm15 | |||
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsubq %ymm14,%ymm6,%ymm6 | |||
vpsubq %ymm15,%ymm7,%ymm7 | |||
/* results are in the high dwords; move the even-lane ones down */
vpsrlq $32,%ymm2,%ymm2 | |||
vpsrlq $32,%ymm4,%ymm4 | |||
vmovshdup %ymm6,%ymm6 | |||
#store | |||
/* even dwords from the shifted register, odd dwords from the odd-lane one */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
vpblendd $0xAA,%ymm5,%ymm4,%ymm4 | |||
vpblendd $0xAA,%ymm7,%ymm6,%ymm6 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
add $96,%rdi | |||
add $96,%rsi | |||
add $96,%rdx | |||
add $1,%eax | |||
cmp $10,%eax | |||
jb _looptop1 | |||
/* tail: the last 64 bytes (two vectors), same multiply-and-reduce steps */
vmovdqa (%rsi),%ymm2 | |||
vmovdqa 32(%rsi),%ymm4 | |||
vmovdqa (%rdx),%ymm10 | |||
vmovdqa 32(%rdx),%ymm12 | |||
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
#reduce | |||
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
/* mask 0x55 with swapped operands selects the same dwords as the 0xAA
   form used in the loop */
vpblendd $0x55,%ymm2,%ymm3,%ymm2 | |||
vpblendd $0x55,%ymm4,%ymm5,%ymm4 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
ret | |||
/*
 * pointwise off
 *
 * Loads two vectors from each input at byte offset `off` (from %rsi and
 * %rdx) and forms the unreduced 64-bit products:
 *   ymm6, ymm8 = products of the even dwords
 *   ymm7, ymm9 = products of the odd dwords
 * No reduction is done here. Clobbers ymm6..ymm13.
 */
.macro pointwise off | |||
#load | |||
vmovdqa \off(%rsi),%ymm6 | |||
vmovdqa \off+32(%rsi),%ymm8 | |||
vmovdqa \off(%rdx),%ymm10 | |||
vmovdqa \off+32(%rdx),%ymm12 | |||
/* move odd dwords into even positions for vpmuldq */
vpsrlq $32,%ymm6,%ymm7 | |||
vpsrlq $32,%ymm8,%ymm9 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuldq %ymm6,%ymm10,%ymm6 | |||
vpmuldq %ymm7,%ymm11,%ymm7 | |||
vpmuldq %ymm8,%ymm12,%ymm8 | |||
vpmuldq %ymm9,%ymm13,%ymm9 | |||
.endm | |||
/*
 * acc
 *
 * Adds the 64-bit products left in ymm6..ymm9 by the pointwise macro to
 * the running 64-bit sums in ymm2..ymm5.
 */
.macro acc | |||
vpaddq %ymm6,%ymm2,%ymm2 | |||
vpaddq %ymm7,%ymm3,%ymm3 | |||
vpaddq %ymm8,%ymm4,%ymm4 | |||
vpaddq %ymm9,%ymm5,%ymm5 | |||
.endm | |||
/*
 * PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx
 *
 * Multiply-accumulate over five polynomials stored 1024 bytes apart in
 * each input (%rsi and %rdx), with the result written to %rdi; constants
 * are read relative to %rcx (arguments c, a, b and the qdata pointer of the
 * C prototype, assuming the System V AMD64 convention).
 * Five polynomials matches L = 5 in params.h.
 * For every 64-byte column the five 64-bit products are summed first and
 * reduced only once, to the high dword of  s - ymm1 * low32(ymm0 * s).
 * 16 iterations of 64 bytes cover one 1024-byte polynomial.
 * Clobbers eax, rdi, rsi, rdx and ymm0..ymm13.
 */
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx) | |||
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx): | |||
_cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx): | |||
#consts | |||
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
/* eax = loop counter */
xor %eax,%eax | |||
_looptop2: | |||
/* first polynomial initialises the accumulators ymm2..ymm5 */
pointwise 0 | |||
#mov | |||
vmovdqa %ymm6,%ymm2 | |||
vmovdqa %ymm7,%ymm3 | |||
vmovdqa %ymm8,%ymm4 | |||
vmovdqa %ymm9,%ymm5 | |||
/* remaining four polynomials are added in */
pointwise 1024 | |||
acc | |||
pointwise 2048 | |||
acc | |||
pointwise 3072 | |||
acc | |||
pointwise 4096 | |||
acc | |||
#reduce | |||
vpmuldq %ymm0,%ymm2,%ymm6 | |||
vpmuldq %ymm0,%ymm3,%ymm7 | |||
vpmuldq %ymm0,%ymm4,%ymm8 | |||
vpmuldq %ymm0,%ymm5,%ymm9 | |||
vpmuldq %ymm1,%ymm6,%ymm6 | |||
vpmuldq %ymm1,%ymm7,%ymm7 | |||
vpmuldq %ymm1,%ymm8,%ymm8 | |||
vpmuldq %ymm1,%ymm9,%ymm9 | |||
vpsubq %ymm6,%ymm2,%ymm2 | |||
vpsubq %ymm7,%ymm3,%ymm3 | |||
vpsubq %ymm8,%ymm4,%ymm4 | |||
vpsubq %ymm9,%ymm5,%ymm5 | |||
/* results are in the high dwords; move the even-lane ones down */
vpsrlq $32,%ymm2,%ymm2 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
vpblendd $0xAA,%ymm5,%ymm4,%ymm4 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
add $64,%rsi | |||
add $64,%rdx | |||
add $64,%rdi | |||
add $1,%eax | |||
cmp $16,%eax | |||
jb _looptop2 | |||
ret |
@@ -0,0 +1,79 @@ | |||
/*
 * Prototypes for the single-polynomial routines of the Dilithium3 AVX2
 * implementation. Array bounds written in the parameter lists record the
 * buffer size each routine is declared to expect.
 */
#ifndef PQCLEAN_DILITHIUM3_AVX2_POLY_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_POLY_H | |||
#include "align.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/* A polynomial with N coefficients, declared through the ALIGNED_INT32 macro from align.h. */
typedef ALIGNED_INT32(N) poly; | |||
/* In-place reduction helpers; see poly.c for the exact output ranges. */
void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_caddq(poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a); | |||
/* c is the output, a and b are the inputs. */
void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a); | |||
/* NTT-related routines, all operating in place except the pointwise product. */
void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); | |||
/* Rounding and hint routines: a1/a0 receive the two parts computed from a. */
void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); | |||
unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); | |||
/* Norm check against bound B; polyvec.c treats a non-zero return as "bound violated". */
int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, int32_t B); | |||
/*
 * Samplers. The *_preinit variants take an already prepared stream state,
 * the others take a seed and a 16-bit nonce. Note the gamma1 samplers are
 * declared with a CRHBYTES seed, the others with a SEEDBYTES seed.
 */
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); | |||
/* Four-way samplers: four output polynomials and four nonces share one seed. */
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0, | |||
poly *a1, | |||
poly *a2, | |||
poly *a3, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0, | |||
poly *a1, | |||
poly *a2, | |||
poly *a3, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3); | |||
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_4x(poly *a0, | |||
poly *a1, | |||
poly *a2, | |||
poly *a3, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3); | |||
/*
 * Packing and unpacking between polynomials and byte arrays.
 * NOTE(review): polyz_unpack (+14) and polyw1_pack (+8) declare buffers larger
 * than the packed size -- presumably slack for wide vector loads/stores;
 * confirm against poly.c and make sure callers provide it.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a); | |||
#endif |
@@ -0,0 +1,498 @@ | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/* Marks a parameter or variable as deliberately unused. The argument and the
 * whole expansion are parenthesized so that any expression can be passed. */
#define UNUSED(x) ((void)(x))
/************************************************* | |||
* Name: expand_mat | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|j|i) | |||
* or AES256CTR(rho,j|i). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { | |||
polyvecl tmp; | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(&mat[0], &mat[1], rho); | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(&mat[1], &mat[2], rho); | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(&mat[2], &mat[3], rho); | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(&mat[3], NULL, rho); | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(&mat[4], &mat[5], rho); | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(&mat[5], &tmp, rho); | |||
} | |||
/* Samples rowa->vec[0..4] (nonces 0..4) and rowb->vec[0..2] (nonces 256..258)
 * from rho, then runs poly_nttunpack on each sampled polynomial. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int k;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 0, 1, 2, 3);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[4], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2],
                                            rho, 4, 256, 257, 258);

    for (k = 0; k <= 4; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[k]);
    }
    for (k = 0; k <= 2; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[k]);
    }
}
/* Samples rowa->vec[3..4] (nonces 259, 260) and rowb->vec[0..1]
 * (nonces 512, 513) from rho, then runs poly_nttunpack on each of them. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int k;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowb->vec[0], &rowb->vec[1],
                                            rho, 259, 260, 512, 513);

    for (k = 3; k <= 4; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[k]);
    }
    for (k = 0; k <= 1; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[k]);
    }
}
/* Samples rowa->vec[2..4] (nonces 514..516) and rowb->vec[0] (nonce 768)
 * from rho, then runs poly_nttunpack on each of them. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int k;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[2], &rowa->vec[3], &rowa->vec[4], &rowb->vec[0],
                                            rho, 514, 515, 516, 768);

    for (k = 2; k <= 4; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[k]);
    }
    PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[0]);
}
/* Samples rowa->vec[1..4] (nonces 769..772) from rho and runs poly_nttunpack
 * on each of them. rowb is accepted for signature parity with the other row
 * helpers and is never dereferenced. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int k;

    (void)rowb;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[1], &rowa->vec[2], &rowa->vec[3], &rowa->vec[4],
                                            rho, 769, 770, 771, 772);

    for (k = 1; k <= 4; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[k]);
    }
}
/* Samples rowa->vec[0..4] (nonces 1024..1028) and rowb->vec[0..2]
 * (nonces 1280..1282) from rho, then runs poly_nttunpack on each of them. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int k;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 1024, 1025, 1026, 1027);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[4], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2],
                                            rho, 1028, 1280, 1281, 1282);

    for (k = 0; k <= 4; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[k]);
    }
    for (k = 0; k <= 2; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[k]);
    }
}
/* Samples rowa->vec[3..4] (nonces 1283, 1284) from rho and runs
 * poly_nttunpack on them. The four-way sampler also fills rowb->vec[0..1]
 * (nonces 1536, 1537); those two are left as sampled and not unpacked. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int k;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowb->vec[0], &rowb->vec[1],
                                            rho, 1283, 1284, 1536, 1537);

    for (k = 3; k <= 4; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[k]);
    }
}
/* Matrix-vector product: for each of the K rows of mat, t->vec[row] receives
 * the accumulated pointwise product of that row with v. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    for (unsigned int row = 0; row < K; row++) {
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(t->vec + row, mat + row, v);
    }
}
/**************************************************************/ | |||
/************ Vectors of polynomials of length L **************/ | |||
/**************************************************************/ | |||
/* Samples the L polynomials of v with poly_uniform_eta; entry k uses the
 * nonce (nonce + k), wrapped to 16 bits. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < L; k++) {
        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&v->vec[k], seed, (uint16_t)(nonce + k));
    }
}
/* Samples the L polynomials of v with poly_uniform_gamma1; entry k uses the
 * nonce (L * nonce + k), truncated to 16 bits.
 * NOTE(review): poly.h declares poly_uniform_gamma1 with seed[CRHBYTES],
 * while this wrapper declares seed[SEEDBYTES] -- the bound here looks stale;
 * confirm and align it together with the prototype in polyvec.h. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < L; k++) {
        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(&v->vec[k], seed, (uint16_t)(L * nonce + k));
    }
}
/* Applies PQCLEAN_DILITHIUM3_AVX2_poly_reduce to each of the L polynomials of v. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(polyvecl *v) {
    for (poly *p = v->vec; p != v->vec + L; ++p) {
        PQCLEAN_DILITHIUM3_AVX2_poly_reduce(p);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length L | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_add | |||
* | |||
* Description: Add vectors of polynomials of length L. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyvecl *w: pointer to output vector | |||
* - const polyvecl *u: pointer to first summand | |||
* - const polyvecl *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length L. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/* Applies PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont in place to each of the
 * L polynomials of v. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    for (poly *p = v->vec; p != v->vec + L; ++p) {
        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(p);
    }
}
/* For each of the L entries, r->vec[k] receives
 * poly_pointwise_montgomery(a, v->vec[k]). */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(r->vec + k, a, v->vec + k);
        k++;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply vectors of polynomials of length L, multiply | |||
* resulting vector by 2^{-32} and add (accumulate) polynomials | |||
* in it. Input/output vectors are in NTT domain representation. | |||
* | |||
* Arguments: - poly *w: output polynomial | |||
* - const polyvecl *u: pointer to first input vector | |||
* - const polyvecl *v: pointer to second input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { | |||
PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM3_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length L. | |||
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(). | |||
* | |||
* Arguments: - const polyvecl *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/**************************************************************/ | |||
/************ Vectors of polynomials of length K **************/ | |||
/**************************************************************/ | |||
/* Samples the K polynomials of v with poly_uniform_eta; entry k uses the
 * nonce (nonce + k), wrapped to 16 bits. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < K; k++) {
        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&v->vec[k], seed, (uint16_t)(nonce + k));
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to representatives in [-6283009,6283007]. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq | |||
* | |||
* Description: For all coefficients of polynomials in vector of length K | |||
* add Q if coefficient is negative. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_add | |||
* | |||
* Description: Add vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first summand | |||
* - const polyveck *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_sub | |||
* | |||
* Description: Subtract vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first input vector | |||
* - const polyveck *v: pointer to second input vector to be | |||
* subtracted from first input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl | |||
* | |||
* Description: Multiply vector of polynomials of Length K by 2^D without modular | |||
* reduction. Assumes input coefficients to be less than 2^{31-D}. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length K. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by 2^{32} of polynomials | |||
* in vector of length K. Input coefficients need to be less | |||
* than 2*Q. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
/* For each of the K entries, r->vec[k] receives
 * poly_pointwise_montgomery(a, v->vec[k]). */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(r->vec + k, a, v->vec + k);
        k++;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length K. | |||
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(). | |||
* | |||
* Arguments: - const polyveck *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 | |||
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 | |||
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we | |||
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint | |||
* | |||
* Description: Compute hint vector. | |||
* | |||
* Arguments: - uint8_t *hint: pointer to output hint array | |||
* - const polyveck *v0: pointer to low part of input vector | |||
* - const polyveck *v1: pointer to high part of input vector | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { | |||
unsigned int i, n = 0; | |||
for (i = 0; i < K; ++i) { | |||
n += PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); | |||
} | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint | |||
* | |||
* Description: Use hint vector to correct the high bits of input vector. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector of polynomials with | |||
* corrected high bits | |||
* - const polyveck *u: pointer to input vector | |||
* - const polyveck *h: pointer to input hint vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); | |||
} | |||
} |
@@ -0,0 +1,72 @@ | |||
/*
 * Polynomial-vector types (length L and length K) of the Dilithium3 AVX2
 * implementation and prototypes of the routines that operate on them.
 * Most routines apply the corresponding single-polynomial routine from
 * poly.h to every entry of the vector.
 */
#ifndef PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
/* Vectors of polynomials of length L */ | |||
typedef struct { | |||
poly vec[L]; | |||
} polyvecl; | |||
/* Samplers: seed plus a 16-bit starting nonce. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
/* NOTE(review): poly_uniform_gamma1 in poly.h declares seed[CRHBYTES]; the
 * SEEDBYTES bound below looks inconsistent -- confirm. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v); | |||
/* w = u + v entry-wise, without modular reduction. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_invntt_tomont(polyvecl *v); | |||
/* Multiplies every entry of v pointwise by the single polynomial a. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); | |||
/* Accumulated pointwise product of u and v into the single polynomial w. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v); | |||
/* Returns 0 if every polynomial passes the norm check against B, 1 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); | |||
/* Vectors of polynomials of length K */ | |||
typedef struct { | |||
poly vec[K]; | |||
} polyveck; | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v); | |||
/* w = u + v and w = u - v entry-wise, without modular reduction. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); | |||
/* Returns 0 if every polynomial passes the norm check against B, 1 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); | |||
/* Split v into high part v1 and low part v0. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); | |||
/* Returns the total of the per-polynomial counts reported by poly_make_hint. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); | |||
/*
 * Matrix expansion from seed rho. Each expand_rowN helper takes the row it
 * completes (rowa) and the following row (rowb).
 * NOTE(review): expand_row6 and expand_row7 are declared here, but the
 * polyvec.c visible alongside this header only defines row0..row5 --
 * confirm whether these two prototypes are intentional.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
/* t->vec[i] = accumulated pointwise product of mat[i] and v, for each of the K rows. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); | |||
#endif |
@@ -0,0 +1,392 @@ | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/*
 * PQCLEAN_DILITHIUM3_AVX2_idxlut[m] lists, in ascending order, the positions
 * of the bits that are set in the 8-bit mask m; the remaining entries of the
 * row are 0. Rows are written without their trailing zeros: elements that are
 * not given explicitly are zero-initialized. Four consecutive masks per line,
 * the comment gives the mask of the first row on the line.
 */
const uint8_t PQCLEAN_DILITHIUM3_AVX2_idxlut[256][8] = {
    /* 0x00 */ {0}, {0}, {1}, {0, 1},
    /* 0x04 */ {2}, {0, 2}, {1, 2}, {0, 1, 2},
    /* 0x08 */ {3}, {0, 3}, {1, 3}, {0, 1, 3},
    /* 0x0C */ {2, 3}, {0, 2, 3}, {1, 2, 3}, {0, 1, 2, 3},
    /* 0x10 */ {4}, {0, 4}, {1, 4}, {0, 1, 4},
    /* 0x14 */ {2, 4}, {0, 2, 4}, {1, 2, 4}, {0, 1, 2, 4},
    /* 0x18 */ {3, 4}, {0, 3, 4}, {1, 3, 4}, {0, 1, 3, 4},
    /* 0x1C */ {2, 3, 4}, {0, 2, 3, 4}, {1, 2, 3, 4}, {0, 1, 2, 3, 4},
    /* 0x20 */ {5}, {0, 5}, {1, 5}, {0, 1, 5},
    /* 0x24 */ {2, 5}, {0, 2, 5}, {1, 2, 5}, {0, 1, 2, 5},
    /* 0x28 */ {3, 5}, {0, 3, 5}, {1, 3, 5}, {0, 1, 3, 5},
    /* 0x2C */ {2, 3, 5}, {0, 2, 3, 5}, {1, 2, 3, 5}, {0, 1, 2, 3, 5},
    /* 0x30 */ {4, 5}, {0, 4, 5}, {1, 4, 5}, {0, 1, 4, 5},
    /* 0x34 */ {2, 4, 5}, {0, 2, 4, 5}, {1, 2, 4, 5}, {0, 1, 2, 4, 5},
    /* 0x38 */ {3, 4, 5}, {0, 3, 4, 5}, {1, 3, 4, 5}, {0, 1, 3, 4, 5},
    /* 0x3C */ {2, 3, 4, 5}, {0, 2, 3, 4, 5}, {1, 2, 3, 4, 5}, {0, 1, 2, 3, 4, 5},
    /* 0x40 */ {6}, {0, 6}, {1, 6}, {0, 1, 6},
    /* 0x44 */ {2, 6}, {0, 2, 6}, {1, 2, 6}, {0, 1, 2, 6},
    /* 0x48 */ {3, 6}, {0, 3, 6}, {1, 3, 6}, {0, 1, 3, 6},
    /* 0x4C */ {2, 3, 6}, {0, 2, 3, 6}, {1, 2, 3, 6}, {0, 1, 2, 3, 6},
    /* 0x50 */ {4, 6}, {0, 4, 6}, {1, 4, 6}, {0, 1, 4, 6},
    /* 0x54 */ {2, 4, 6}, {0, 2, 4, 6}, {1, 2, 4, 6}, {0, 1, 2, 4, 6},
    /* 0x58 */ {3, 4, 6}, {0, 3, 4, 6}, {1, 3, 4, 6}, {0, 1, 3, 4, 6},
    /* 0x5C */ {2, 3, 4, 6}, {0, 2, 3, 4, 6}, {1, 2, 3, 4, 6}, {0, 1, 2, 3, 4, 6},
    /* 0x60 */ {5, 6}, {0, 5, 6}, {1, 5, 6}, {0, 1, 5, 6},
    /* 0x64 */ {2, 5, 6}, {0, 2, 5, 6}, {1, 2, 5, 6}, {0, 1, 2, 5, 6},
    /* 0x68 */ {3, 5, 6}, {0, 3, 5, 6}, {1, 3, 5, 6}, {0, 1, 3, 5, 6},
    /* 0x6C */ {2, 3, 5, 6}, {0, 2, 3, 5, 6}, {1, 2, 3, 5, 6}, {0, 1, 2, 3, 5, 6},
    /* 0x70 */ {4, 5, 6}, {0, 4, 5, 6}, {1, 4, 5, 6}, {0, 1, 4, 5, 6},
    /* 0x74 */ {2, 4, 5, 6}, {0, 2, 4, 5, 6}, {1, 2, 4, 5, 6}, {0, 1, 2, 4, 5, 6},
    /* 0x78 */ {3, 4, 5, 6}, {0, 3, 4, 5, 6}, {1, 3, 4, 5, 6}, {0, 1, 3, 4, 5, 6},
    /* 0x7C */ {2, 3, 4, 5, 6}, {0, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, {0, 1, 2, 3, 4, 5, 6},
    /* 0x80 */ {7}, {0, 7}, {1, 7}, {0, 1, 7},
    /* 0x84 */ {2, 7}, {0, 2, 7}, {1, 2, 7}, {0, 1, 2, 7},
    /* 0x88 */ {3, 7}, {0, 3, 7}, {1, 3, 7}, {0, 1, 3, 7},
    /* 0x8C */ {2, 3, 7}, {0, 2, 3, 7}, {1, 2, 3, 7}, {0, 1, 2, 3, 7},
    /* 0x90 */ {4, 7}, {0, 4, 7}, {1, 4, 7}, {0, 1, 4, 7},
    /* 0x94 */ {2, 4, 7}, {0, 2, 4, 7}, {1, 2, 4, 7}, {0, 1, 2, 4, 7},
    /* 0x98 */ {3, 4, 7}, {0, 3, 4, 7}, {1, 3, 4, 7}, {0, 1, 3, 4, 7},
    /* 0x9C */ {2, 3, 4, 7}, {0, 2, 3, 4, 7}, {1, 2, 3, 4, 7}, {0, 1, 2, 3, 4, 7},
    /* 0xA0 */ {5, 7}, {0, 5, 7}, {1, 5, 7}, {0, 1, 5, 7},
    /* 0xA4 */ {2, 5, 7}, {0, 2, 5, 7}, {1, 2, 5, 7}, {0, 1, 2, 5, 7},
    /* 0xA8 */ {3, 5, 7}, {0, 3, 5, 7}, {1, 3, 5, 7}, {0, 1, 3, 5, 7},
    /* 0xAC */ {2, 3, 5, 7}, {0, 2, 3, 5, 7}, {1, 2, 3, 5, 7}, {0, 1, 2, 3, 5, 7},
    /* 0xB0 */ {4, 5, 7}, {0, 4, 5, 7}, {1, 4, 5, 7}, {0, 1, 4, 5, 7},
    /* 0xB4 */ {2, 4, 5, 7}, {0, 2, 4, 5, 7}, {1, 2, 4, 5, 7}, {0, 1, 2, 4, 5, 7},
    /* 0xB8 */ {3, 4, 5, 7}, {0, 3, 4, 5, 7}, {1, 3, 4, 5, 7}, {0, 1, 3, 4, 5, 7},
    /* 0xBC */ {2, 3, 4, 5, 7}, {0, 2, 3, 4, 5, 7}, {1, 2, 3, 4, 5, 7}, {0, 1, 2, 3, 4, 5, 7},
    /* 0xC0 */ {6, 7}, {0, 6, 7}, {1, 6, 7}, {0, 1, 6, 7},
    /* 0xC4 */ {2, 6, 7}, {0, 2, 6, 7}, {1, 2, 6, 7}, {0, 1, 2, 6, 7},
    /* 0xC8 */ {3, 6, 7}, {0, 3, 6, 7}, {1, 3, 6, 7}, {0, 1, 3, 6, 7},
    /* 0xCC */ {2, 3, 6, 7}, {0, 2, 3, 6, 7}, {1, 2, 3, 6, 7}, {0, 1, 2, 3, 6, 7},
    /* 0xD0 */ {4, 6, 7}, {0, 4, 6, 7}, {1, 4, 6, 7}, {0, 1, 4, 6, 7},
    /* 0xD4 */ {2, 4, 6, 7}, {0, 2, 4, 6, 7}, {1, 2, 4, 6, 7}, {0, 1, 2, 4, 6, 7},
    /* 0xD8 */ {3, 4, 6, 7}, {0, 3, 4, 6, 7}, {1, 3, 4, 6, 7}, {0, 1, 3, 4, 6, 7},
    /* 0xDC */ {2, 3, 4, 6, 7}, {0, 2, 3, 4, 6, 7}, {1, 2, 3, 4, 6, 7}, {0, 1, 2, 3, 4, 6, 7},
    /* 0xE0 */ {5, 6, 7}, {0, 5, 6, 7}, {1, 5, 6, 7}, {0, 1, 5, 6, 7},
    /* 0xE4 */ {2, 5, 6, 7}, {0, 2, 5, 6, 7}, {1, 2, 5, 6, 7}, {0, 1, 2, 5, 6, 7},
    /* 0xE8 */ {3, 5, 6, 7}, {0, 3, 5, 6, 7}, {1, 3, 5, 6, 7}, {0, 1, 3, 5, 6, 7},
    /* 0xEC */ {2, 3, 5, 6, 7}, {0, 2, 3, 5, 6, 7}, {1, 2, 3, 5, 6, 7}, {0, 1, 2, 3, 5, 6, 7},
    /* 0xF0 */ {4, 5, 6, 7}, {0, 4, 5, 6, 7}, {1, 4, 5, 6, 7}, {0, 1, 4, 5, 6, 7},
    /* 0xF4 */ {2, 4, 5, 6, 7}, {0, 2, 4, 5, 6, 7}, {1, 2, 4, 5, 6, 7}, {0, 1, 2, 4, 5, 6, 7},
    /* 0xF8 */ {3, 4, 5, 6, 7}, {0, 3, 4, 5, 6, 7}, {1, 3, 4, 5, 6, 7}, {0, 1, 3, 4, 5, 6, 7},
    /* 0xFC */ {2, 3, 4, 5, 6, 7}, {0, 2, 3, 4, 5, 6, 7}, {1, 2, 3, 4, 5, 6, 7}, {0, 1, 2, 3, 4, 5, 6, 7}
};
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
__m256i d, tmp; | |||
const __m256i bound = _mm256_set1_epi32(Q); | |||
const __m256i mask = _mm256_set1_epi32(0x7FFFFF); | |||
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, | |||
-1, 9, 8, 7, -1, 6, 5, 4, | |||
-1, 11, 10, 9, -1, 8, 7, 6, | |||
-1, 5, 4, 3, -1, 2, 1, 0); | |||
ctr = pos = 0; | |||
while (pos <= REJ_UNIFORM_BUFLEN - 24) { | |||
d = _mm256_loadu_si256((__m256i *)&buf[pos]); | |||
d = _mm256_permute4x64_epi64(d, 0x94); | |||
d = _mm256_shuffle_epi8(d, idx8); | |||
d = _mm256_and_si256(d, mask); | |||
pos += 24; | |||
tmp = _mm256_sub_epi32(d, bound); | |||
good = _mm256_movemask_ps((__m256)tmp); | |||
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good])); | |||
d = _mm256_permutevar8x32_epi32(d, tmp); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], d); | |||
ctr += _mm_popcnt_u32(good); | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
} | |||
uint32_t t; | |||
while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) { | |||
t = buf[pos++]; | |||
t |= (uint32_t)buf[pos++] << 8; | |||
t |= (uint32_t)buf[pos++] << 16; | |||
t &= 0x7FFFFF; | |||
if (t < Q) { | |||
r[ctr++] = t; | |||
} | |||
} | |||
return ctr; | |||
} | |||
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
__m256i f0, f1; | |||
__m128i g0, g1; | |||
const __m256i mask = _mm256_set1_epi8(15); | |||
const __m256i eta = _mm256_set1_epi8(4); | |||
const __m256i bound = _mm256_set1_epi8(9); | |||
ctr = pos = 0; | |||
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { | |||
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); | |||
f1 = _mm256_slli_epi16(f0, 4); | |||
f0 = _mm256_or_si256(f0, f1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f1 = _mm256_sub_epi8(f0, bound); | |||
f0 = _mm256_sub_epi8(eta, f0); | |||
good = _mm256_movemask_epi8(f1); | |||
g0 = _mm256_castsi256_si128(f0); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm256_extracti128_si256(f0, 1); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 4; | |||
} | |||
uint32_t t0, t1; | |||
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { | |||
t0 = buf[pos] & 0x0F; | |||
t1 = buf[pos++] >> 4; | |||
if (t0 < 9) { | |||
r[ctr++] = 4 - t0; | |||
} | |||
if (t1 < 9 && ctr < N) { | |||
r[ctr++] = 4 - t1; | |||
} | |||
} | |||
return ctr; | |||
} |
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_AVX2_REJSAMPLE_H
#define PQCLEAN_DILITHIUM3_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* Buffer sizes for the rejection samplers, rounded up to a whole number of
 * STREAM128 blocks: at least 768 bytes for the uniform sampler and at least
 * 228 bytes for the eta sampler. */
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)

#define REJ_UNIFORM_ETA_NBLOCKS ((228+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)

/* Row m lists the positions of the set bits of m in increasing order,
 * zero-padded to 8 entries; used to compact accepted vector lanes. */
extern const uint8_t PQCLEAN_DILITHIUM3_AVX2_idxlut[256][8];

/* Samples up to N coefficients below Q from buf; returns how many were
 * written. buf carries 8 bytes of padding for the 32-byte vector loads. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);

/* Samples up to N small coefficients from buf; returns how many were
 * written. The declared length now matches the definition
 * (REJ_UNIFORM_ETA_BUFLEN, not REJ_UNIFORM_BUFLEN). */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]);

#endif
@@ -0,0 +1,154 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "rounding.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* 32-bit-lane blend built on _mm256_blendv_ps: the integer vectors are
 * reinterpreted as float vectors (bit casts only), blended, and cast back.
 * Per the blendv_ps semantics a lane comes from b when the sign bit of the
 * matching lane of mask is set, and from a otherwise. */
#define _mm256_blendv_epi32(a,b,mask) \ | |||
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ | |||
_mm256_castsi256_ps(b), \ | |||
_mm256_castsi256_ps(mask))) | |||
/************************************************* | |||
* Name: power2round | |||
* | |||
* Description: For finite field elements a, compute a0, a1 such that | |||
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be positive standard representative. | |||
* | |||
* Arguments: - __m256i *a1: output array of length N/8 with high bits | |||
* - __m256i *a0: output array of length N/8 with low bits a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1; | |||
const __m256i mask = _mm256_set1_epi32(-(1 << D)); | |||
const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); | |||
for (i = 0; i < N / 8; ++i) { | |||
f = _mm256_load_si256(&a[i]); | |||
f1 = _mm256_add_epi32(f, half); | |||
f0 = _mm256_and_si256(f1, mask); | |||
f1 = _mm256_srli_epi32(f1, D); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: decompose | |||
* | |||
* Description: For finite field element a, compute high and low parts a0, a1 such | |||
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and | |||
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard | |||
* representative. | |||
* | |||
* Arguments: - __m256i *a1: output array of length N/8 with high parts | |||
* - __m256i *a0: output array of length N/8 with low parts a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3_AVX2_qdata.vec[_8XQ / 8]); | |||
const __m256i hq = _mm256_srli_epi32(q, 1); | |||
const __m256i v = _mm256_set1_epi32(1025); | |||
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); | |||
const __m256i off = _mm256_set1_epi32(127); | |||
const __m256i shift = _mm256_set1_epi32(512); | |||
const __m256i mask = _mm256_set1_epi32(15); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a[i]); | |||
f1 = _mm256_add_epi32(f, off); | |||
f1 = _mm256_srli_epi32(f1, 7); | |||
f1 = _mm256_mulhi_epu16(f1, v); | |||
f1 = _mm256_mulhrs_epi16(f1, shift); | |||
f1 = _mm256_and_si256(f1, mask); | |||
f0 = _mm256_mullo_epi32(f1, alpha); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
f = _mm256_cmpgt_epi32(f0, hq); | |||
f = _mm256_and_si256(f, q); | |||
f0 = _mm256_sub_epi32(f0, f); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: make_hint | |||
* | |||
* Description: Compute indices of polynomial coefficients whose low bits | |||
* overflow into the high bits. | |||
* | |||
* Arguments: - uint8_t *hint: hint array | |||
* - const __m256i *a0: low bits of input elements | |||
* - const __m256i *a1: high bits of input elements | |||
* | |||
* Returns number of overflowing low bits | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { | |||
unsigned int i, n = 0; | |||
__m256i f0, f1, g0, g1; | |||
uint32_t bad; | |||
uint64_t idx; | |||
const __m256i low = _mm256_set1_epi32(-GAMMA2); | |||
const __m256i high = _mm256_set1_epi32(GAMMA2); | |||
for (i = 0; i < N / 8; ++i) { | |||
f0 = _mm256_load_si256(&a0[i]); | |||
f1 = _mm256_load_si256(&a1[i]); | |||
g0 = _mm256_abs_epi32(f0); | |||
g0 = _mm256_cmpgt_epi32(g0, high); | |||
g1 = _mm256_cmpeq_epi32(f0, low); | |||
g1 = _mm256_sign_epi32(g1, f1); | |||
g0 = _mm256_or_si256(g0, g1); | |||
bad = _mm256_movemask_ps((__m256)g0); | |||
memcpy(&idx, PQCLEAN_DILITHIUM3_AVX2_idxlut[bad], 8); | |||
idx += (uint64_t)0x0808080808080808 * i; | |||
memcpy(&hint[n], &idx, 8); | |||
n += _mm_popcnt_u32(bad); | |||
} | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: use_hint | |||
* | |||
* Description: Correct high parts according to hint. | |||
* | |||
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts | |||
* - const __m256i *a: input array of length N/8 | |||
* - const __m256i *a: input array of length N/8 with hint bits | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { | |||
unsigned int i; | |||
__m256i a0[N / 8]; | |||
__m256i f, g, h, t; | |||
const __m256i zero = _mm256_setzero_si256(); | |||
const __m256i mask = _mm256_set1_epi32(15); | |||
PQCLEAN_DILITHIUM3_AVX2_decompose_avx(b, a0, a); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a0[i]); | |||
g = _mm256_load_si256(&b[i]); | |||
h = _mm256_load_si256(&hint[i]); | |||
t = _mm256_blendv_epi32(zero, h, f); | |||
t = _mm256_slli_epi32(t, 1); | |||
h = _mm256_sub_epi32(h, t); | |||
g = _mm256_add_epi32(g, h); | |||
g = _mm256_and_si256(g, mask); | |||
_mm256_store_si256(&b[i], g); | |||
} | |||
} |
@@ -0,0 +1,12 @@ | |||
/* Vectorised rounding helpers. All __m256i arrays hold N/8 vectors of
 * eight 32-bit coefficients and are accessed with aligned loads/stores. */
#ifndef PQCLEAN_DILITHIUM3_AVX2_ROUNDING_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_ROUNDING_H | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/* a = a1*2^D + a0; a1 receives the high bits, a0 the low bits. */
void PQCLEAN_DILITHIUM3_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); | |||
/* a = a1*(2*GAMMA2) + a0; a1 receives the high parts, a0 the low parts. */
void PQCLEAN_DILITHIUM3_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); | |||
/* Writes the indices of flagged coefficients to hint; returns their count. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); | |||
/* b receives the high parts of a, corrected according to hint. */
void PQCLEAN_DILITHIUM3_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); | |||
#endif | |||
@@ -0,0 +1,54 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
.text | |||
/* Local helper: permutes, in place, the 256 bytes (eight 32-byte vectors,
 * i.e. 64 dwords) at (%rdi). The data is loaded into ymm4-ymm11, passed
 * through the shuffle8, shuffle4 and shuffle2 macro stages from
 * shuffle.inc (128-bit, 64-bit and 32-bit granularity respectively), and
 * stored back in a different register order. Uses ymm3-ymm11; %rdi is not
 * modified. All accesses are aligned (vmovdqa).
 * NOTE(review): the net effect looks like an 8x8 transpose of dwords -
 * confirm against the NTT coefficient layout. */
nttunpack128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
/* Stage 1: exchange 128-bit halves between vectors k and k+4. */
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
/* Stage 2: interleave 64-bit words. */
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
/* Stage 3: interleave 32-bit words (the first operand of shuffle2 is clobbered). */
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
#store | |||
vmovdqa %ymm9,(%rdi) | |||
vmovdqa %ymm8,32(%rdi) | |||
vmovdqa %ymm7,64(%rdi) | |||
vmovdqa %ymm6,96(%rdi) | |||
vmovdqa %ymm5,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm3,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
/* Exported entry point, emitted under both the cdecl() and _cdecl()
 * spellings of the symbol. Applies nttunpack128_avx to four consecutive
 * 256-byte chunks starting at the address in %rdi, i.e. 1024 bytes in
 * total. %rdi is advanced by 768 on return and ymm3-ymm11 are overwritten. */
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx) | |||
cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx): | |||
_cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx): | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
ret | |||
@@ -0,0 +1,25 @@ | |||
/* Interleaving macros. Arguments are ymm register numbers: r0 and r1 are
 * the inputs, r2 and r3 the outputs. Outputs may reuse input registers as
 * long as the instruction order below still reads each input before it is
 * overwritten. */

/* shuffle8: r2 = (low 128 bits of r0, low 128 bits of r1),
 *           r3 = (high 128 bits of r0, high 128 bits of r1). */
.macro shuffle8 r0,r1,r2,r3 | |||
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 | |||
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
/* shuffle4: within each 128-bit lane, r2 = (low qword of r0, low qword of r1),
 *           r3 = (high qword of r0, high qword of r1). */
.macro shuffle4 r0,r1,r2,r3 | |||
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 | |||
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
/* shuffle2: r2 takes the even dwords of r0 and r1 interleaved
 *           (r0[0], r1[0], r0[2], r1[2], ...), r3 the odd dwords
 *           (r0[1], r1[1], r0[3], r1[3], ...). r0 is clobbered by the
 *           in-place 32-bit right shift of each qword. */
.macro shuffle2 r0,r1,r2,r3 | |||
#vpsllq $32,%ymm\r1,%ymm\r2 | |||
vmovsldup %ymm\r1,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrlq $32,%ymm\r0,%ymm\r0 | |||
#vmovshdup %ymm\r0,%ymm\r0 | |||
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
/* shuffle1: same scheme as shuffle2 at 16-bit granularity; r2 takes the
 *           even words of r0 and r1 interleaved, r3 the odd words.
 *           r0 is clobbered. */
.macro shuffle1 r0,r1,r2,r3 | |||
vpslld $16,%ymm\r1,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrld $16,%ymm\r0,%ymm\r0 | |||
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
@@ -0,0 +1,425 @@ | |||
#include "align.h" | |||
#include "fips202.h" | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "sign.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* Expand row i (0..5) of the matrix from rho into the two-element scratch
 * buffer and point *row at it. Even rows land in buf[0], odd rows in
 * buf[1]; the other buffer element is handed to the row expander as its
 * second argument (presumably as work space or for the following row -
 * confirm against polyvec.c). For i > 5 nothing is called and *row is left
 * unchanged. */
static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) {
    polyvecl *const cur   = buf + (i & 1);
    polyvecl *const other = buf + ((i & 1) ^ 1);

    switch (i) {
    case 0:
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(cur, other, rho);
        break;
    case 1:
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(cur, other, rho);
        break;
    case 2:
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(cur, other, rho);
        break;
    case 3:
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(cur, other, rho);
        break;
    case 4:
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(cur, other, rho);
        break;
    case 5:
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(cur, other, rho);
        break;
    default:
        /* Unknown row: leave *row untouched. */
        return;
    }

    *row = cur;
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair | |||
* | |||
* Description: Generates public and private key. | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (allocated | |||
* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (allocated | |||
* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
unsigned int i; | |||
uint8_t seedbuf[3 * SEEDBYTES]; | |||
const uint8_t *rho, *rhoprime, *key; | |||
polyvecl rowbuf[2]; | |||
polyvecl s1, *row = rowbuf; | |||
polyveck s2; | |||
poly t1, t0; | |||
/* Get randomness for rho, rhoprime and key */ | |||
randombytes(seedbuf, SEEDBYTES); | |||
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); | |||
rho = seedbuf; | |||
rhoprime = seedbuf + SEEDBYTES; | |||
key = seedbuf + 2 * SEEDBYTES; | |||
/* Store rho, key */ | |||
memcpy(pk, rho, SEEDBYTES); | |||
memcpy(sk, rho, SEEDBYTES); | |||
memcpy(sk + SEEDBYTES, key, SEEDBYTES); | |||
/* Sample short vectors s1 and s2 */ | |||
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[4], &s2.vec[0], &s2.vec[1], &s2.vec[2], rhoprime, 4, 5, 6, 7); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s2.vec[3], &s2.vec[4], &s2.vec[5], &t0, rhoprime, 8, 9, 10, 11); | |||
/* Pack secret vectors */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); | |||
} | |||
for (i = 0; i < K; i++) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); | |||
} | |||
/* Transform s1 */ | |||
PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1); | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
polyvec_matrix_expand_row(&row, rowbuf, rho, i); | |||
/* Compute inner-product */ | |||
PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&t1); | |||
/* Add error polynomial */ | |||
PQCLEAN_DILITHIUM3_AVX2_poly_add(&t1, &t1, &s2.vec[i]); | |||
/* Round t and pack t1, t0 */ | |||
PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&t1); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_power2round(&t1, &t0, &t1); | |||
PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); | |||
PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); | |||
} | |||
/* Compute CRH(rho, t1) and store in secret key */ | |||
crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
return 0; | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature
*
* Description: Computes signature. The signature buffer is laid out as
*              challenge seed (SEEDBYTES) | L packed z polynomials |
*              hint block (OMEGA index bytes followed by K running counts).
*              Candidates are regenerated (label rej) until every norm and
*              hint-count check passes.
*
* Arguments:   - uint8_t *sig:   pointer to output signature (of length PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES);
*                                also used as scratch for the packed w1
*              - size_t *siglen: pointer to output length of signature
*              - uint8_t *m:     pointer to message to be signed
*              - size_t mlen:    length of message
*              - uint8_t *sk:    pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
unsigned int i, n, pos; | |||
/* Contiguous scratch: rho | tr | key | mu | rhoprime (see pointer setup below) */
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; | |||
uint8_t *rho, *tr, *key, *mu, *rhoprime; | |||
uint8_t hintbuf[N]; | |||
/* Hint block sits after the challenge seed and the L packed z polynomials */
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
uint64_t nonce = 0; | |||
polyvecl mat[K], s1, z; | |||
polyveck t0, s2, w1; | |||
poly c, tmp; | |||
/* y (NTT copy of the mask) and w0 are never needed at the same time, so they share storage */
union { | |||
polyvecl y; | |||
polyveck w0; | |||
} tmpv; | |||
shake256incctx state; | |||
rho = seedbuf; | |||
tr = rho + SEEDBYTES; | |||
key = tr + CRHBYTES; | |||
mu = key + SEEDBYTES; | |||
rhoprime = mu + CRHBYTES; | |||
PQCLEAN_DILITHIUM3_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); | |||
/* Compute CRH(tr, msg) */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, tr, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* key and mu are adjacent in seedbuf, so this hashes key || mu into rhoprime */
crh(rhoprime, key, SEEDBYTES + CRHBYTES); | |||
/* Expand matrix and transform vectors */ | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&s2); | |||
PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t0); | |||
/* Restart point for every rejection; nonce keeps increasing, so each attempt samples a fresh y */
rej: | |||
/* Sample intermediate vector y */ | |||
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3], | |||
rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(&z.vec[4], rhoprime, nonce + 4); | |||
nonce += 5; | |||
/* Matrix-vector product */ | |||
/* z keeps y in the normal domain; the copy in tmpv.y is transformed */
tmpv.y = z; | |||
PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&tmpv.y); | |||
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); | |||
PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(&w1); | |||
/* Decompose w and call the random oracle */ | |||
PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(&w1); | |||
/* In place: w1 becomes the high parts, tmpv.w0 (overwriting y) the low parts */
PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); | |||
/* sig is borrowed as scratch for the packed w1 and then overwritten with the challenge seed */
PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(sig, &w1); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(sig, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&c); | |||
/* Compute z, reject if it reveals secret */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&z.vec[i]); | |||
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { | |||
goto rej; | |||
} | |||
} | |||
/* Zero hint vector in signature */ | |||
pos = 0; | |||
memset(hint, 0, OMEGA); | |||
for (i = 0; i < K; i++) { | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&tmpv.w0.vec[i]); | |||
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
/* Compute hints */ | |||
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&tmp); | |||
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&tmp, GAMMA2)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM3_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
n = PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); | |||
/* The total number of hint indices over all K polynomials may not exceed OMEGA */
if (pos + n > OMEGA) { | |||
goto rej; | |||
} | |||
/* Store hints in signature */ | |||
memcpy(&hint[pos], hintbuf, n); | |||
/* Byte OMEGA + i records the running total of indices after polynomial i */
hint[OMEGA + i] = pos = pos + n; | |||
} | |||
/* Pack z into signature */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); | |||
} | |||
*siglen = PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign | |||
* | |||
* Description: Compute signed message. | |||
* | |||
* Arguments: - uint8_t *sm: pointer to output signed message (allocated | |||
* array with PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + mlen bytes), | |||
* can be equal to m | |||
* - size_t *smlen: pointer to output length of signed | |||
* message | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
size_t i; | |||
for (i = 0; i < mlen; ++i) { | |||
sm[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; | |||
} | |||
PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, mlen, sk); | |||
*smlen += mlen; | |||
return 0; | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify
*
* Description: Verifies signature. The packed w1 is recomputed row by row
*              from z, the challenge, t1 and the hints, hashed together
*              with mu, and the result is compared with the challenge seed
*              at the start of the signature.
*
* Arguments:   - const uint8_t *sig: pointer to input signature
*              - size_t siglen: length of signature
*              - const uint8_t *m: pointer to message
*              - size_t mlen: length of message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signature could be verified correctly and -1 otherwise
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
unsigned int i, j, pos = 0; | |||
/* PQCLEAN_DILITHIUM3_AVX2_polyw1_pack writes additional 14 bytes */ | |||
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; | |||
uint8_t mu[CRHBYTES]; | |||
/* Hint block sits after the challenge seed and the L packed z polynomials */
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
polyvecl rowbuf[2]; | |||
polyvecl *row = rowbuf; | |||
polyvecl z; | |||
poly c, w1, h; | |||
shake256incctx state; | |||
/* Only signatures of exactly the fixed length are accepted */
if (siglen != PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
/* Compute CRH(CRH(rho, t1), msg) */ | |||
crh(mu, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
/* The hash context is released before any early return below */
shake256_inc_ctx_release(&state); | |||
/* Expand PQCLEAN_DILITHIUM3_AVX2_challenge */ | |||
PQCLEAN_DILITHIUM3_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&c); | |||
/* Unpack z; shortness follows from unpacking */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&z.vec[i]); | |||
} | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
/* The first SEEDBYTES bytes of pk are passed as the matrix seed rho */
polyvec_matrix_expand_row(&row, rowbuf, pk, i); | |||
/* Compute i-th row of Az - c2^Dt1 */ | |||
PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); | |||
PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(&h); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&h); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&h, &c, &h); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_sub(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&w1); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&w1); | |||
/* Get hint polynomial and reconstruct w1 */ | |||
/* h is reused: from here on it holds the 0/1 hint polynomial of row i */
memset(h.vec, 0, sizeof(poly)); | |||
/* Running totals must be non-decreasing and at most OMEGA */
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { | |||
return -1; | |||
} | |||
for (j = pos; j < hint[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > pos && hint[j] <= hint[j - 1]) { | |||
return -1; | |||
} | |||
h.coeffs[hint[j]] = 1; | |||
} | |||
pos = hint[OMEGA + i]; | |||
PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&w1); | |||
PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = pos; j < OMEGA; ++j) { | |||
if (hint[j]) { | |||
return -1; | |||
} | |||
} | |||
/* Call random oracle and verify PQCLEAN_DILITHIUM3_AVX2_challenge */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
/* buf is reused to hold the recomputed challenge seed */
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
if (buf.coeffs[i] != sig[i]) { | |||
return -1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open | |||
* | |||
* Description: Verify signed message. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output message (allocated | |||
* array with smlen bytes), can be equal to sm | |||
* - size_t *mlen: pointer to output length of message | |||
* - const uint8_t *sm: pointer to signed message | |||
* - size_t smlen: length of signed message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { | |||
size_t i; | |||
if (smlen < PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) { | |||
goto badsig; | |||
} | |||
*mlen = smlen - PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, *mlen, pk)) { | |||
goto badsig; | |||
} else { | |||
/* All good, copy msg, return 0 */ | |||
for (i = 0; i < *mlen; ++i) { | |||
m[i] = sm[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + i]; | |||
} | |||
return 0; | |||
} | |||
badsig: | |||
/* Signature verification failed */ | |||
*mlen = -1; | |||
for (i = 0; i < smlen; ++i) { | |||
m[i] = 0; | |||
} | |||
return -1; | |||
} |
@@ -0,0 +1,29 @@ | |||
/* Signature API of the Dilithium3 AVX2 implementation. */
#ifndef PQCLEAN_DILITHIUM3_AVX2_SIGN_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_SIGN_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* NOTE(review): sign.c calls PQCLEAN_DILITHIUM3_AVX2_poly_challenge rather
 * than this symbol - confirm whether this declaration still has a definition. */
void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]); | |||
/* Generates a key pair into pk and sk; returns 0. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); | |||
/* Writes a detached signature of m to sig and its length to *siglen; returns 0. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk); | |||
/* Writes signature followed by message to sm and the total length to *smlen; returns 0. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk); | |||
/* Returns 0 if sig is a valid signature of m under pk, -1 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *pk); | |||
/* Verifies the signed message sm; on success copies the message to m, sets
 * *mlen and returns 0, otherwise returns -1. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, | |||
const uint8_t *pk); | |||
#endif | |||
@@ -0,0 +1,26 @@ | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/* Initialise a SHAKE128 stream: absorb the SEEDBYTES-byte seed followed by
 * the 16-bit nonce in little-endian byte order, then finalize so the state
 * is ready for squeezing. */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake128_inc_finalize(state);
}
/* Initialise a SHAKE256 stream: absorb the CRHBYTES-byte seed followed by
 * the 16-bit nonce in little-endian byte order, then finalize so the state
 * is ready for squeezing. */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake256_inc_finalize(state);
}
@@ -0,0 +1,36 @@ | |||
/* Symmetric primitives of the scheme, mapped onto the SHAKE functions
 * from fips202.h. */
#ifndef PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H | |||
#define PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
/* Stream states are the incremental SHAKE contexts */
typedef shake128incctx stream128_state; | |||
typedef shake256incctx stream256_state; | |||
/* Absorb seed (SEEDBYTES) and a 16-bit nonce into a SHAKE128 context */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(shake128incctx *state, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce); | |||
/* Absorb seed (CRHBYTES) and a 16-bit nonce into a SHAKE256 context */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(shake256incctx *state, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce); | |||
/* One stream block is one SHAKE rate block */
#define STREAM128_BLOCKBYTES SHAKE128_RATE | |||
#define STREAM256_BLOCKBYTES SHAKE256_RATE | |||
/* crh: SHAKE256 of IN (INBYTES bytes) with CRHBYTES bytes of output */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) | |||
#define stream128_init(STATE, SEED, NONCE) \ | |||
PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE) | |||
/* Squeezes OUTBLOCKS whole rate blocks into OUT */
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) | |||
#define stream128_release(STATE) shake128_inc_ctx_release(STATE) | |||
#define stream256_init(STATE, SEED, NONCE) \ | |||
PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE) | |||
/* Squeezes OUTBLOCKS whole rate blocks into OUT */
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) | |||
#define stream256_release(STATE) shake256_inc_ctx_release(STATE) | |||
#endif | |||
@@ -0,0 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -0,0 +1,23 @@ | |||
# This Makefile can be used with Microsoft Visual Studio's nmake using the command:
#    nmake /f Makefile.Microsoft_nmake

# Static library produced by this directory and the objects it is built from.
LIBRARY=libdilithium3_clean.lib
OBJECTS=ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as
# long as there has been a C standard, and we need it for constant-time
# computations. Thus, we disable that spurious warning.
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146

all: $(LIBRARY)

# Make sure objects are recompiled if headers change.
$(OBJECTS): *.h

# Command lines must start with whitespace for NMAKE to treat them as
# commands of the preceding description block.
$(LIBRARY): $(OBJECTS)
	LIB.EXE /NOLOGO /WX /OUT:$@ $**

# The leading '-' lets the build continue when a file to delete is missing.
clean:
	-DEL $(OBJECTS)
	-DEL $(LIBRARY)
@@ -0,0 +1,32 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_API_H
#define PQCLEAN_DILITHIUM3_CLEAN_API_H

#include <stddef.h>
#include <stdint.h>

/* Byte lengths of the public key, the secret key and a signature. */
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES 1952
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES 4016
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES 3293

/* Human-readable algorithm identifier. */
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3"

/*
 * Public entry points. The definitions are not part of this header; the
 * int return value is the status reported by the implementation.
 */

/* Key generation: writes a key pair to pk / sk. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature over m[0..mlen) under sk; length reported via siglen. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen,
                                                   const uint8_t *m, size_t mlen,
                                                   const uint8_t *sk);

/* Check a detached signature over m[0..mlen) under pk. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen,
                                                const uint8_t *m, size_t mlen,
                                                const uint8_t *pk);

/* Signed-message variant: output sm / smlen produced from m under sk. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
                                         const uint8_t *m, size_t mlen,
                                         const uint8_t *sk);

/* Signed-message variant: output m / mlen produced from sm under pk. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
                                              const uint8_t *sm, size_t smlen,
                                              const uint8_t *pk);

#endif
@@ -0,0 +1,98 @@ | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/*
 * Multiplier table shared by the forward and inverse transforms below.
 * The forward NTT reads entries 1..N-1 in increasing order (zetas[++k]);
 * the inverse NTT reads the same entries in decreasing order and negates
 * them (-zetas[--k]). Entry 0 is not read by either loop.
 * NOTE(review): the products are fed to montgomery_reduce, so the entries
 * are presumably stored in Montgomery representation -- confirm against
 * the script that generated the table.
 */
static const int32_t zetas[N] = {
    0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
    1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103,
    2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549,
    -2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005,
    2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
    -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
    -1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
    811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
    -3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221,
    -1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922,
    3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047,
    -671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430,
    -3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618,
    -3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856,
    189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330,
    1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961,
    2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462,
    266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378,
    900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500,
    -655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838,
    342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044,
    2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974,
    -3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970,
    -1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642,
    -1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031,
    -542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993,
    -2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385,
    -3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107,
    -3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078,
    -426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893,
    -2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687,
    -554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782
};
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_ntt | |||
* | |||
* Description: Forward NTT, in-place. No modular reduction is performed after | |||
* additions or subtractions. Output vector is in bitreversed order. | |||
* | |||
* Arguments: - uint32_t p[N]: input/output coefficient array | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_ntt(int32_t a[N]) { | |||
unsigned int len, start, j, k; | |||
int32_t zeta, t; | |||
k = 0; | |||
for (len = 128; len > 0; len >>= 1) { | |||
for (start = 0; start < N; start = j + len) { | |||
zeta = zetas[++k]; | |||
for (j = start; j < start + len; ++j) { | |||
t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); | |||
a[j + len] = a[j] - t; | |||
a[j] = a[j] + t; | |||
} | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by Montgomery factor 2^32. | |||
* In-place. No modular reductions after additions or | |||
* subtractions; input coefficients need to be smaller than | |||
* Q in absolute value. Output coefficient are smaller than Q in | |||
* absolute value. | |||
* | |||
* Arguments: - uint32_t p[N]: input/output coefficient array | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(int32_t a[N]) { | |||
unsigned int start, len, j, k; | |||
int32_t t, zeta; | |||
const int32_t f = 41978; // mont^2/256 | |||
k = 256; | |||
for (len = 1; len < N; len <<= 1) { | |||
for (start = 0; start < N; start = j + len) { | |||
zeta = -zetas[--k]; | |||
for (j = start; j < start + len; ++j) { | |||
t = a[j]; | |||
a[j] = t + a[j + len]; | |||
a[j + len] = t - a[j + len]; | |||
a[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); | |||
} | |||
} | |||
} | |||
for (j = 0; j < N; ++j) { | |||
a[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)f * a[j]); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_NTT_H
#define PQCLEAN_DILITHIUM3_CLEAN_NTT_H

#include "params.h"
#include <stdint.h>

/* In-place forward transform of the N coefficients in a. */
void PQCLEAN_DILITHIUM3_CLEAN_ntt(int32_t a[N]);

/* In-place inverse transform of the N coefficients in a. */
void PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(int32_t a[N]);

#endif
@@ -0,0 +1,261 @@ | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_pk | |||
* | |||
* Description: Bit-pack public key pk = (rho, t1). | |||
* | |||
* Arguments: - uint8_t pk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const polyveck *t1: pointer to vector t1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
pk[i] = rho[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_pk | |||
* | |||
* Description: Unpack public key pk = (rho, t1). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const polyveck *t1: pointer to output vector t1 | |||
* - uint8_t pk[]: byte array containing bit-packed pk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = pk[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sk | |||
* | |||
* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - uint8_t sk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const uint8_t tr[]: byte array containing tr | |||
* - const uint8_t key[]: byte array containing key | |||
* - const polyveck *t0: pointer to vector t0 | |||
* - const polyvecl *s1: pointer to vector s1 | |||
* - const polyveck *s2: pointer to vector s2 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = rho[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = key[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
sk[i] = tr[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sk | |||
* | |||
* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const uint8_t tr[]: output byte array for tr | |||
* - const uint8_t key[]: output byte array for key | |||
* - const polyveck *t0: pointer to output vector t0 | |||
* - const polyvecl *s1: pointer to output vector s1 | |||
* - const polyveck *s2: pointer to output vector s2 | |||
* - uint8_t sk[]: byte array containing bit-packed sk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
key[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
tr[i] = sk[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sig | |||
* | |||
* Description: Bit-pack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t sig[]: output byte array | |||
* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM3_CLEAN_challenge hash length SEEDBYTES | |||
* - const polyvecl *z: pointer to vector z | |||
* - const polyveck *h: pointer to hint vector h | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES], | |||
const uint8_t c[SEEDBYTES], | |||
const polyvecl *z, | |||
const polyveck *h) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sig[i] = c[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Encode h */ | |||
for (i = 0; i < OMEGA + K; ++i) { | |||
sig[i] = 0; | |||
} | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
if (h->vec[i].coeffs[j] != 0) { | |||
sig[k++] = (uint8_t) j; | |||
} | |||
} | |||
sig[OMEGA + i] = (uint8_t) k; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sig | |||
* | |||
* Description: Unpack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM3_CLEAN_challenge hash | |||
* - polyvecl *z: pointer to output vector z | |||
* - polyveck *h: pointer to output hint vector h | |||
* - const uint8_t sig[]: byte array containing | |||
* bit-packed signature | |||
* | |||
* Returns 1 in case of malformed signature; otherwise 0. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], | |||
polyvecl *z, | |||
polyveck *h, | |||
const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
c[i] = sig[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Decode h */ | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
h->vec[i].coeffs[j] = 0; | |||
} | |||
if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { | |||
return 1; | |||
} | |||
for (j = k; j < sig[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > k && sig[j] <= sig[j - 1]) { | |||
return 1; | |||
} | |||
h->vec[i].coeffs[sig[j]] = 1; | |||
} | |||
k = sig[OMEGA + i]; | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = k; j < OMEGA; ++j) { | |||
if (sig[j]) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,31 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_PACKING_H
#define PQCLEAN_DILITHIUM3_CLEAN_PACKING_H

#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* ---- Serialisation ---- */

/* Write rho followed by the packed t1 vector into pk. */
void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES],
                                      const uint8_t rho[SEEDBYTES],
                                      const polyveck *t1);

/* Write rho, key, tr and the packed s1, s2, t0 vectors into sk. */
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES],
                                      const uint8_t rho[SEEDBYTES],
                                      const uint8_t tr[CRHBYTES],
                                      const uint8_t key[SEEDBYTES],
                                      const polyveck *t0,
                                      const polyvecl *s1,
                                      const polyveck *s2);

/* Write c, the packed z vector and the encoded hint h into sig. */
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES],
                                       const uint8_t c[SEEDBYTES],
                                       const polyvecl *z,
                                       const polyveck *h);

/* ---- Deserialisation ---- */

/* Inverse of pack_pk. */
void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES],
                                        polyveck *t1,
                                        const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]);

/* Inverse of pack_sk. */
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
                                        uint8_t tr[CRHBYTES],
                                        uint8_t key[SEEDBYTES],
                                        polyveck *t0,
                                        polyvecl *s1,
                                        polyveck *s2,
                                        const uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES]);

/* Inverse of pack_sig; returns 1 for a malformed hint encoding, else 0. */
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(uint8_t c[SEEDBYTES],
                                        polyvecl *z,
                                        polyveck *h,
                                        const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]);

#endif
@@ -0,0 +1,41 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H
#define PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H

/* Byte lengths of seeds and of the crh() output. */
#define SEEDBYTES 32
#define CRHBYTES 48

/* Ring parameters: N coefficients per polynomial, modulus Q. */
#define N 256
#define Q 8380417
/* Number of low bits split off by power2round / restored by shiftl. */
#define D 13
#define ROOT_OF_UNITY 1753

/* Vector dimensions (polyveck has K entries, polyvecl has L entries). */
#define K 6
#define L 5
/* Coefficients sampled by poly_uniform_eta lie in [-ETA, ETA]. */
#define ETA 4
#define TAU 49
#define BETA 196
#define GAMMA1 (1 << 19)
#define GAMMA2 ((Q-1)/32)
/* Maximum number of hint indices stored in a signature. */
#define OMEGA 55

#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3"

/* Packed sizes, in bytes, of a single polynomial / of the hint vector. */
#define POLYT1_PACKEDBYTES  320
#define POLYT0_PACKEDBYTES  416
#define POLYVECH_PACKEDBYTES (OMEGA + K)
#define POLYZ_PACKEDBYTES   640
#define POLYW1_PACKEDBYTES  128
#define POLYETA_PACKEDBYTES 128

/* Sizes of the serialised keys and signature, derived from the above. */
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
        + L*POLYETA_PACKEDBYTES \
        + K*POLYETA_PACKEDBYTES \
        + K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif
@@ -0,0 +1,818 @@ | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "reduce.h" | |||
#include "rounding.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/* Benchmark hooks used throughout this file; both expand to nothing here. */
#define DBENCH_START()
#define DBENCH_STOP(t)
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_reduce | |||
* | |||
* Description: Inplace reduction of all coefficients of polynomial to | |||
* representative in [-6283009,6283007]. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_reduce32(a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_caddq | |||
* | |||
* Description: For all coefficients of in/out polynomial add Q if | |||
* coefficient is negative. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_caddq(a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_freeze | |||
* | |||
* Description: Inplace reduction of all coefficients of polynomial to | |||
* standard representatives. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_freeze(a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_add | |||
* | |||
* Description: Add polynomials. No modular reduction is performed. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first summand | |||
* - const poly *b: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; | |||
} | |||
DBENCH_STOP(*tadd); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_sub | |||
* | |||
* Description: Subtract polynomials. No modular reduction is | |||
* performed. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial to be | |||
* subtraced from first input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; | |||
} | |||
DBENCH_STOP(*tadd); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl | |||
* | |||
* Description: Multiply polynomial by 2^D without modular reduction. Assumes | |||
* input coefficients to be less than 2^{31-D} in absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a->coeffs[i] <<= D; | |||
} | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_ntt | |||
* | |||
* Description: Inplace forward NTT. Coefficients can grow by | |||
* 8*Q in absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM3_CLEAN_ntt(a->coeffs); | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont | |||
* | |||
* Description: Inplace inverse NTT and multiplication by 2^{32}. | |||
* Input coefficients need to be less than Q in absolute | |||
* value and output coefficients are again bounded by Q. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(a->coeffs); | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery | |||
* | |||
* Description: Pointwise multiplication of polynomials in NTT domain | |||
* representation and multiplication of resulting polynomial | |||
* by 2^{-32}. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_power2round | |||
* | |||
* Description: For all coefficients c of the input polynomial, | |||
* compute c0, c1 such that c mod Q = c1*2^D + c0 | |||
* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 | |||
* - poly *a0: pointer to output polynomial with coefficients c0 | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_decompose | |||
* | |||
* Description: For all coefficients c of the input polynomial, | |||
* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 | |||
* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we | |||
* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
* | |||
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 | |||
* - poly *a0: pointer to output polynomial with coefficients c0 | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint | |||
* | |||
* Description: Compute hint polynomial. The coefficients of which indicate | |||
* whether the low bits of the corresponding coefficient of | |||
* the input polynomial overflow into the high bits. | |||
* | |||
* Arguments: - poly *h: pointer to output hint polynomial | |||
* - const poly *a0: pointer to low part of input polynomial | |||
* - const poly *a1: pointer to high part of input polynomial | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { | |||
unsigned int i, s = 0; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
h->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); | |||
s += h->coeffs[i]; | |||
} | |||
DBENCH_STOP(*tround); | |||
return s; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint | |||
* | |||
* Description: Use hint polynomial to correct the high bits of a polynomial. | |||
* | |||
* Arguments: - poly *b: pointer to output polynomial with corrected high bits | |||
* - const poly *a: pointer to input polynomial | |||
* - const poly *h: pointer to input hint polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N; ++i) { | |||
b->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); | |||
} | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm | |||
* | |||
* Description: Check infinity norm of polynomial against given bound. | |||
* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM3_CLEAN_reduce32(). | |||
* | |||
* Arguments: - const poly *a: pointer to polynomial | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, int32_t B) { | |||
unsigned int i; | |||
int32_t t; | |||
DBENCH_START(); | |||
if (B > (Q - 1) / 8) { | |||
return 1; | |||
} | |||
/* It is ok to leak which coefficient violates the bound since | |||
the probability for each coefficient is independent of secret | |||
data but we must not leak the sign of the centralized representative. */ | |||
for (i = 0; i < N; ++i) { | |||
/* Absolute value */ | |||
t = a->coeffs[i] >> 31; | |||
t = a->coeffs[i] - (t & 2 * a->coeffs[i]); | |||
if (t >= B) { | |||
DBENCH_STOP(*tsample); | |||
return 1; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: rej_uniform | |||
* | |||
* Description: Sample uniformly random coefficients in [0, Q-1] by | |||
* performing rejection sampling on array of random bytes. | |||
* | |||
* Arguments: - int32_t *a: pointer to output array (allocated) | |||
* - unsigned int len: number of coefficients to be sampled | |||
* - const uint8_t *buf: array of random bytes | |||
* - unsigned int buflen: length of array of random bytes | |||
* | |||
* Returns number of sampled coefficients. Can be smaller than len if not enough | |||
* random bytes were given. | |||
**************************************************/ | |||
static unsigned int rej_uniform(int32_t *a, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint32_t t; | |||
DBENCH_START(); | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 3 <= buflen) { | |||
t = buf[pos++]; | |||
t |= (uint32_t)buf[pos++] << 8; | |||
t |= (uint32_t)buf[pos++] << 16; | |||
t &= 0x7FFFFF; | |||
if (t < Q) { | |||
a[ctr++] = t; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return ctr; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_uniform | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [0,Q-1] by performing rejection sampling on the | |||
* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES | |||
* - uint16_t nonce: 2-byte nonce | |||
**************************************************/ | |||
#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce) { | |||
unsigned int i, ctr, off; | |||
unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; | |||
uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; | |||
stream128_state state; | |||
stream128_init(&state, seed, nonce); | |||
stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); | |||
ctr = rej_uniform(a->coeffs, N, buf, buflen); | |||
while (ctr < N) { | |||
off = buflen % 3; | |||
for (i = 0; i < off; ++i) { | |||
buf[i] = buf[buflen - off + i]; | |||
} | |||
stream128_squeezeblocks(buf + off, 1, &state); | |||
buflen = STREAM128_BLOCKBYTES + off; | |||
ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); | |||
} | |||
stream128_release(&state); | |||
} | |||
/************************************************* | |||
* Name: rej_eta | |||
* | |||
* Description: Sample uniformly random coefficients in [-ETA, ETA] by | |||
* performing rejection sampling on array of random bytes. | |||
* | |||
* Arguments: - int32_t *a: pointer to output array (allocated) | |||
* - unsigned int len: number of coefficients to be sampled | |||
* - const uint8_t *buf: array of random bytes | |||
* - unsigned int buflen: length of array of random bytes | |||
* | |||
* Returns number of sampled coefficients. Can be smaller than len if not enough | |||
* random bytes were given. | |||
**************************************************/ | |||
static unsigned int rej_eta(int32_t *a, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint32_t t0, t1; | |||
DBENCH_START(); | |||
ctr = pos = 0; | |||
while (ctr < len && pos < buflen) { | |||
t0 = buf[pos] & 0x0F; | |||
t1 = buf[pos++] >> 4; | |||
if (t0 < 9) { | |||
a[ctr++] = 4 - t0; | |||
} | |||
if (t1 < 9 && ctr < len) { | |||
a[ctr++] = 4 - t1; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return ctr; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [-ETA,ETA] by performing rejection sampling on the | |||
* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES | |||
* - uint16_t nonce: 2-byte nonce | |||
**************************************************/ | |||
#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce) { | |||
unsigned int ctr; | |||
unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; | |||
uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; | |||
stream128_state state; | |||
stream128_init(&state, seed, nonce); | |||
stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); | |||
ctr = rej_eta(a->coeffs, N, buf, buflen); | |||
while (ctr < N) { | |||
stream128_squeezeblocks(buf, 1, &state); | |||
ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); | |||
} | |||
stream128_release(&state); | |||
} | |||
/************************************************* | |||
* Name: poly_uniform_gamma1m1 | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream | |||
* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length CRHBYTES | |||
* - uint16_t nonce: 16-bit nonce | |||
**************************************************/ | |||
#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(poly *a, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce) { | |||
uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES]; | |||
stream256_state state; | |||
stream256_init(&state, seed, nonce); | |||
stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); | |||
stream256_release(&state); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(a, buf); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_challenge | |||
* | |||
* Description: Implementation of H. Samples polynomial with TAU nonzero | |||
* coefficients in {-1,1} using the output stream of | |||
* SHAKE256(seed). | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const uint8_t mu[]: byte array containing seed of length SEEDBYTES | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { | |||
unsigned int i, b, pos; | |||
uint64_t signs; | |||
uint8_t buf[SHAKE256_RATE]; | |||
shake256incctx state; | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, seed, SEEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(buf, sizeof buf, &state); | |||
signs = 0; | |||
for (i = 0; i < 8; ++i) { | |||
signs |= (uint64_t)buf[i] << 8 * i; | |||
} | |||
pos = 8; | |||
for (i = 0; i < N; ++i) { | |||
c->coeffs[i] = 0; | |||
} | |||
for (i = N - TAU; i < N; ++i) { | |||
do { | |||
if (pos >= SHAKE256_RATE) { | |||
shake256_inc_squeeze(buf, sizeof buf, &state); | |||
pos = 0; | |||
} | |||
b = buf[pos++]; | |||
} while (b > i); | |||
c->coeffs[i] = c->coeffs[b]; | |||
c->coeffs[b] = 1 - 2 * (signs & 1); | |||
signs >>= 1; | |||
} | |||
shake256_inc_ctx_release(&state); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack | |||
* | |||
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYETA_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
uint8_t t[8]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 2; ++i) { | |||
t[0] = (uint8_t) (ETA - a->coeffs[2 * i + 0]); | |||
t[1] = (uint8_t) (ETA - a->coeffs[2 * i + 1]); | |||
r[i] = t[0] | (t[1] << 4); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack | |||
* | |||
* Description: Unpack polynomial with coefficients in [-ETA,ETA]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 2; ++i) { | |||
r->coeffs[2 * i + 0] = a[i] & 0x0F; | |||
r->coeffs[2 * i + 1] = a[i] >> 4; | |||
r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; | |||
r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack | |||
* | |||
* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. | |||
* Input coefficients are assumed to be standard representatives. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYT1_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); | |||
r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); | |||
r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); | |||
r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); | |||
r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack | |||
* | |||
* Description: Unpack polynomial t1 with 10-bit coefficients. | |||
* Output coefficients are standard representatives. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; | |||
r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; | |||
r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; | |||
r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack | |||
* | |||
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYT0_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
uint32_t t[8]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; | |||
t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; | |||
t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; | |||
t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; | |||
t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; | |||
t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; | |||
t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; | |||
t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; | |||
r[13 * i + 0] = (uint8_t) t[0]; | |||
r[13 * i + 1] = (uint8_t) (t[0] >> 8); | |||
r[13 * i + 1] |= (uint8_t) (t[1] << 5); | |||
r[13 * i + 2] = (uint8_t) (t[1] >> 3); | |||
r[13 * i + 3] = (uint8_t) (t[1] >> 11); | |||
r[13 * i + 3] |= (uint8_t) (t[2] << 2); | |||
r[13 * i + 4] = (uint8_t) (t[2] >> 6); | |||
r[13 * i + 4] |= (uint8_t) (t[3] << 7); | |||
r[13 * i + 5] = (uint8_t) (t[3] >> 1); | |||
r[13 * i + 6] = (uint8_t) (t[3] >> 9); | |||
r[13 * i + 6] |= (uint8_t) (t[4] << 4); | |||
r[13 * i + 7] = (uint8_t) (t[4] >> 4); | |||
r[13 * i + 8] = (uint8_t) (t[4] >> 12); | |||
r[13 * i + 8] |= (uint8_t) (t[5] << 1); | |||
r[13 * i + 9] = (uint8_t) (t[5] >> 7); | |||
r[13 * i + 9] |= (uint8_t) (t[6] << 6); | |||
r[13 * i + 10] = (uint8_t) (t[6] >> 2); | |||
r[13 * i + 11] = (uint8_t) (t[6] >> 10); | |||
r[13 * i + 11] |= (uint8_t) (t[7] << 3); | |||
r[13 * i + 12] = (uint8_t) (t[7] >> 5); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack | |||
* | |||
* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
r->coeffs[8 * i + 0] = a[13 * i + 0]; | |||
r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; | |||
r->coeffs[8 * i + 0] &= 0x1FFF; | |||
r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; | |||
r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; | |||
r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; | |||
r->coeffs[8 * i + 1] &= 0x1FFF; | |||
r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; | |||
r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; | |||
r->coeffs[8 * i + 2] &= 0x1FFF; | |||
r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; | |||
r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; | |||
r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; | |||
r->coeffs[8 * i + 3] &= 0x1FFF; | |||
r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; | |||
r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; | |||
r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; | |||
r->coeffs[8 * i + 4] &= 0x1FFF; | |||
r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; | |||
r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; | |||
r->coeffs[8 * i + 5] &= 0x1FFF; | |||
r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; | |||
r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; | |||
r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; | |||
r->coeffs[8 * i + 6] &= 0x1FFF; | |||
r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; | |||
r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; | |||
r->coeffs[8 * i + 7] &= 0x1FFF; | |||
r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; | |||
r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; | |||
r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; | |||
r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; | |||
r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; | |||
r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; | |||
r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; | |||
r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyz_pack | |||
* | |||
* Description: Bit-pack polynomial with coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYZ_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
uint32_t t[4]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 2; ++i) { | |||
t[0] = GAMMA1 - a->coeffs[2 * i + 0]; | |||
t[1] = GAMMA1 - a->coeffs[2 * i + 1]; | |||
r[5 * i + 0] = (uint8_t) t[0]; | |||
r[5 * i + 1] = (uint8_t) (t[0] >> 8); | |||
r[5 * i + 2] = (uint8_t) (t[0] >> 16); | |||
r[5 * i + 2] |= (uint8_t) (t[1] << 4); | |||
r[5 * i + 3] = (uint8_t) (t[1] >> 4); | |||
r[5 * i + 4] = (uint8_t) (t[1] >> 12); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack | |||
* | |||
* Description: Unpack polynomial z with coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 2; ++i) { | |||
r->coeffs[2 * i + 0] = a[5 * i + 0]; | |||
r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; | |||
r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16; | |||
r->coeffs[2 * i + 0] &= 0xFFFFF; | |||
r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; | |||
r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; | |||
r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; | |||
r->coeffs[2 * i + 0] &= 0xFFFFF; | |||
r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0]; | |||
r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack | |||
* | |||
* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. | |||
* Input coefficients are assumed to be standard representatives. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYW1_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 2; ++i) { | |||
r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} |
@@ -0,0 +1,53 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLY_H
#define PQCLEAN_DILITHIUM3_CLEAN_POLY_H
#include "params.h"
#include <stdint.h>

/* A polynomial, stored as N signed 32-bit coefficients. */
typedef struct {
    int32_t coeffs[N];
} poly;

/* Operations on single polynomials. */
void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a);

void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a);

void PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h);

int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, int32_t B);

/* Sampling from a seed (and nonce). */
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Bit-packing to and from byte arrays. */
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a);

#endif
@@ -0,0 +1,448 @@ | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: expand_mat | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|j|i) | |||
* or AES256CTR(rho,j|i). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { | |||
unsigned int i, j; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < L; ++j) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); | |||
} | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery
*
* Description: Matrix-vector product built from row products: for each of
*              the K rows, t->vec[row] is set by
*              polyvecl_pointwise_acc_montgomery(mat[row], v).
*
* Arguments:   - polyveck *t: pointer to output vector
*              - const polyvecl mat[K]: input matrix
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    for (unsigned int row = 0; row < K; ++row) {
        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[row], &mat[row], v);
    }
}
/**************************************************************/ | |||
/************ Vectors of polynomials of length L **************/ | |||
/**************************************************************/ | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta
*
* Description: Sample all L polynomials of v with poly_uniform_eta.
*              Polynomial idx uses nonce + idx (as a 16-bit value).
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: seed of length SEEDBYTES
*              - uint16_t nonce: nonce for the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(&v->vec[idx], seed, (uint16_t) (nonce + idx));
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1
*
* Description: Sample all L polynomials of v with poly_uniform_gamma1.
*              Polynomial idx uses the 16-bit nonce L * nonce + idx.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: seed of length CRHBYTES
*              - uint16_t nonce: base nonce for the whole vector
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        uint16_t poly_nonce = (uint16_t) (L * nonce + idx);
        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(&v->vec[idx], seed, poly_nonce);
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce
*
* Description: Apply PQCLEAN_DILITHIUM3_CLEAN_poly_reduce to each of the
*              L polynomials of v, in place.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(&v->vec[idx]);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length L | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add | |||
* | |||
* Description: Add vectors of polynomials of length L. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyvecl *w: pointer to output vector | |||
* - const polyvecl *u: pointer to first summand | |||
* - const polyvecl *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length L. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont
*
* Description: Apply PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont to each
*              of the L polynomials of v, in place.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(&v->vec[idx]);
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-L vector by the same
*              polynomial a:
*              r->vec[idx] = poly_pointwise_montgomery(a, v->vec[idx]).
*
* Arguments:   - polyvecl *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(&r->vec[idx], a, &v->vec[idx]);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply vectors of polynomials of length L, multiply | |||
* resulting vector by 2^{-32} and add (accumulate) polynomials | |||
* in it. Input/output vectors are in NTT domain representation. | |||
* | |||
* Arguments: - poly *w: output polynomial | |||
* - const polyvecl *u: pointer to first input vector | |||
* - const polyvecl *v: pointer to second input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v) { | |||
unsigned int i; | |||
poly t; | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); | |||
for (i = 1; i < L; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_add(w, w, &t); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length L. | |||
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(). | |||
* | |||
* Arguments: - const polyvecl *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/**************************************************************/ | |||
/************ Vectors of polynomials of length K **************/ | |||
/**************************************************************/ | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta
*
* Description: Sample all K polynomials of v with poly_uniform_eta.
*              Polynomial idx uses nonce + idx (as a 16-bit value).
*
* Arguments:   - polyveck *v: pointer to output vector
*              - const uint8_t seed[]: seed of length SEEDBYTES
*              - uint16_t nonce: nonce for the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(&v->vec[idx], seed, (uint16_t) (nonce + idx));
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to representatives in [-6283009,6283007]. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq | |||
* | |||
* Description: For all coefficients of polynomials in vector of length K | |||
* add Q if coefficient is negative. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_add | |||
* | |||
* Description: Add vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first summand | |||
* - const polyveck *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub | |||
* | |||
* Description: Subtract vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first input vector | |||
* - const polyveck *v: pointer to second input vector to be | |||
* subtracted from first input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl | |||
* | |||
* Description: Multiply vector of polynomials of Length K by 2^D without modular | |||
* reduction. Assumes input coefficients to be less than 2^{31-D}. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length K. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by 2^{32} of polynomials | |||
* in vector of length K. Input coefficients need to be less | |||
* than 2*Q. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-K vector by the same
*              polynomial a:
*              r->vec[idx] = poly_pointwise_montgomery(a, v->vec[idx]).
*
* Arguments:   - polyveck *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(&r->vec[idx], a, &v->vec[idx]);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length K. | |||
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(). | |||
* | |||
* Arguments: - const polyveck *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 | |||
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 | |||
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we | |||
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint | |||
* | |||
* Description: Compute hint vector. | |||
* | |||
* Arguments: - polyveck *h: pointer to output vector | |||
* - const polyveck *v0: pointer to low part of input vector | |||
* - const polyveck *v1: pointer to high part of input vector | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1) { | |||
unsigned int i, s = 0; | |||
for (i = 0; i < K; ++i) { | |||
s += PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); | |||
} | |||
return s; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint | |||
* | |||
* Description: Use hint vector to correct the high bits of input vector. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector of polynomials with | |||
* corrected high bits | |||
* - const polyveck *u: pointer to input vector | |||
* - const polyveck *h: pointer to input hint vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); | |||
} | |||
} |
@@ -0,0 +1,68 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H | |||
#define PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
/* Vectors of polynomials of length L */ | |||
typedef struct { | |||
poly vec[L]; | |||
} polyvecl; | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v); | |||
int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B); | |||
/* Vectors of polynomials of length K */ | |||
typedef struct { | |||
poly vec[K]; | |||
} polyveck; | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); | |||
int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); | |||
unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); | |||
#endif |
@@ -0,0 +1,69 @@ | |||
#include "params.h" | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce | |||
* | |||
* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, | |||
* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. | |||
* | |||
* Arguments: - int64_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(int64_t a) { | |||
int32_t t; | |||
t = (int32_t)((uint64_t)a * (uint64_t)QINV); | |||
t = (a - (int64_t)t * Q) >> 32; | |||
return t; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_reduce32 | |||
* | |||
* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, | |||
* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. | |||
* | |||
* Arguments: - int32_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(int32_t a) { | |||
int32_t t; | |||
t = (a + (1 << 22)) >> 23; | |||
t = a - t * Q; | |||
return t; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_caddq | |||
* | |||
* Description: Add Q if input coefficient is negative. | |||
* | |||
* Arguments: - int32_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM3_CLEAN_caddq(int32_t a) { | |||
a += (a >> 31) & Q; | |||
return a; | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_freeze
*
* Description: For finite field element a, compute standard
*              representative r = a mod^+ Q.
*
* Arguments:   - int32_t: finite field element a
*
* Returns r.
**************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(int32_t a) {
    /* Centered reduction first, then lift a negative result by Q */
    return PQCLEAN_DILITHIUM3_CLEAN_caddq(PQCLEAN_DILITHIUM3_CLEAN_reduce32(a));
}
@@ -0,0 +1,17 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H | |||
#define PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
/* 2^32 % Q */
#define MONT (-4186625)
/* q^(-1) mod 2^32 */
#define QINV 58728449

/* Returns a * 2^(-32) mod Q as a value strictly between -Q and Q */
int32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(int64_t a);

/* Returns a value congruent to a mod Q, centered around zero */
int32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(int32_t a);

/* Returns a + Q for negative a, and a unchanged otherwise */
int32_t PQCLEAN_DILITHIUM3_CLEAN_caddq(int32_t a);

/* Returns caddq(reduce32(a)), the standard representative of a */
int32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(int32_t a);
#endif |
@@ -0,0 +1,92 @@ | |||
#include "params.h" | |||
#include "rounding.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_power2round | |||
* | |||
* Description: For finite field element a, compute a0, a1 such that | |||
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be standard representative. | |||
* | |||
* Arguments: - int32_t a: input element | |||
* - int32_t *a0: pointer to output element a0 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(int32_t *a0, int32_t a) { | |||
int32_t a1; | |||
a1 = (a + (1 << (D - 1)) - 1) >> D; | |||
*a0 = a - (a1 << D); | |||
return a1; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_decompose | |||
* | |||
* Description: For finite field element a, compute high and low bits a0, a1 such | |||
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and | |||
* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard | |||
* representative. | |||
* | |||
* Arguments: - int32_t a: input element | |||
* - int32_t *a0: pointer to output element a0 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
int32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(int32_t *a0, int32_t a) { | |||
int32_t a1; | |||
a1 = (a + 127) >> 7; | |||
a1 = (a1 * 1025 + (1 << 21)) >> 22; | |||
a1 &= 15; | |||
*a0 = a - a1 * 2 * GAMMA2; | |||
*a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; | |||
return a1; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_make_hint | |||
* | |||
* Description: Compute hint bit indicating whether the low bits of the | |||
* input element overflow into the high bits. | |||
* | |||
* Arguments: - int32_t a0: low bits of input element | |||
* - int32_t a1: high bits of input element | |||
* | |||
* Returns 1 if overflow. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(int32_t a0, int32_t a1) { | |||
if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) { | |||
return 1; | |||
} | |||
return 0; | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_use_hint
*
* Description: Correct high bits according to hint.
*
* Arguments:   - int32_t a: input element
*              - unsigned int hint: hint bit
*
* Returns corrected high bits.
**************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(int32_t a, unsigned int hint) {
    int32_t low;
    const int32_t high = PQCLEAN_DILITHIUM3_CLEAN_decompose(&low, a);

    if (hint == 0) {
        return high;
    }

    /* Step the high part up for a positive low part, down otherwise,
     * wrapping within 0..15 */
    const int32_t step = (low > 0) ? 1 : -1;
    return (high + step) & 15;
}
@@ -0,0 +1,14 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H | |||
#define PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
/* Splits a into a1 (returned) and a0 (written through the pointer)
 * with a = a1*2^D + a0 */
int32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(int32_t *a0,
        int32_t a);

/* Splits a into high bits (returned) and low bits (written through a0) */
int32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(int32_t *a0,
        int32_t a);

/* Returns 1 if the low bits a0 overflow into the high bits a1, else 0 */
unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(int32_t a0,
        int32_t a1);

/* Returns the high bits of a, corrected according to the hint bit */
int32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(int32_t a,
        unsigned int hint);
#endif |
@@ -0,0 +1,343 @@ | |||
#include "fips202.h" | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "sign.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair | |||
* | |||
* Description: Generates public and private key. | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (allocated | |||
* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (allocated | |||
* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
uint8_t seedbuf[3 * SEEDBYTES]; | |||
uint8_t tr[CRHBYTES]; | |||
const uint8_t *rho, *rhoprime, *key; | |||
polyvecl mat[K]; | |||
polyvecl s1, s1hat; | |||
polyveck s2, t1, t0; | |||
/* Get randomness for rho, rhoprime and key */ | |||
randombytes(seedbuf, SEEDBYTES); | |||
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); | |||
rho = seedbuf; | |||
rhoprime = seedbuf + SEEDBYTES; | |||
key = seedbuf + 2 * SEEDBYTES; | |||
/* Expand matrix */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho); | |||
/* Sample short vectors s1 and s2 */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L); | |||
/* Matrix-vector multiplication */ | |||
s1hat = s1; | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&s1hat); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&t1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&t1); | |||
/* Add error vector s2 */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&t1, &t1, &s2); | |||
/* Extract t1 and write public key */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&t1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(&t1, &t0, &t1); | |||
PQCLEAN_DILITHIUM3_CLEAN_pack_pk(pk, rho, &t1); | |||
/* Compute CRH(rho, t1) and write secret key */ | |||
crh(tr, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES); | |||
PQCLEAN_DILITHIUM3_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature | |||
* | |||
* Description: Computes signature. | |||
* | |||
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) | |||
* - size_t *siglen: pointer to output length of signature | |||
* - uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(uint8_t *sig, | |||
size_t *siglen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *sk) { | |||
unsigned int n; | |||
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; | |||
uint8_t *rho, *tr, *key, *mu, *rhoprime; | |||
uint16_t nonce = 0; | |||
polyvecl mat[K], s1, y, z; | |||
polyveck t0, s2, w1, w0, h; | |||
poly cp; | |||
shake256incctx state; | |||
rho = seedbuf; | |||
tr = rho + SEEDBYTES; | |||
key = tr + CRHBYTES; | |||
mu = key + SEEDBYTES; | |||
rhoprime = mu + CRHBYTES; | |||
PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); | |||
/* Compute CRH(tr, msg) */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, tr, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
crh(rhoprime, key, SEEDBYTES + CRHBYTES); | |||
/* Expand matrix and transform vectors */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&s2); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&t0); | |||
rej: | |||
/* Sample intermediate vector y */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); | |||
/* Matrix-vector multiplication */ | |||
z = y; | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&z); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&w1); | |||
/* Decompose w and call the random oracle */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&w1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(&w1, &w0, &w1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(sig, &w1); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(sig, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(&cp, sig); | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&cp); | |||
/* Compute z, reject if it reveals secret */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(&z); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(&z, &z, &y); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(&z); | |||
if (PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
goto rej; | |||
} | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&h); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&w0, &w0, &h); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w0); | |||
if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
/* Compute hints for w1 */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&h); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&h); | |||
if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&h, GAMMA2)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&w0, &w0, &h); | |||
n = PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(&h, &w0, &w1); | |||
if (n > OMEGA) { | |||
goto rej; | |||
} | |||
/* Write signature */ | |||
PQCLEAN_DILITHIUM3_CLEAN_pack_sig(sig, sig, &z, &h); | |||
*siglen = PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign | |||
* | |||
* Description: Compute signed message. | |||
* | |||
* Arguments: - uint8_t *sm: pointer to output signed message (allocated | |||
* array with PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + mlen bytes), | |||
* can be equal to m | |||
* - size_t *smlen: pointer to output length of signed | |||
* message | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, | |||
size_t *smlen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *sk) { | |||
size_t i; | |||
for (i = 0; i < mlen; ++i) { | |||
sm[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; | |||
} | |||
PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, mlen, sk); | |||
*smlen += mlen; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify | |||
* | |||
* Description: Verifies signature. | |||
* | |||
* Arguments: - uint8_t *m: pointer to input signature | |||
* - size_t siglen: length of signature | |||
* - const uint8_t *m: pointer to message | |||
* - size_t mlen: length of message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signature could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(const uint8_t *sig, | |||
size_t siglen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *pk) { | |||
unsigned int i; | |||
uint8_t buf[K * POLYW1_PACKEDBYTES]; | |||
uint8_t rho[SEEDBYTES]; | |||
uint8_t mu[CRHBYTES]; | |||
uint8_t c[SEEDBYTES]; | |||
uint8_t c2[SEEDBYTES]; | |||
poly cp; | |||
polyvecl mat[K], z; | |||
polyveck t1, w1, h; | |||
shake256incctx state; | |||
if (siglen != PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(rho, &t1, pk); | |||
if (PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(c, &z, &h, sig)) { | |||
return -1; | |||
} | |||
if (PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
return -1; | |||
} | |||
/* Compute CRH(CRH(rho, t1), msg) */ | |||
crh(mu, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* Matrix-vector multiplication; compute Az - c2^dt1 */ | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(&cp, c); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&z); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); | |||
PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&cp); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(&t1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&t1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&w1, &w1, &t1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&w1); | |||
/* Reconstruct w1 */ | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&w1); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(buf, &w1); | |||
/* Call random oracle and verify PQCLEAN_DILITHIUM3_CLEAN_challenge */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(c2, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
if (c[i] != c2[i]) { | |||
return -1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open | |||
* | |||
* Description: Verify signed message. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output message (allocated | |||
* array with smlen bytes), can be equal to sm | |||
* - size_t *mlen: pointer to output length of message | |||
* - const uint8_t *sm: pointer to signed message | |||
* - size_t smlen: length of signed message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, | |||
size_t *mlen, | |||
const uint8_t *sm, | |||
size_t smlen, | |||
const uint8_t *pk) { | |||
size_t i; | |||
if (smlen < PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) { | |||
goto badsig; | |||
} | |||
*mlen = smlen - PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, *mlen, pk)) { | |||
goto badsig; | |||
} else { | |||
/* All good, copy msg, return 0 */ | |||
for (i = 0; i < *mlen; ++i) { | |||
m[i] = sm[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + i]; | |||
} | |||
return 0; | |||
} | |||
badsig: | |||
/* Signature verification failed */ | |||
*mlen = (size_t) -1; | |||
for (i = 0; i < smlen; ++i) { | |||
m[i] = 0; | |||
} | |||
return -1; | |||
} |
@@ -0,0 +1,29 @@ | |||
#ifndef PQCLEAN_DILITHIUM3_CLEAN_SIGN_H | |||
#define PQCLEAN_DILITHIUM3_CLEAN_SIGN_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* NOTE(review): the signing and verification code calls
 * PQCLEAN_DILITHIUM3_CLEAN_poly_challenge rather than this function, and no
 * definition of it is visible next to them -- confirm whether this
 * declaration is still needed. */
void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Writes a fresh key pair to pk and sk; returns 0 */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Writes the detached signature of m to sig and its length to *siglen;
 * returns 0 */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(uint8_t *sig,
        size_t *siglen,
        const uint8_t *m,
        size_t mlen,
        const uint8_t *sk);

/* Writes signature followed by message to sm and the total length to
 * *smlen; returns 0 */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm,
        size_t *smlen,
        const uint8_t *m,
        size_t mlen,
        const uint8_t *sk);

/* Returns 0 if sig is a valid signature of m under pk, -1 otherwise */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(const uint8_t *sig,
        size_t siglen,
        const uint8_t *m,
        size_t mlen,
        const uint8_t *pk);

/* Verifies the signed message sm; on success copies the message to m,
 * sets *mlen and returns 0, otherwise returns -1 */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m,
        size_t *mlen,
        const uint8_t *sm,
        size_t smlen,
        const uint8_t *pk);
#endif |