浏览代码

add sign back

kyber
Henry Case 3 年前
父节点
当前提交
d7ee70681a
共有 100 个文件被更改,包括 15471 次插入和 0 次删除
  1. +31
    -0
      crypto_sign/dilithium/dilithium2/META.yml
  2. +5
    -0
      crypto_sign/dilithium/dilithium2/avx2/LICENSE
  3. +19
    -0
      crypto_sign/dilithium/dilithium2/avx2/align.h
  4. +31
    -0
      crypto_sign/dilithium/dilithium2/avx2/api.h
  5. +24
    -0
      crypto_sign/dilithium/dilithium2/avx2/cdecl.h
  6. +101
    -0
      crypto_sign/dilithium/dilithium2/avx2/consts.c
  7. +10
    -0
      crypto_sign/dilithium/dilithium2/avx2/consts.h
  8. +909
    -0
      crypto_sign/dilithium/dilithium2/avx2/f1600x4.S
  9. +219
    -0
      crypto_sign/dilithium/dilithium2/avx2/fips202x4.c
  10. +64
    -0
      crypto_sign/dilithium/dilithium2/avx2/fips202x4.h
  11. +240
    -0
      crypto_sign/dilithium/dilithium2/avx2/invntt.S
  12. +199
    -0
      crypto_sign/dilithium/dilithium2/avx2/ntt.S
  13. +14
    -0
      crypto_sign/dilithium/dilithium2/avx2/ntt.h
  14. +261
    -0
      crypto_sign/dilithium/dilithium2/avx2/packing.c
  15. +31
    -0
      crypto_sign/dilithium/dilithium2/avx2/packing.h
  16. +41
    -0
      crypto_sign/dilithium/dilithium2/avx2/params.h
  17. +199
    -0
      crypto_sign/dilithium/dilithium2/avx2/pointwise.S
  18. +1027
    -0
      crypto_sign/dilithium/dilithium2/avx2/poly.c
  19. +79
    -0
      crypto_sign/dilithium/dilithium2/avx2/poly.h
  20. +474
    -0
      crypto_sign/dilithium/dilithium2/avx2/polyvec.c
  21. +72
    -0
      crypto_sign/dilithium/dilithium2/avx2/polyvec.h
  22. +408
    -0
      crypto_sign/dilithium/dilithium2/avx2/rejsample.c
  23. +19
    -0
      crypto_sign/dilithium/dilithium2/avx2/rejsample.h
  24. +157
    -0
      crypto_sign/dilithium/dilithium2/avx2/rounding.c
  25. +12
    -0
      crypto_sign/dilithium/dilithium2/avx2/rounding.h
  26. +54
    -0
      crypto_sign/dilithium/dilithium2/avx2/shuffle.S
  27. +25
    -0
      crypto_sign/dilithium/dilithium2/avx2/shuffle.inc
  28. +415
    -0
      crypto_sign/dilithium/dilithium2/avx2/sign.c
  29. +29
    -0
      crypto_sign/dilithium/dilithium2/avx2/sign.h
  30. +26
    -0
      crypto_sign/dilithium/dilithium2/avx2/symmetric-shake.c
  31. +36
    -0
      crypto_sign/dilithium/dilithium2/avx2/symmetric.h
  32. +5
    -0
      crypto_sign/dilithium/dilithium2/clean/LICENSE
  33. +23
    -0
      crypto_sign/dilithium/dilithium2/clean/Makefile.Microsoft_nmake
  34. +31
    -0
      crypto_sign/dilithium/dilithium2/clean/api.h
  35. +98
    -0
      crypto_sign/dilithium/dilithium2/clean/ntt.c
  36. +10
    -0
      crypto_sign/dilithium/dilithium2/clean/ntt.h
  37. +261
    -0
      crypto_sign/dilithium/dilithium2/clean/packing.c
  38. +31
    -0
      crypto_sign/dilithium/dilithium2/clean/packing.h
  39. +41
    -0
      crypto_sign/dilithium/dilithium2/clean/params.h
  40. +867
    -0
      crypto_sign/dilithium/dilithium2/clean/poly.c
  41. +53
    -0
      crypto_sign/dilithium/dilithium2/clean/poly.h
  42. +448
    -0
      crypto_sign/dilithium/dilithium2/clean/polyvec.c
  43. +68
    -0
      crypto_sign/dilithium/dilithium2/clean/polyvec.h
  44. +69
    -0
      crypto_sign/dilithium/dilithium2/clean/reduce.c
  45. +17
    -0
      crypto_sign/dilithium/dilithium2/clean/reduce.h
  46. +98
    -0
      crypto_sign/dilithium/dilithium2/clean/rounding.c
  47. +14
    -0
      crypto_sign/dilithium/dilithium2/clean/rounding.h
  48. +343
    -0
      crypto_sign/dilithium/dilithium2/clean/sign.c
  49. +29
    -0
      crypto_sign/dilithium/dilithium2/clean/sign.h
  50. +26
    -0
      crypto_sign/dilithium/dilithium2/clean/symmetric-shake.c
  51. +36
    -0
      crypto_sign/dilithium/dilithium2/clean/symmetric.h
  52. +31
    -0
      crypto_sign/dilithium/dilithium3/META.yml
  53. +5
    -0
      crypto_sign/dilithium/dilithium3/avx2/LICENSE
  54. +19
    -0
      crypto_sign/dilithium/dilithium3/avx2/align.h
  55. +32
    -0
      crypto_sign/dilithium/dilithium3/avx2/api.h
  56. +24
    -0
      crypto_sign/dilithium/dilithium3/avx2/cdecl.h
  57. +101
    -0
      crypto_sign/dilithium/dilithium3/avx2/consts.c
  58. +10
    -0
      crypto_sign/dilithium/dilithium3/avx2/consts.h
  59. +909
    -0
      crypto_sign/dilithium/dilithium3/avx2/f1600x4.S
  60. +219
    -0
      crypto_sign/dilithium/dilithium3/avx2/fips202x4.c
  61. +64
    -0
      crypto_sign/dilithium/dilithium3/avx2/fips202x4.h
  62. +240
    -0
      crypto_sign/dilithium/dilithium3/avx2/invntt.S
  63. +199
    -0
      crypto_sign/dilithium/dilithium3/avx2/ntt.S
  64. +14
    -0
      crypto_sign/dilithium/dilithium3/avx2/ntt.h
  65. +261
    -0
      crypto_sign/dilithium/dilithium3/avx2/packing.c
  66. +31
    -0
      crypto_sign/dilithium/dilithium3/avx2/packing.h
  67. +41
    -0
      crypto_sign/dilithium/dilithium3/avx2/params.h
  68. +201
    -0
      crypto_sign/dilithium/dilithium3/avx2/pointwise.S
  69. +998
    -0
      crypto_sign/dilithium/dilithium3/avx2/poly.c
  70. +79
    -0
      crypto_sign/dilithium/dilithium3/avx2/poly.h
  71. +498
    -0
      crypto_sign/dilithium/dilithium3/avx2/polyvec.c
  72. +72
    -0
      crypto_sign/dilithium/dilithium3/avx2/polyvec.h
  73. +392
    -0
      crypto_sign/dilithium/dilithium3/avx2/rejsample.c
  74. +19
    -0
      crypto_sign/dilithium/dilithium3/avx2/rejsample.h
  75. +154
    -0
      crypto_sign/dilithium/dilithium3/avx2/rounding.c
  76. +12
    -0
      crypto_sign/dilithium/dilithium3/avx2/rounding.h
  77. +54
    -0
      crypto_sign/dilithium/dilithium3/avx2/shuffle.S
  78. +25
    -0
      crypto_sign/dilithium/dilithium3/avx2/shuffle.inc
  79. +425
    -0
      crypto_sign/dilithium/dilithium3/avx2/sign.c
  80. +29
    -0
      crypto_sign/dilithium/dilithium3/avx2/sign.h
  81. +26
    -0
      crypto_sign/dilithium/dilithium3/avx2/symmetric-shake.c
  82. +36
    -0
      crypto_sign/dilithium/dilithium3/avx2/symmetric.h
  83. +5
    -0
      crypto_sign/dilithium/dilithium3/clean/LICENSE
  84. +23
    -0
      crypto_sign/dilithium/dilithium3/clean/Makefile.Microsoft_nmake
  85. +32
    -0
      crypto_sign/dilithium/dilithium3/clean/api.h
  86. +98
    -0
      crypto_sign/dilithium/dilithium3/clean/ntt.c
  87. +10
    -0
      crypto_sign/dilithium/dilithium3/clean/ntt.h
  88. +261
    -0
      crypto_sign/dilithium/dilithium3/clean/packing.c
  89. +31
    -0
      crypto_sign/dilithium/dilithium3/clean/packing.h
  90. +41
    -0
      crypto_sign/dilithium/dilithium3/clean/params.h
  91. +818
    -0
      crypto_sign/dilithium/dilithium3/clean/poly.c
  92. +53
    -0
      crypto_sign/dilithium/dilithium3/clean/poly.h
  93. +448
    -0
      crypto_sign/dilithium/dilithium3/clean/polyvec.c
  94. +68
    -0
      crypto_sign/dilithium/dilithium3/clean/polyvec.h
  95. +69
    -0
      crypto_sign/dilithium/dilithium3/clean/reduce.c
  96. +17
    -0
      crypto_sign/dilithium/dilithium3/clean/reduce.h
  97. +92
    -0
      crypto_sign/dilithium/dilithium3/clean/rounding.c
  98. +14
    -0
      crypto_sign/dilithium/dilithium3/clean/rounding.h
  99. +343
    -0
      crypto_sign/dilithium/dilithium3/clean/sign.c
  100. +29
    -0
      crypto_sign/dilithium/dilithium3/clean/sign.h

+ 31
- 0
crypto_sign/dilithium/dilithium2/META.yml 查看文件

@@ -0,0 +1,31 @@
name: Dilithium2
type: signature
claimed-nist-level: 2
length-public-key: 1312
length-secret-key: 2544
length-signature: 2420
nistkat-sha256: 9c636528bf81c03df6ad8f9471cb1b4d9097d66af825d4f60b7ff0d941ca4d37
testvectors-sha256: 166fc2481358d5a1b7a528b30af36ad069b049b5755cf63b843ce0f25f35aeb6
principal-submitters:
- Vadim Lyubashevsky
auxiliary-submitters:
- Léo Ducas
- Eike Kiltz
- Tancrède Lepoint
- Peter Schwabe
- Gregor Seiler
- Damien Stehlé
implementations:
  - name: clean
    version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
  - name: avx2
    version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
    supported_platforms:
      - architecture: x86_64
        operating_systems:
          - Linux
          - Darwin
        required_flags:
          - aes
          - avx2
          - popcnt

+ 5
- 0
crypto_sign/dilithium/dilithium2/avx2/LICENSE 查看文件

@@ -0,0 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 19
- 0
crypto_sign/dilithium/dilithium2/avx2/align.h 查看文件

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGN_H
#define PQCLEAN_DILITHIUM2_AVX2_ALIGN_H

/*
 * Helper macros that expand to an untagged union type overlaying a scalar
 * array (`coeffs`) with an array of 256-bit vectors (`vec`). Because the
 * union contains __m256i members it takes on the alignment of __m256i, so
 * the same storage can be accessed element-wise or vector-wise.
 */

#include <immintrin.h>
#include <stdint.h>

/*
 * ALIGNED_UINT8(N): storage for N bytes.
 * ((N)+31)/32 rounds N up to a whole number of 32-byte vectors, so `vec`
 * always covers every byte of `coeffs`.
 */
#define ALIGNED_UINT8(N) \
union { \
uint8_t coeffs[N]; \
__m256i vec[((N)+31)/32]; \
}

/*
 * ALIGNED_INT32(N): storage for N 32-bit signed integers.
 * ((N)+7)/8 rounds N up to a whole number of vectors of eight int32_t, so
 * `vec` always covers every element of `coeffs`.
 */
#define ALIGNED_INT32(N) \
union { \
int32_t coeffs[N]; \
__m256i vec[((N)+7)/8]; \
}

#endif

+ 31
- 0
crypto_sign/dilithium/dilithium2/avx2/api.h 查看文件

@@ -0,0 +1,31 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_API_H
#define PQCLEAN_DILITHIUM2_AVX2_API_H

/*
 * Public interface of the Dilithium2 AVX2 implementation.
 * This header only declares the entry points; their definitions are not
 * part of this file.
 */

#include <stddef.h>
#include <stdint.h>

/*
 * Sizes in bytes. The three numeric values equal length-public-key,
 * length-secret-key and length-signature in this scheme's META.yml.
 */
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1312
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2544
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2420
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2"


/*
 * Key generation: outputs go to pk and sk.
 * NOTE(review): pk/sk are presumably CRYPTO_PUBLICKEYBYTES and
 * CRYPTO_SECRETKEYBYTES long, and 0 presumably means success -- confirm
 * against the definition.
 */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/*
 * Signs the mlen-byte message m with secret key sk; the signature is
 * written to sig and its length to *siglen.
 * NOTE(review): presumably a detached signature of at most CRYPTO_BYTES
 * bytes -- confirm against the definition.
 */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/*
 * Checks signature sig (siglen bytes) on message m (mlen bytes) under
 * public key pk. All inputs are read-only.
 * NOTE(review): return convention (presumably 0 = valid, non-zero =
 * invalid) is not visible here -- confirm against the definition.
 */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

/*
 * Produces a signed message in sm (length in *smlen) from message m and
 * secret key sk.
 * NOTE(review): presumably sm is signature followed by message, so sm
 * needs room for mlen + CRYPTO_BYTES bytes -- confirm against the
 * definition.
 */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/*
 * Counterpart of crypto_sign: takes a signed message sm (smlen bytes) and
 * public key pk, and writes the recovered message to m and its length to
 * *mlen.
 * NOTE(review): behaviour on verification failure (return value, contents
 * of m/*mlen) is not visible here -- confirm against the definition.
 */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif

+ 24
- 0
crypto_sign/dilithium/dilithium2/avx2/cdecl.h 查看文件

@@ -0,0 +1,24 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL_H
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_H

/*
 * Definitions shared between the C sources and the assembly files
 * (this header is #included from .S files, so it must contain only
 * preprocessor directives and comments).
 */

/*
 * Offsets of the sections of PQCLEAN_DILITHIUM2_AVX2_qdata, counted in
 * int32_t elements. They must stay in sync with the layout of the
 * initializer in consts.c:
 *   _8XQ         8 copies of Q
 *   _8XQINV      8 copies of QINV
 *   _8XDIV_QINV  8 copies of DIV_QINV
 *   _8XDIV       8 copies of DIV
 *   _ZETAS_QINV  296 entries
 *   _ZETAS       296 entries
 */
#define _8XQ 0
#define _8XQINV 8
#define _8XDIV_QINV 16
#define _8XDIV 24
#define _ZETAS_QINV 32
#define _ZETAS 328

/* The C ABI on MacOS exports all symbols with a leading
* underscore. This means that any symbols we refer to from
* C files (functions) can't be found, and all symbols we
* refer to from ASM also can't be found (consts.c).
*
* This define helps us get around this: assembly files emit every
* global label twice, once as cdecl(name) (plain) and once as
* _cdecl(name) (with the leading underscore).
*/

#define _cdecl(s) _##s
#define cdecl(s) s

#endif

+ 101
- 0
crypto_sign/dilithium/dilithium2/avx2/consts.c 查看文件

@@ -0,0 +1,101 @@
#include "consts.h"
#include "params.h"
#include <stdint.h>

/*
 * Scalar constants broadcast into the first rows of the qdata table below.
 * The names and values fit Montgomery reduction with R = 2^32. Q itself is
 * not defined in this file (presumably it comes from params.h).
 */
#define QINV 58728449 // q^(-1) mod 2^32
#define MONT (-4186625) // 2^32 mod q
#define DIV 41978 // mont^2/256
#define DIV_QINV (-8395782) // DIV * QINV mod 2^32, as a signed 32-bit value

/*
 * Constant table used by the vectorized arithmetic.
 *
 * Layout, in int32_t elements (the section offsets are the _8X... and
 * _ZETAS... macros from cdecl.h and must match them exactly):
 *   [  0,   8)  Q broadcast to 8 lanes
 *   [  8,  16)  QINV broadcast to 8 lanes
 *   [ 16,  24)  DIV_QINV broadcast to 8 lanes
 *   [ 24,  32)  DIV broadcast to 8 lanes
 *   [ 32, 328)  296 entries: the _ZETAS values multiplied by QINV mod 2^32
 *   [328, 624)  296 entries: the zeta constants themselves
 *
 * Do not reorder or reformat the numbers: consumers index this table by
 * fixed offsets.
 */
const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata = {{
// offset _8XQ = 0: modulus Q in every lane
Q, Q, Q, Q, Q, Q, Q, Q,

// offset _8XQINV = 8: QINV in every lane
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

// offset _8XDIV_QINV = 16: DIV_QINV in every lane
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV,

// offset _8XDIV = 24: DIV in every lane
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV,

// offset _ZETAS_QINV = 32: entry i is (_ZETAS entry i) * QINV mod 2^32,
// e.g. 25847 * QINV == 1830765815 (mod 2^32). Some values are repeated
// 4x or 2x, mirroring the _ZETAS section below.
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244,
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077,
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561,
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417,
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735,
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904,
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771,
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600,
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139,
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433,
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547,
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852,
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995,
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424,
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315,
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951,
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031,
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878,
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606,
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568,
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583,
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093,
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172,
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187,
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462,
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722,
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279,
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342,
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272,
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682,
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363,
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473,
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426,
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762,
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494,
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853,
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238,

// offset _ZETAS = 328: the zeta constants, in the same order as the
// _ZETAS_QINV section above.
// NOTE(review): presumably NTT twiddle factors pre-arranged in the order
// the assembly consumes them -- confirm against ntt.S / invntt.S.
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451,
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905,
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855,
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103,
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928,
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549,
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672,
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005,
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928,
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771,
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969,
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969,
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922,
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430,
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856,
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961,
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995,
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100,
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149,
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738,
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098,
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455,
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634,
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424,
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622,
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115,
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233,
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154,
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838,
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642,
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107,
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782,
}
};

+ 10
- 0
crypto_sign/dilithium/dilithium2/avx2/consts.h 查看文件

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_CONSTS_H
#define PQCLEAN_DILITHIUM2_AVX2_CONSTS_H
/* Declaration of the shared constant table defined in consts.c. */
#include "align.h"
#include "cdecl.h"


/*
 * Type of the constant table: 624 int32_t values in a vector-aligned union
 * (see ALIGNED_INT32 in align.h). 624 = 4 * 8 broadcast constants + 296 +
 * 296 table entries, matching the section offsets defined in cdecl.h
 * (_ZETAS_QINV = 32, _ZETAS = 328).
 */
typedef ALIGNED_INT32(624) qdata_t;
/* The table itself; defined and initialized in consts.c. */
extern const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata;

#endif

+ 909
- 0
crypto_sign/dilithium/dilithium2/avx2/f1600x4.S 查看文件

@@ -0,0 +1,909 @@
/* Taken from Bas Westerbaan's new 4-way SHAKE implementation
* for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/),
* but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */

#include "cdecl.h"

/*
 * Byte-shuffle masks used as vpshufb controls by the permutation below.
 * Rotations of a 64-bit lane by a multiple of 8 bits are done with a byte
 * shuffle instead of a shift/shift/or triple. Each mask repeats the same
 * 8-byte pattern four times, once per 64-bit lane of a ymm register.
 * The masks are aligned to 32 bytes (.p2align 5), as required by the
 * vmovdqa load of rho8.
 */
.data
.p2align 5
/* rho8: replaces the vpsllq $8 / vpsrlq $56 / vpor sequence, i.e. rotates
 * every 64-bit lane left by 8 bits. */
rho8:
.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14
/* rho56: replaces the vpsllq $56 / vpsrlq $8 / vpor sequence, i.e. rotates
 * every 64-bit lane left by 56 bits. */
rho56:
.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8

.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4)
cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4):
vmovdqa rho8(%rip), %ymm0
movq $6, %rax
looptop:
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 192(%rdi), %ymm4, %ymm9
vpxor 384(%rdi), %ymm3, %ymm10
vpxor 576(%rdi), %ymm2, %ymm11
vpxor 768(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 0(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 96(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 320(%rdi), %ymm5, %ymm10
vpxor 512(%rdi), %ymm4, %ymm11
vpxor 704(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 32(%rdi), %ymm4, %ymm8
vpxor 224(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 608(%rdi), %ymm1, %ymm11
vpxor 640(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 128(%rdi), %ymm1, %ymm8
vpxor 160(%rdi), %ymm5, %ymm9
vpxor 352(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 736(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 64(%rdi), %ymm3, %ymm8
vpxor 256(%rdi), %ymm2, %ymm9
vpxor 448(%rdi), %ymm1, %ymm10
vpxor 480(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 448(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 512(%rdi), %ymm4, %ymm9
vpxor 224(%rdi), %ymm3, %ymm10
vpxor 736(%rdi), %ymm2, %ymm11
vpxor 448(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 8(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 576(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 640(%rdi), %ymm5, %ymm10
vpxor 352(%rdi), %ymm4, %ymm11
vpxor 64(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 192(%rdi), %ymm4, %ymm8
vpxor 704(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 128(%rdi), %ymm1, %ymm11
vpxor 480(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 768(%rdi), %ymm1, %ymm8
vpxor 320(%rdi), %ymm5, %ymm9
vpxor 32(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 256(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 384(%rdi), %ymm3, %ymm8
vpxor 96(%rdi), %ymm2, %ymm9
vpxor 608(%rdi), %ymm1, %ymm10
vpxor 160(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 608(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 352(%rdi), %ymm4, %ymm9
vpxor 704(%rdi), %ymm3, %ymm10
vpxor 256(%rdi), %ymm2, %ymm11
vpxor 608(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 16(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 736(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 480(%rdi), %ymm5, %ymm10
vpxor 32(%rdi), %ymm4, %ymm11
vpxor 384(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 512(%rdi), %ymm4, %ymm8
vpxor 64(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 768(%rdi), %ymm1, %ymm11
vpxor 160(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 448(%rdi), %ymm1, %ymm8
vpxor 640(%rdi), %ymm5, %ymm9
vpxor 192(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 96(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 224(%rdi), %ymm3, %ymm8
vpxor 576(%rdi), %ymm2, %ymm9
vpxor 128(%rdi), %ymm1, %ymm10
vpxor 320(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 128(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 32(%rdi), %ymm4, %ymm9
vpxor 64(%rdi), %ymm3, %ymm10
vpxor 96(%rdi), %ymm2, %ymm11
vpxor 128(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 24(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 256(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 160(%rdi), %ymm5, %ymm10
vpxor 192(%rdi), %ymm4, %ymm11
vpxor 224(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 352(%rdi), %ymm4, %ymm8
vpxor 384(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 448(%rdi), %ymm1, %ymm11
vpxor 320(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 608(%rdi), %ymm1, %ymm8
vpxor 480(%rdi), %ymm5, %ymm9
vpxor 512(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 576(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 704(%rdi), %ymm3, %ymm8
vpxor 736(%rdi), %ymm2, %ymm9
vpxor 768(%rdi), %ymm1, %ymm10
vpxor 640(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 768(%rdi)
addq $32, %rsi
subq $1, %rax
jnz looptop
ret

+ 219
- 0
crypto_sign/dilithium/dilithium2/avx2/fips202x4.c 查看文件

@@ -0,0 +1,219 @@
#include "fips202.h"
#include "fips202x4.h"
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Number of rounds in one Keccak-f[1600] permutation. */
#define NROUNDS 24

/*
 * Per-round constants handed to the 4-way permutation routine, one 64-bit
 * word per round, in round order.
 */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
    0x0000000000000001ULL, 0x0000000000008082ULL,
    0x800000000000808aULL, 0x8000000080008000ULL,
    0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL,
    0x000000000000008aULL, 0x0000000000000088ULL,
    0x0000000080008009ULL, 0x000000008000000aULL,
    0x000000008000808bULL, 0x800000000000008bULL,
    0x8000000000008089ULL, 0x8000000000008003ULL,
    0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL,
    0x8000000080008081ULL, 0x8000000000008080ULL,
    0x0000000080000001ULL, 0x8000000080008008ULL
};

/*
 * Absorb four equal-length inputs into four interleaved Keccak states and
 * apply the final padding, so the states are ready for squeezing.
 *
 * s      - 25 vectors; 64-bit element k of every vector belongs to input k.
 *          Fully overwritten (zeroed first).
 * r      - rate in bytes; the code indexes s[r / 8 - 1], so r is expected to
 *          be a positive multiple of 8 no larger than 200.
 * in0..3 - the four inputs, each inlen bytes long.
 * inlen  - common input length in bytes.
 * p      - padding/domain-separation byte XORed in right after the input.
 *
 * Exactly inlen bytes are read from each input. The trailing partial lane is
 * assembled with bounded copies instead of an 8-byte gather, so no byte past
 * the end of an input is touched.
 */
static void keccakx4_absorb_once(__m256i s[25],
                                 unsigned int r,
                                 const uint8_t *in0,
                                 const uint8_t *in1,
                                 const uint8_t *in2,
                                 const uint8_t *in3,
                                 size_t inlen,
                                 uint8_t p) {
    size_t i;
    uint64_t pos = 0;
    __m256i t, idx;

    for (i = 0; i < 25; ++i) {
        s[i] = _mm256_setzero_si256();
    }

    /*
     * The gathers below use the running byte offset as the "base address" and
     * the four input addresses as 64-bit indices with scale 1, so element k
     * is loaded from in<k> + pos.
     */
    idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);

    /* Full blocks: XOR r bytes per input into the state, then permute. */
    while (inlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
            s[i] = _mm256_xor_si256(s[i], t);
            pos += 8;
        }
        inlen -= r;

        PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants);
    }

    /* Remaining complete 8-byte lanes of the last (partial) block. */
    for (i = 0; i < inlen / 8; ++i) {
        t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
        s[i] = _mm256_xor_si256(s[i], t);
        pos += 8;
    }
    inlen -= 8 * i;

    /*
     * Trailing 1..7 bytes: copy only the bytes that exist into zeroed 64-bit
     * words. On little-endian x86 this yields the same lane value as loading
     * 8 bytes and masking, without reading beyond the input buffers.
     */
    if (inlen) {
        uint64_t tail[4] = {0, 0, 0, 0};

        memcpy(&tail[0], in0 + pos, inlen);
        memcpy(&tail[1], in1 + pos, inlen);
        memcpy(&tail[2], in2 + pos, inlen);
        memcpy(&tail[3], in3 + pos, inlen);
        t = _mm256_set_epi64x((long long)tail[3], (long long)tail[2],
                              (long long)tail[1], (long long)tail[0]);
        s[i] = _mm256_xor_si256(s[i], t);
    }

    /* Padding: byte p directly after the input, top bit of the last rate lane. */
    t = _mm256_set1_epi64x((long long)((uint64_t)p << 8 * inlen));
    s[i] = _mm256_xor_si256(s[i], t);
    t = _mm256_set1_epi64x((long long)(1ULL << 63));
    s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t);
}

/*
 * Produce nblocks output blocks of r bytes for each of the four interleaved
 * Keccak states. Every block is preceded by one call to the 4-way
 * permutation; 64-bit element k of each state vector is written to out<k>.
 */
static void keccakx4_squeezeblocks(uint8_t *out0,
                                   uint8_t *out1,
                                   uint8_t *out2,
                                   uint8_t *out3,
                                   size_t nblocks,
                                   unsigned int r,
                                   __m256i s[25]) {
    size_t base = 0;   /* byte offset of the current block in every output */
    unsigned int lane;
    __m128d half;

    for (; nblocks != 0; --nblocks, base += r) {
        PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants);

        for (lane = 0; lane < r / 8; ++lane) {
            /* Low 128 bits carry the words of states 0 and 1. */
            half = _mm_castsi128_pd(_mm256_castsi256_si128(s[lane]));
            _mm_storel_pd((double *)&out0[base + 8 * lane], half);
            _mm_storeh_pd((double *)&out1[base + 8 * lane], half);

            /* High 128 bits carry the words of states 2 and 3. */
            half = _mm_castsi128_pd(_mm256_extracti128_si256(s[lane], 1));
            _mm_storel_pd((double *)&out2[base + 8 * lane], half);
            _mm_storeh_pd((double *)&out3[base + 8 * lane], half);
        }
    }
}

/*
 * 4-way SHAKE128 absorb: forwards to keccakx4_absorb_once with rate
 * SHAKE128_RATE and padding byte 0x1F. state->s is fully overwritten.
 *
 * in0..in3 are four inputs of inlen bytes each.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

/*
 * 4-way SHAKE128 squeeze: forwards to keccakx4_squeezeblocks with rate
 * SHAKE128_RATE, writing nblocks * SHAKE128_RATE bytes to each of out0..out3
 * and advancing state->s by one permutation per block.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s);
}

/*
 * 4-way SHAKE256 absorb: forwards to keccakx4_absorb_once with rate
 * SHAKE256_RATE and padding byte 0x1F. state->s is fully overwritten.
 *
 * in0..in3 are four inputs of inlen bytes each.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

/*
 * 4-way SHAKE256 squeeze: forwards to keccakx4_squeezeblocks with rate
 * SHAKE256_RATE, writing nblocks * SHAKE256_RATE bytes to each of out0..out3
 * and advancing state->s by one permutation per block.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s);
}

/*
 * One-shot 4-way SHAKE128: absorbs four inputs of inlen bytes each and writes
 * outlen bytes of output to each of out0..out3.
 *
 * Whole rate-sized blocks are squeezed straight into the outputs; a trailing
 * partial block is squeezed into a scratch buffer and only the needed prefix
 * is copied out.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0,
                                        uint8_t *out1,
                                        uint8_t *out2,
                                        uint8_t *out3,
                                        size_t outlen,
                                        const uint8_t *in0,
                                        const uint8_t *in1,
                                        const uint8_t *in2,
                                        const uint8_t *in3,
                                        size_t inlen) {
    size_t whole_blocks = outlen / SHAKE128_RATE;
    size_t remaining;
    unsigned int b;
    uint8_t scratch[4][SHAKE128_RATE];
    keccakx4_state ctx;

    PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(&ctx, in0, in1, in2, in3, inlen);
    PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, whole_blocks, &ctx);

    remaining = outlen - whole_blocks * SHAKE128_RATE;
    if (remaining == 0) {
        return;
    }

    /* Step past the bytes already produced. */
    out0 += whole_blocks * SHAKE128_RATE;
    out1 += whole_blocks * SHAKE128_RATE;
    out2 += whole_blocks * SHAKE128_RATE;
    out3 += whole_blocks * SHAKE128_RATE;

    PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(scratch[0], scratch[1], scratch[2], scratch[3], 1, &ctx);
    for (b = 0; b < remaining; ++b) {
        out0[b] = scratch[0][b];
        out1[b] = scratch[1][b];
        out2[b] = scratch[2][b];
        out3[b] = scratch[3][b];
    }
}

/*
 * One-shot 4-way SHAKE256: absorbs four inputs of inlen bytes each and writes
 * outlen bytes of output to each of out0..out3.
 *
 * Whole rate-sized blocks are squeezed straight into the outputs; a trailing
 * partial block is squeezed into a scratch buffer and only the needed prefix
 * is copied out.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0,
                                        uint8_t *out1,
                                        uint8_t *out2,
                                        uint8_t *out3,
                                        size_t outlen,
                                        const uint8_t *in0,
                                        const uint8_t *in1,
                                        const uint8_t *in2,
                                        const uint8_t *in3,
                                        size_t inlen) {
    size_t whole_blocks = outlen / SHAKE256_RATE;
    size_t remaining;
    unsigned int b;
    uint8_t scratch[4][SHAKE256_RATE];
    keccakx4_state ctx;

    PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(&ctx, in0, in1, in2, in3, inlen);
    PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, whole_blocks, &ctx);

    remaining = outlen - whole_blocks * SHAKE256_RATE;
    if (remaining == 0) {
        return;
    }

    /* Step past the bytes already produced. */
    out0 += whole_blocks * SHAKE256_RATE;
    out1 += whole_blocks * SHAKE256_RATE;
    out2 += whole_blocks * SHAKE256_RATE;
    out3 += whole_blocks * SHAKE256_RATE;

    PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(scratch[0], scratch[1], scratch[2], scratch[3], 1, &ctx);
    for (b = 0; b < remaining; ++b) {
        out0[b] = scratch[0][b];
        out1[b] = scratch[1][b];
        out2[b] = scratch[2][b];
        out3[b] = scratch[3][b];
    }
}

+ 64
- 0
crypto_sign/dilithium/dilithium2/avx2/fips202x4.h 查看文件

@@ -0,0 +1,64 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H
#define PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H

/* Interface of the 4-way parallel (AVX2) SHAKE128/SHAKE256 implementation. */

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Four interleaved Keccak states: 25 vectors of four 64-bit words, where
 * word k of each vector belongs to the k-th of the four parallel instances.
 */
typedef struct {
__m256i s[25];
} keccakx4_state;

/*
 * 4-way Keccak permutation (implemented in assembly).
 * s:  the 25-vector interleaved state, updated in place.
 * rc: table of round constants.
 */
void PQCLEAN_DILITHIUM2_AVX2_f1600x4(__m256i *s, const uint64_t *rc);

/*
 * Initialise state and absorb four SHAKE128 inputs of inlen bytes each,
 * including the final padding.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

/*
 * Squeeze nblocks full SHAKE128 rate blocks from state into each of
 * out0..out3.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state);

/*
 * Initialise state and absorb four SHAKE256 inputs of inlen bytes each,
 * including the final padding.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

/*
 * Squeeze nblocks full SHAKE256 rate blocks from state into each of
 * out0..out3.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state);

/*
 * One-shot 4-way SHAKE128: outlen output bytes per lane from inlen input
 * bytes per lane.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

/*
 * One-shot 4-way SHAKE256: outlen output bytes per lane from inlen input
 * bytes per lane.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

#endif

+ 240
- 0
crypto_sign/dilithium/dilithium2/avx2/invntt.S 查看文件

@@ -0,0 +1,240 @@
#include "cdecl.h"
.include "shuffle.inc"

/*
 * Inverse-transform butterfly on two vectors of eight 32-bit coefficients.
 *
 * Arguments are ymm register numbers:
 *   l, h     - the two coefficient vectors (both overwritten)
 *   zl0, zl1 - multipliers used for the even / odd dword positions in the
 *              first (low-product) multiplication
 *   zh0, zh1 - multipliers used for the even / odd dword positions in the
 *              second multiplication
 *
 * Result: l = l + h, and h = the high 32 bits of
 *   (h - l) * zh  -  ymm0 * low32((h - l) * zl)
 * per coefficient. ymm0 is expected to hold the constant loaded at function
 * entry. NOTE(review): this looks like a Montgomery multiplication of (h - l)
 * by a twiddle factor, with zl = zeta * q^-1 and zh = zeta; confirm against
 * the constants table.
 *
 * Clobbers ymm12, ymm13 and ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* ymm12 = h - l (difference to be multiplied), l = l + h */
vpsubd %ymm\l,%ymm\h,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* vpmuldq only multiplies even dwords, so the odd dwords are copied down into h first */
vpmuldq %ymm\zl0,%ymm12,%ymm13
vmovshdup %ymm12,%ymm\h
vpmuldq %ymm\zl1,%ymm\h,%ymm14

/* full 64-bit products with the second multiplier set */
vpmuldq %ymm\zh0,%ymm12,%ymm12
vpmuldq %ymm\zh1,%ymm\h,%ymm\h

/* multiply the low 32 bits of the first products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* only the high dwords of these differences are kept below */
vpsubd %ymm13,%ymm12,%ymm12
vpsubd %ymm14,%ymm\h,%ymm\h

/* re-interleave: even positions from ymm12 high dwords, odd positions from h */
vmovshdup %ymm12,%ymm12
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h
.endm

/*
 * Inverse-transform levels 0 to 5 on one 256-byte slice of the array at rdi.
 *
 * off selects the slice (byte offset 256*off). Eight 32-byte vectors are
 * loaded into ymm4..ymm11, six butterfly levels are applied entirely in
 * registers, and the eight result vectors are written back to the same slice.
 * Twiddle factors are read from the table at rsi at offsets derived from
 * _ZETAS_QINV and _ZETAS; levels 0 to 4 load them with vpermq $0x1B, which
 * reverses the order of the four 64-bit quarters.
 *
 * The shuffle2 / shuffle4 / shuffle8 macros come from shuffle.inc (not shown
 * here); presumably they regroup coefficients between vectors so each level
 * pairs the right elements - verify against shuffle.inc.
 *
 * Uses ymm1..ymm15 as scratch; ymm0 must hold the constant loaded at entry.
 */
.macro levels0t5 off
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

/* level 0 */
/* ymm3/ymm15 carry the odd-position multipliers; vmovshdup moves them down into ymm1/ymm2, which are passed for the even positions */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,5,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 6,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,9,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 10,11,1,3,2,15

/* level 1 */
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,6,1,3,2,15
butterfly 5,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,10,1,3,2,15
butterfly 9,11,1,3,2,15

/* level 2 */
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,8,1,3,2,15
butterfly 5,9,1,3,2,15
butterfly 6,10,1,3,2,15
butterfly 7,11,1,3,2,15

/* level 3 */
shuffle2 4,5,3,5
shuffle2 6,7,4,7
shuffle2 8,9,6,9
shuffle2 10,11,8,11

/* from here on the butterfly defaults (ymm1/ymm2 for all positions) are used */
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2
butterfly 3,5
butterfly 4,7
butterfly 6,9
butterfly 8,11

/* level 4 */
shuffle4 3,4,10,4
shuffle4 6,8,3,8
shuffle4 5,7,6,7
shuffle4 9,11,5,11

vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2
butterfly 10,4
butterfly 3,8
butterfly 6,7
butterfly 5,11

/* level 5 */
shuffle8 10,3,9,3
shuffle8 6,5,10,5
shuffle8 4,8,6,8
shuffle8 7,11,4,11

/* a single twiddle factor per slice, broadcast to all positions */
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2
butterfly 9,3
butterfly 10,5
butterfly 6,8
butterfly 4,11

/* write the slice back; the register order reflects the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm10,256*\off+ 32(%rdi)
vmovdqa %ymm6,256*\off+ 64(%rdi)
vmovdqa %ymm4,256*\off+ 96(%rdi)
vmovdqa %ymm3,256*\off+128(%rdi)
vmovdqa %ymm5,256*\off+160(%rdi)
vmovdqa %ymm8,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

/*
 * Inverse-transform levels 6 and 7 plus a final constant multiplication.
 *
 * Loads eight vectors spaced 128 bytes apart, starting at byte offset 32*off
 * from rdi, into ymm4..ymm11 and applies two butterfly levels with broadcast
 * twiddle factors. ymm8..ymm11 are stored right after level 7. ymm4..ymm7 are
 * first multiplied by the constants at _8XDIV_QINV / _8XDIV using the same
 * multiply-and-take-high-half sequence as the butterfly macro, then stored.
 * NOTE(review): presumably this is the final scaling of the inverse transform
 * and the upper half already had it folded into its twiddle factor - confirm
 * against the constants table.
 *
 * Uses ymm1, ymm2 and ymm12..ymm15 as scratch; ymm0 must hold the constant
 * loaded at entry.
 */
.macro levels6t7 off
vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* level 6 */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

/* level 7 */
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* upper four vectors are finished; storing them frees ymm8/ymm9 as scratch */
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)

/* multiply ymm4 and ymm5 by the _8XDIV constant (even dwords, then odd dwords via ymm8/ymm9) */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1
vmovdqa (_8XDIV)*4(%rsi),%ymm2
vpmuldq %ymm1,%ymm4,%ymm12
vpmuldq %ymm1,%ymm5,%ymm13
vmovshdup %ymm4,%ymm8
vmovshdup %ymm5,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm4,%ymm4
vpmuldq %ymm2,%ymm5,%ymm5
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm4,%ymm4
vpsubd %ymm13,%ymm5,%ymm5
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm4,%ymm4
vmovshdup %ymm5,%ymm5
vpblendd $0xAA,%ymm8,%ymm4,%ymm4
vpblendd $0xAA,%ymm9,%ymm5,%ymm5

/* same sequence for ymm6 and ymm7 */
vpmuldq %ymm1,%ymm6,%ymm12
vpmuldq %ymm1,%ymm7,%ymm13
vmovshdup %ymm6,%ymm8
vmovshdup %ymm7,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm6,%ymm6
vpmuldq %ymm2,%ymm7,%ymm7
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm6,%ymm6
vpsubd %ymm13,%ymm7,%ymm7
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm6,%ymm6
vmovshdup %ymm7,%ymm7
vpblendd $0xAA,%ymm8,%ymm6,%ymm6
vpblendd $0xAA,%ymm9,%ymm7,%ymm7

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
.endm

/*
 * Entry point of the in-place inverse transform.
 *
 * rdi points to the coefficient array (the macros below touch bytes 0..1023)
 * and rsi to the constants table; under the System V calling convention these
 * are the two arguments of the C prototype in ntt.h. The symbol is exported
 * both with and without a leading underscore via the cdecl/_cdecl macros from
 * cdecl.h.
 *
 * ymm0 is loaded once with the vector at _8XQ and kept for all butterflies.
 * Levels 0 to 5 run on each of the four 256-byte slices, then levels 6 and 7
 * run on four groups of vectors spaced 128 bytes apart.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx):
vmovdqa _8XQ*4(%rsi),%ymm0

levels0t5 0
levels0t5 1
levels0t5 2
levels0t5 3

levels6t7 0
levels6t7 1
levels6t7 2
levels6t7 3

ret

+ 199
- 0
crypto_sign/dilithium/dilithium2/avx2/ntt.S 查看文件

@@ -0,0 +1,199 @@
#include "cdecl.h"
.include "shuffle.inc"

/*
 * Forward-transform butterfly on two vectors of eight 32-bit coefficients.
 *
 * Arguments are ymm register numbers:
 *   l, h     - the two coefficient vectors (both overwritten)
 *   zl0, zl1 - multipliers used for the even / odd dword positions in the
 *              first (low-product) multiplication
 *   zh0, zh1 - multipliers used for the even / odd dword positions in the
 *              second multiplication
 *
 * With t = high32(h * zh) - high32(ymm0 * low32(h * zl)) per coefficient,
 * the result is l = l + t and h = l - t (using the old value of l).
 * ymm0 is expected to hold the constant loaded at function entry.
 * NOTE(review): t looks like the Montgomery product of h and a twiddle
 * factor, with zl = zeta * q^-1 and zh = zeta; confirm against the constants
 * table.
 *
 * Clobbers ymm12, ymm13 and ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* vpmuldq only multiplies even dwords, so the odd dwords of h are copied down into ymm12 */
vpmuldq %ymm\zl0,%ymm\h,%ymm13
vmovshdup %ymm\h,%ymm12
vpmuldq %ymm\zl1,%ymm12,%ymm14

/* full 64-bit products with the second multiplier set */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h
vpmuldq %ymm\zh1,%ymm12,%ymm12

/* multiply the low 32 bits of the first products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* h = high dwords of the second products, back in coefficient order */
vmovshdup %ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h

/* ymm12 = l - h, l = l + h */
vpsubd %ymm\h,%ymm\l,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* ymm13 = high dwords of the ymm0 products, back in coefficient order */
vmovshdup %ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13

/* apply the correction term with opposite signs to the two outputs */
vpaddd %ymm13,%ymm12,%ymm\h
vpsubd %ymm13,%ymm\l,%ymm\l
.endm

/*
 * Forward-transform levels 0 and 1.
 *
 * Loads eight vectors spaced 128 bytes apart, starting at byte offset 32*off
 * from rdi, into ymm4..ymm11, applies two butterfly levels with twiddle
 * factors broadcast from the table at rsi, and stores the vectors back to the
 * same locations.
 *
 * Uses ymm1, ymm2 and ymm12..ymm14 as scratch; ymm0 must hold the constant
 * loaded at entry.
 */
.macro levels0t1 off
/* level 0 */
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2

vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* pair each lower-half vector with the one 512 bytes above it */
butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* level 1 */
/* separate twiddle factors for the lower and the upper half */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)
.endm

/*
 * Forward-transform levels 2 to 7 on one 256-byte slice of the array at rdi.
 *
 * off selects the slice (byte offset 256*off). Eight 32-byte vectors are
 * loaded into ymm4..ymm11, six butterfly levels are applied entirely in
 * registers, and the eight result vectors are written back to the same slice.
 * Twiddle factors are read from the table at rsi at offsets derived from
 * _ZETAS_QINV and _ZETAS.
 *
 * The shuffle8 / shuffle4 / shuffle2 macros come from shuffle.inc (not shown
 * here); presumably they regroup coefficients between vectors so each level
 * pairs the right elements - verify against shuffle.inc.
 *
 * Uses ymm1..ymm15 as scratch; ymm0 must hold the constant loaded at entry.
 */
.macro levels2t7 off
/* level 2 */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

/* a single twiddle factor per slice, broadcast to all positions */
vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

/* level 3 */
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2

butterfly 3,5
butterfly 8,10
butterfly 4,6
butterfly 9,11

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

/* level 4 */
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2

butterfly 7,8
butterfly 5,6
butterfly 3,4
butterfly 10,11

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

/* level 5 */
/* from here on even and odd positions use different twiddle factors: ymm10/ymm15 get the odd-position values moved down and are passed as zl1/zh1 */
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15

butterfly 9,5,1,10,2,15
butterfly 8,4,1,10,2,15
butterfly 7,3,1,10,2,15
butterfly 6,11,1,10,2,15

/* level 6 */
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,7,1,10,2,15
butterfly 8,6,1,10,2,15

vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,3,1,10,2,15
butterfly 4,11,1,10,2,15

/* level 7 */
/* each of the four butterflies gets its own vector of twiddle factors */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,8,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 7,6,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,4,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 3,11,1,10,2,15

/* write the slice back; the register order reflects the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm8,256*\off+ 32(%rdi)
vmovdqa %ymm7,256*\off+ 64(%rdi)
vmovdqa %ymm6,256*\off+ 96(%rdi)
vmovdqa %ymm5,256*\off+128(%rdi)
vmovdqa %ymm4,256*\off+160(%rdi)
vmovdqa %ymm3,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

/*
 * Entry point of the in-place forward transform.
 *
 * rdi points to the coefficient array (the macros below touch bytes 0..1023)
 * and rsi to the constants table; under the System V calling convention these
 * are the two arguments of the C prototype in ntt.h. The symbol is exported
 * both with and without a leading underscore via the cdecl/_cdecl macros from
 * cdecl.h.
 *
 * ymm0 is loaded once with the vector at _8XQ and kept for all butterflies.
 * Levels 0 and 1 run on four groups of vectors spaced 128 bytes apart, then
 * levels 2 to 7 run on each of the four 256-byte slices.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx):
vmovdqa _8XQ*4(%rsi),%ymm0

levels0t1 0
levels0t1 1
levels0t1 2
levels0t1 3

levels2t7 0
levels2t7 1
levels2t7 2
levels2t7 3

ret


+ 14
- 0
crypto_sign/dilithium/dilithium2/avx2/ntt.h 查看文件

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTT_H
#define PQCLEAN_DILITHIUM2_AVX2_NTT_H

/* C prototypes for the AVX2 assembly routines operating on polynomials. */

#include <immintrin.h>

/*
 * In-place forward and inverse transforms of the coefficient array a, using
 * the constants table passed as the second argument. The assembly uses
 * aligned 32-byte loads/stores and touches 1024 bytes of a.
 */
void PQCLEAN_DILITHIUM2_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);
void PQCLEAN_DILITHIUM2_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);

/* Defined in another file; presumably reorders a into the layout produced by the forward transform - verify. */
void PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx(__m256i *a);

/* Defined in another file; presumably c = a * b coefficient-wise (and, for _acc, accumulated over a vector) - verify. */
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);

#endif

+ 261
- 0
crypto_sign/dilithium/dilithium2/avx2/packing.c 查看文件

@@ -0,0 +1,261 @@
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"


/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_pack_pk
*
* Description: Serialize the public key as rho followed by the K
*              bit-packed polynomials of t1.
*
* Arguments:   - uint8_t pk[]: output byte array
*              - const uint8_t rho[]: SEEDBYTES bytes of rho
*              - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES],
                                     const uint8_t rho[SEEDBYTES],
                                     const polyveck *t1) {
    uint8_t *dst = pk;
    unsigned int n;

    /* rho is stored verbatim at the front */
    for (n = 0; n < SEEDBYTES; ++n) {
        *dst++ = rho[n];
    }

    /* followed by the K packed t1 polynomials, back to back */
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(dst, &t1->vec[n]);
        dst += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_unpack_pk
*
* Description: Split a serialized public key into rho and the vector t1.
*
* Arguments:   - uint8_t rho[]: output byte array for rho (SEEDBYTES bytes)
*              - polyveck *t1: pointer to output vector t1
*              - const uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES],
                                       polyveck *t1,
                                       const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) {
    const uint8_t *src = pk;
    unsigned int n;

    /* leading SEEDBYTES bytes are rho */
    for (n = 0; n < SEEDBYTES; ++n) {
        rho[n] = *src++;
    }

    /* the rest is K packed t1 polynomials, back to back */
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[n], src);
        src += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_pack_sk
*
* Description: Serialize the secret key. The fields are written in the
*              order rho, key, tr, s1, s2, t0.
*
* Arguments:   - uint8_t sk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const uint8_t tr[]: byte array containing tr
*              - const uint8_t key[]: byte array containing key
*              - const polyveck *t0: pointer to vector t0
*              - const polyvecl *s1: pointer to vector s1
*              - const polyveck *s2: pointer to vector s2
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
                                     const uint8_t rho[SEEDBYTES],
                                     const uint8_t tr[CRHBYTES],
                                     const uint8_t key[SEEDBYTES],
                                     const polyveck *t0,
                                     const polyvecl *s1,
                                     const polyveck *s2) {
    uint8_t *dst = sk;
    unsigned int n;

    /* three raw byte strings: rho, key, tr */
    for (n = 0; n < SEEDBYTES; ++n) {
        *dst++ = rho[n];
    }
    for (n = 0; n < SEEDBYTES; ++n) {
        *dst++ = key[n];
    }
    for (n = 0; n < CRHBYTES; ++n) {
        *dst++ = tr[n];
    }

    /* L packed polynomials of s1 */
    for (n = 0; n < L; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(dst, &s1->vec[n]);
        dst += POLYETA_PACKEDBYTES;
    }

    /* K packed polynomials of s2 */
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(dst, &s2->vec[n]);
        dst += POLYETA_PACKEDBYTES;
    }

    /* K packed polynomials of t0 */
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(dst, &t0->vec[n]);
        dst += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_unpack_sk
*
* Description: Split a serialized secret key into its fields. The fields
*              are read in the order rho, key, tr, s1, s2, t0.
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - uint8_t tr[]: output byte array for tr
*              - uint8_t key[]: output byte array for key
*              - polyveck *t0: pointer to output vector t0
*              - polyvecl *s1: pointer to output vector s1
*              - polyveck *s2: pointer to output vector s2
*              - const uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
                                       uint8_t tr[CRHBYTES],
                                       uint8_t key[SEEDBYTES],
                                       polyveck *t0,
                                       polyvecl *s1,
                                       polyveck *s2,
                                       const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) {
    const uint8_t *src = sk;
    unsigned int n;

    /* three raw byte strings: rho, key, tr */
    for (n = 0; n < SEEDBYTES; ++n) {
        rho[n] = *src++;
    }
    for (n = 0; n < SEEDBYTES; ++n) {
        key[n] = *src++;
    }
    for (n = 0; n < CRHBYTES; ++n) {
        tr[n] = *src++;
    }

    /* L packed polynomials of s1 */
    for (n = 0; n < L; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[n], src);
        src += POLYETA_PACKEDBYTES;
    }

    /* K packed polynomials of s2 */
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[n], src);
        src += POLYETA_PACKEDBYTES;
    }

    /* K packed polynomials of t0 */
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[n], src);
        src += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_pack_sig
*
* Description: Serialize a signature as c, then the L packed polynomials
*              of z, then the hint encoding: OMEGA bytes listing the indices
*              of the non-zero hint coefficients polynomial by polynomial,
*              followed by K bytes holding the running count after each
*              polynomial.
*
* Arguments:   - uint8_t sig[]: output byte array
*              - const uint8_t *c: challenge hash of length SEEDBYTES
*              - const polyvecl *z: pointer to vector z
*              - const polyveck *h: pointer to hint vector h
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES],
                                      const uint8_t c[SEEDBYTES],
                                      const polyvecl *z,
                                      const polyveck *h) {
    uint8_t *dst = sig;
    unsigned int row, col;
    unsigned int used = 0;   /* hint index slots filled so far */

    for (col = 0; col < SEEDBYTES; ++col) {
        dst[col] = c[col];
    }
    dst += SEEDBYTES;

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM2_AVX2_polyz_pack(dst + row * POLYZ_PACKEDBYTES, &z->vec[row]);
    }
    dst += L * POLYZ_PACKEDBYTES;

    /* dst now points at the hint area; clear it so unused slots read as 0 */
    for (col = 0; col < OMEGA + K; ++col) {
        dst[col] = 0;
    }

    for (row = 0; row < K; ++row) {
        for (col = 0; col < N; ++col) {
            if (h->vec[row].coeffs[col] == 0) {
                continue;
            }
            dst[used] = (uint8_t) col;
            ++used;
        }

        /* running total after this polynomial */
        dst[OMEGA + row] = (uint8_t) used;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_unpack_sig
*
* Description: Parse a serialized signature sig = (c, z, h) and validate
*              the hint encoding. The hint is rejected when a running
*              total decreases or exceeds OMEGA, when the indices of one
*              polynomial are not strictly increasing, or when an unused
*              index slot is nonzero.
*
* Arguments:   - uint8_t *c: pointer to output challenge hash
*              - polyvecl *z: pointer to output vector z
*              - polyveck *h: pointer to output hint vector h
*              - const uint8_t sig[]: byte array containing
*                                     bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
* On a malformed signature c and z have been written and h may be
* partially written.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES],
                                       polyvecl *z,
                                       polyveck *h,
                                       const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) {
    const uint8_t *const z_in = sig + SEEDBYTES;
    const uint8_t *const hint_idx = z_in + L * POLYZ_PACKEDBYTES;
    const uint8_t *const hint_cnt = hint_idx + OMEGA;
    unsigned int row, pos;
    unsigned int start = 0;

    for (pos = 0; pos < SEEDBYTES; ++pos) {
        c[pos] = sig[pos];
    }

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[row], z_in + row * POLYZ_PACKEDBYTES);
    }

    /* Decode h, one polynomial per running-total byte. */
    for (row = 0; row < K; ++row) {
        poly *hp = &h->vec[row];

        for (pos = 0; pos < N; ++pos) {
            hp->coeffs[pos] = 0;
        }

        /* Totals must be non-decreasing and bounded by OMEGA. */
        if (hint_cnt[row] < start || hint_cnt[row] > OMEGA) {
            return 1;
        }

        for (pos = start; pos < hint_cnt[row]; ++pos) {
            /* Coefficients are ordered for strong unforgeability */
            if (pos > start && hint_idx[pos] <= hint_idx[pos - 1]) {
                return 1;
            }
            hp->coeffs[hint_idx[pos]] = 1;
        }

        start = hint_cnt[row];
    }

    /* Extra indices are zero for strong unforgeability */
    for (pos = start; pos < OMEGA; ++pos) {
        if (hint_idx[pos] != 0) {
            return 1;
        }
    }

    return 0;
}

+ 31
- 0
crypto_sign/dilithium/dilithium2/avx2/packing.h 查看文件

@@ -0,0 +1,31 @@
/* Serialization of public keys, secret keys and signatures to and from
 * byte arrays. Buffer sizes are the *_BYTES macros from params.h. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM2_AVX2_PACKING_H
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* Writes the public key pk from the seed rho and the vector t1. */
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);

/* Writes the secret key sk from (rho, tr, key, t0, s1, s2).
 * NOTE(review): the byte order is presumably the one unpack_sk reads back
 * (rho | key | tr | s1 | s2 | t0) -- confirm against the definition. */
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2);

/* Writes the signature sig = (c, z, h): challenge hash, packed z, hint. */
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);

/* Reads rho and t1 back out of the public key pk. */
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]);

/* Reads the secret key components out of sk. All arguments except sk are
 * outputs. Bytes are consumed in the order rho | key | tr | s1 | s2 | t0. */
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]);

/* Reads (c, z, h) out of sig and validates the hint encoding.
 * Returns 1 if the signature is malformed, 0 otherwise. */
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]);

#endif

+ 41
- 0
crypto_sign/dilithium/dilithium2/avx2/params.h 查看文件

@@ -0,0 +1,41 @@
/* Compile-time parameters of the Dilithium2 AVX2 implementation. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM2_AVX2_PARAMS_H



/* Byte length of rho, key and the challenge hash c. */
#define SEEDBYTES 32
/* Byte length of tr. */
#define CRHBYTES 48
/* Number of coefficients per polynomial. */
#define N 256
/* Modulus used for coefficient arithmetic. */
#define Q 8380417
/* power2round splits a coefficient as a1*2^D + a0. */
#define D 13
/* NOTE(review): presumably the root of unity behind the NTT tables -- confirm. */
#define ROOT_OF_UNITY 1753

/* Number of polynomials in a polyveck. */
#define K 4
/* Number of polynomials in a polyvecl. */
#define L 4
/* NOTE(review): ETA, TAU, BETA, GAMMA1 and GAMMA2 are scheme parameters
 * whose roles are defined by the Dilithium specification; their use is
 * not visible in this header -- consult the specification. */
#define ETA 2
#define TAU 39
#define BETA 78
#define GAMMA1 (1 << 17)
#define GAMMA2 ((Q-1)/88)
/* Number of index bytes in the packed hint; unpack_sig rejects running
 * totals larger than this. */
#define OMEGA 80
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2"


/* Packed sizes in bytes of a single polynomial, per encoding.
 * For N = 256 these equal N*bits/8 with bits = 10 (t1), 13 (t0),
 * 18 (z), 6 (w1) and 3 (eta). */
#define POLYT1_PACKEDBYTES 320
#define POLYT0_PACKEDBYTES 416
/* Packed hint vector: OMEGA index bytes followed by K running totals. */
#define POLYVECH_PACKEDBYTES (OMEGA + K)

#define POLYZ_PACKEDBYTES 576

#define POLYW1_PACKEDBYTES 192

#define POLYETA_PACKEDBYTES 96

/* Total sizes; with the values above: 1312 (public key),
 * 2544 (secret key) and 2420 (signature) bytes. */
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+ L*POLYETA_PACKEDBYTES \
+ K*POLYETA_PACKEDBYTES \
+ K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif

+ 199
- 0
crypto_sign/dilithium/dilithium2/avx2/pointwise.S 查看文件

@@ -0,0 +1,199 @@
#include "params.h"
#include "cdecl.h"

/*
 * pointwise_avx: coefficient-wise product of two arrays of 256 signed
 * 32-bit values with a Montgomery-style reduction of every product.
 *
 * Pointers used by the code:
 *   rdi - destination, written 32 bytes at a time
 *   rsi - first input
 *   rdx - second input
 *   rcx - constant table, indexed with _8XQINV and _8XQ
 * NOTE(review): the register assignment presumably follows the System V
 * AMD64 argument order (out, a, b, constants) -- confirm against the
 * C prototype. All accesses use vmovdqa, so every pointer must be
 * 32-byte aligned.
 *
 * Reduction applied to each 64-bit product p (vpmuldq reads only the low
 * 32 bits of each 64-bit lane):
 *   t = lo32(p) * QINV;  t = lo32(t) * Q;  result = hi32(p - t)
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
#consts
/* ymm0 and ymm1 hold the two reduction constants during the whole routine. */
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

/* eax counts main-loop iterations; each one handles 96 bytes (24 values). */
xor %eax,%eax
_looptop1:
#load
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa 64(%rsi),%ymm6
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
/* Bring the odd-indexed 32-bit elements into the low half of each 64-bit
   lane (by shift or by vmovshdup) so that vpmuldq can multiply them. */
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vmovshdup %ymm14,%ymm15

#mul
/* Even registers now hold products of even elements, odd registers the
   products of odd elements, each as a signed 64-bit value. */
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5
vpmuldq %ymm6,%ymm14,%ymm6
vpmuldq %ymm7,%ymm15,%ymm7

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm0,%ymm6,%ymm14
vpmuldq %ymm0,%ymm7,%ymm15
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpmuldq %ymm1,%ymm14,%ymm14
vpmuldq %ymm1,%ymm15,%ymm15
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsubq %ymm14,%ymm6,%ymm6
vpsubq %ymm15,%ymm7,%ymm7
/* The result is the high 32 bits of each lane. For the even-element
   registers move it down to the even dword position; the odd-element
   registers already carry it in the odd dword position. */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vmovshdup %ymm6,%ymm6

#store
/* Mask 0xAA takes the odd dwords from ymm3/ymm5/ymm7 and keeps the even
   dwords of ymm2/ymm4/ymm6, restoring the original element order. */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4
vpblendd $0xAA,%ymm7,%ymm6,%ymm6
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm6,64(%rdi)

add $96,%rdi
add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1

/* Tail: 10 iterations covered 960 bytes; the remaining 64 bytes
   (16 values) are processed here with two vectors instead of three. */
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
/* Operands are swapped relative to the main loop: mask 0x55 takes the
   even dwords from ymm2/ymm4 and the odd dwords from ymm3/ymm5, which
   yields the same interleaving. */
vpblendd $0x55,%ymm2,%ymm3,%ymm2
vpblendd $0x55,%ymm4,%ymm5,%ymm4
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

ret

/*
 * pointwise off: load 16 32-bit values from each input at byte offset
 * off (rsi and rdx) and leave their unreduced 64-bit products in
 * ymm6/ymm8 (even elements) and ymm7/ymm9 (odd elements).
 * Overwrites ymm6-ymm13.
 */
.macro pointwise off
#load
vmovdqa \off(%rsi),%ymm6
vmovdqa \off+32(%rsi),%ymm8
vmovdqa \off(%rdx),%ymm10
vmovdqa \off+32(%rdx),%ymm12
/* Odd-indexed elements are moved to the low half of each 64-bit lane. */
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm6,%ymm10,%ymm6
vpmuldq %ymm7,%ymm11,%ymm7
vpmuldq %ymm8,%ymm12,%ymm8
vpmuldq %ymm9,%ymm13,%ymm9
.endm

/*
 * acc: add the four 64-bit product vectors ymm6-ymm9 (as left by the
 * pointwise macro) onto the running sums in ymm2-ymm5.
 */
.macro acc
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm7,%ymm3,%ymm3
vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5
.endm

/*
 * pointwise_acc_avx: for each of 256 coefficient positions, sum the
 * products of four input pairs located 1024 bytes apart (offsets 0, 1024,
 * 2048, 3072 from rsi and rdx) as 64-bit values, then apply a single
 * Montgomery-style reduction to the sum and store the 32-bit result.
 *
 * Pointers used by the code:
 *   rdi - destination (one array of 256 values)
 *   rsi - first input (four consecutive arrays)
 *   rdx - second input (four consecutive arrays)
 *   rcx - constant table, indexed with _8XQINV and _8XQ
 * The C caller in polyvec.c passes (w, u, v, qdata) in that order.
 * All accesses use vmovdqa, so every pointer must be 32-byte aligned.
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
#consts
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

/* eax counts iterations; each one handles 64 bytes (16 positions). */
xor %eax,%eax
_looptop2:
pointwise 0

#mov
/* The first set of products initializes the accumulators ymm2-ymm5. */
vmovdqa %ymm6,%ymm2
vmovdqa %ymm7,%ymm3
vmovdqa %ymm8,%ymm4
vmovdqa %ymm9,%ymm5

pointwise 1024
acc

pointwise 2048
acc

pointwise 3072
acc




#reduce
/* t = lo32(sum) * QINV; t = lo32(t) * Q; result = hi32(sum - t). */
vpmuldq %ymm0,%ymm2,%ymm6
vpmuldq %ymm0,%ymm3,%ymm7
vpmuldq %ymm0,%ymm4,%ymm8
vpmuldq %ymm0,%ymm5,%ymm9
vpmuldq %ymm1,%ymm6,%ymm6
vpmuldq %ymm1,%ymm7,%ymm7
vpmuldq %ymm1,%ymm8,%ymm8
vpmuldq %ymm1,%ymm9,%ymm9
vpsubq %ymm6,%ymm2,%ymm2
vpsubq %ymm7,%ymm3,%ymm3
vpsubq %ymm8,%ymm4,%ymm4
vpsubq %ymm9,%ymm5,%ymm5
/* Move the even-element results from the high to the low dword. */
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
/* Mask 0xAA takes the odd dwords from ymm3/ymm5, restoring element order. */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4

vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

add $64,%rsi
add $64,%rdx
add $64,%rdi
add $1,%eax
cmp $16,%eax
jb _looptop2

ret

+ 1027
- 0
crypto_sign/dilithium/dilithium2/avx2/poly.c
文件差异内容过多而无法显示
查看文件


+ 79
- 0
crypto_sign/dilithium/dilithium2/avx2/poly.h 查看文件

@@ -0,0 +1,79 @@
/* Polynomial type and the single-polynomial operations of the AVX2
 * implementation. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLY_H
#define PQCLEAN_DILITHIUM2_AVX2_POLY_H
#include "align.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* A polynomial with N 32-bit coefficients. Code in this project accesses
 * it both as .coeffs[] (scalar) and as .vec (vector view).
 * NOTE(review): ALIGNED_INT32 comes from align.h; presumably a union
 * aligned for 256-bit loads -- confirm there. */
typedef ALIGNED_INT32(N) poly;

/* Coefficient range reduction. */
void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a);

/* Coefficient-wise arithmetic; c may be computed from a and b. */
void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a);

/* NTT-domain transforms and multiplication. */
void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

/* Rounding and hint handling. */
void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h);

/* Norm check and sampling. The *_preinit variants take an already
 * initialized stream state instead of a seed and nonce. */
int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, int32_t B);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Four-way samplers: fill four polynomials from one seed and four nonces. */
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t seed[SEEDBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t seed[SEEDBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t seed[CRHBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);

/* Bit-packing of single polynomials; sizes are the POLY*_PACKEDBYTES
 * macros from params.h. */
void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]);

/* NOTE(review): the declared bounds below exceed the packed size by 14
 * (polyz_unpack input) and 8 (polyw1_pack output) bytes. Presumably the
 * vectorized code touches that many bytes past the packed data, so
 * callers must provide buffers of the declared size -- confirm in poly.c. */
void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]);

void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a);

#endif

+ 474
- 0
crypto_sign/dilithium/dilithium2/avx2/polyvec.c 查看文件

@@ -0,0 +1,474 @@
#include "consts.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

/* Marks a parameter as intentionally unused. The argument is parenthesized
 * so that the cast applies to the whole expression, e.g. UNUSED(a + 1). */
#define UNUSED(x) (void)(x)

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
*              random coefficients a_{i,j} by performing rejection
*              sampling on the output stream of SHAKE128(rho|j|i)
*              or AES256CTR(rho,j|i). The matrix is filled one row at a
*              time through the four per-row helpers.
*
* Arguments:   - polyvecl mat[K]: output matrix (rows 0..3 are written)
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
    polyvecl *const row = mat;

    /* The second row argument of the helpers is unused, hence NULL. */
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(row + 0, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(row + 1, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(row + 2, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(row + 3, NULL, rho);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0
*
* Description: Samples the four polynomials of matrix row 0 from rho with
*              nonces 0..3 and applies poly_nttunpack to each of them.
*
* Arguments:   - polyvecl *rowa: output row
*              - polyvecl *rowb: unused
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3, rho, 0, 1, 2, 3);
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1
*
* Description: Samples the four polynomials of matrix row 1 from rho with
*              nonces 256..259 and applies poly_nttunpack to each of them.
*
* Arguments:   - polyvecl *rowa: output row
*              - polyvecl *rowb: unused
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3, rho, 256, 257, 258, 259);
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2
*
* Description: Samples the four polynomials of matrix row 2 from rho with
*              nonces 512..515 and applies poly_nttunpack to each of them.
*
* Arguments:   - polyvecl *rowa: output row
*              - polyvecl *rowb: unused
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3, rho, 512, 513, 514, 515);
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3
*
* Description: Samples the four polynomials of matrix row 3 from rho with
*              nonces 768..771 and applies poly_nttunpack to each of them.
*
* Arguments:   - polyvecl *rowa: output row
*              - polyvecl *rowb: unused
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const entry = rowa->vec;
    unsigned int col;

    (void)rowb;
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(entry, entry + 1, entry + 2, entry + 3, rho, 768, 769, 770, 771);
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(entry + col);
    }
}


/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery
*
* Description: Matrix-vector product in the NTT domain: entry i of t is
*              the accumulated pointwise product of matrix row i with v.
*
* Arguments:   - polyveck *t: pointer to output vector
*              - const polyvecl mat[K]: input matrix, one polyvecl per row
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    unsigned int row = 0;

    while (row < K) {
        PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(t->vec + row, mat + row, v);
        ++row;
    }
}

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta
*
* Description: Samples every polynomial of a length-L vector with
*              poly_uniform_eta, using consecutive nonces starting at
*              the given one.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: byte array containing the seed
*              - uint16_t nonce: nonce of the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx;

    for (idx = 0; idx < L; ++idx) {
        /* Nonces wrap modulo 2^16, like a uint16_t counter would. */
        const uint16_t poly_nonce = (uint16_t)(nonce + idx);

        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(v->vec + idx, seed, poly_nonce);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1
*
* Description: Samples every polynomial of a length-L vector with
*              poly_uniform_gamma1. Polynomial i uses nonce L*nonce + i,
*              truncated to 16 bits.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: byte array containing the seed
*              - uint16_t nonce: vector-level nonce
*
* NOTE(review): the seed bound is declared as SEEDBYTES here while
* poly_uniform_gamma1 is declared with seed[CRHBYTES]; presumably callers
* pass a CRHBYTES-byte seed -- confirm against the caller.
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx = 0;

    while (idx < L) {
        const uint16_t poly_nonce = (uint16_t)(L * nonce + idx);

        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(v->vec + idx, seed, poly_nonce);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce
*
* Description: Applies poly_reduce to every polynomial of a vector of
*              length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = cur + L;

    while (cur != end) {
        PQCLEAN_DILITHIUM2_AVX2_poly_reduce(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze
*
* Description: Brings the coefficients of all polynomials in a vector of
*              length L to their standard representatives.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = cur + L;

    for (; cur != end; ++cur) {
        PQCLEAN_DILITHIUM2_AVX2_poly_freeze(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_add
*
* Description: Entry-wise sum of two vectors of polynomials of length L.
*              No modular reduction is performed.
*
* Arguments:   - polyvecl *w: pointer to output vector
*              - const polyvecl *u: pointer to first summand
*              - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_AVX2_poly_add(w->vec + idx, u->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt
*
* Description: Forward NTT of all polynomials in vector of length L. Output
*              coefficients can be up to 16*Q larger than input coefficients.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = cur + L;

    while (cur != end) {
        PQCLEAN_DILITHIUM2_AVX2_poly_ntt(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont
*
* Description: Applies poly_invntt_tomont to every polynomial of a vector
*              of length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = cur + L;

    for (; cur != end; ++cur) {
        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery
*
* Description: Multiplies every polynomial of a length-L vector by the
*              same polynomial a using poly_pointwise_montgomery.
*
* Arguments:   - polyvecl *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(r->vec + idx, a, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
*              resulting vector by 2^{-32} and add (accumulate) polynomials
*              in it. Input/output vectors are in NTT domain representation.
*              The work is delegated to the assembly routine
*              pointwise_acc_avx, which receives the vector views of the
*              output, of the first polynomial of each input, and of the
*              constant table.
*
* Arguments:   - poly *w: output polynomial
*              - const polyvecl *u: pointer to first input vector
*              - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) {
    PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->vec,
            u->vec[0].vec,
            v->vec[0].vec,
            PQCLEAN_DILITHIUM2_AVX2_qdata.vec);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce().
*
* Arguments: - const polyvecl *v: pointer to vector
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
unsigned int i;

for (i = 0; i < L; ++i) {
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
}

return 0;
}

/**************************************************************/
/************ Vectors of polynomials of length K **************/
/**************************************************************/

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta
*
* Description: Samples every polynomial of a length-K vector with
*              poly_uniform_eta, using consecutive nonces starting at
*              the given one.
*
* Arguments:   - polyveck *v: pointer to output vector
*              - const uint8_t seed[]: byte array containing the seed
*              - uint16_t nonce: nonce of the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx;

    for (idx = 0; idx < K; ++idx) {
        /* Nonces wrap modulo 2^16, like a uint16_t counter would. */
        const uint16_t poly_nonce = (uint16_t)(nonce + idx);

        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(v->vec + idx, seed, poly_nonce);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
*              to representatives in [-6283009,6283007].
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = cur + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2_AVX2_poly_reduce(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq
*
* Description: For all coefficients of polynomials in vector of length K
*              add Q if coefficient is negative.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = cur + K;

    for (; cur != end; ++cur) {
        PQCLEAN_DILITHIUM2_AVX2_poly_caddq(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze
*
* Description: Brings the coefficients of all polynomials in a vector of
*              length K to their standard representatives.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = cur + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2_AVX2_poly_freeze(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_add
*
* Description: Entry-wise sum of two vectors of polynomials of length K.
*              No modular reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first summand
*              - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_add(w->vec + idx, u->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_sub
*
* Description: Entry-wise difference u - v of two vectors of polynomials
*              of length K. No modular reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first input vector
*              - const polyveck *v: pointer to second input vector to be
*                                   subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_sub(w->vec + idx, u->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl
*
* Description: Multiply vector of polynomials of length K by 2^D without
*              modular reduction. Assumes input coefficients to be less
*              than 2^{31-D}.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = cur + K;

    for (; cur != end; ++cur) {
        PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt
*
* Description: Forward NTT of all polynomials in vector of length K. Output
*              coefficients can be up to 16*Q larger than input coefficients.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = cur + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2_AVX2_poly_ntt(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
*              in vector of length K. Input coefficients need to be less
*              than 2*Q.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = cur + K;

    for (; cur != end; ++cur) {
        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery
*
* Description: Multiplies every polynomial of a length-K vector by the
*              same polynomial a using poly_pointwise_montgomery.
*
* Arguments:   - polyveck *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(r->vec + idx, a, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce().
*
* Arguments: - const polyveck *v: pointer to vector
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) {
unsigned int i;

for (i = 0; i < K; ++i) {
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
}

return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
*              standard representatives.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_power2round(v1->vec + idx, v0->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute high and low bits a0, a1 such that
*              a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2,
*              except when a1 = (Q-1)/ALPHA, where a1 is set to 0 and
*              -ALPHA/2 <= a0 = a mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_decompose(v1->vec + idx, v0->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint
*
* Description: Compute hint vector. The per-polynomial outputs of
*              poly_make_hint are written back to back into hint: each
*              call starts writing where the previous count ended.
*
* Arguments:   - uint8_t *hint: pointer to output hint array
*              - const polyveck *v0: pointer to low part of input vector
*              - const polyveck *v1: pointer to high part of input vector
*
* Returns number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) {
    unsigned int total = 0;
    unsigned int idx;

    for (idx = 0; idx < K; ++idx) {
        const unsigned int added =
            PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(hint + total, v0->vec + idx, v1->vec + idx);

        total += added;
    }

    return total;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector.
*
* Arguments:   - polyveck *w: pointer to output vector of polynomials with
*                             corrected high bits
*              - const polyveck *u: pointer to input vector
*              - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(w->vec + idx, u->vec + idx, h->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1
*
* Description: Bit-packs the K polynomials of w1 back to back, each one
*              starting POLYW1_PACKEDBYTES bytes after the previous.
*
* Arguments:   - uint8_t r[]: output byte array
*              - const polyveck *w1: pointer to input vector
*
* NOTE(review): polyw1_pack is declared with an output of
* POLYW1_PACKEDBYTES + 8 bytes, so the last call presumably needs 8 bytes
* of slack behind r -- confirm against the caller's buffer size.
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) {
    uint8_t *out = r;
    unsigned int idx;

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(out, w1->vec + idx);
        out += POLYW1_PACKEDBYTES;
    }
}

+ 72
- 0
crypto_sign/dilithium/dilithium2/avx2/polyvec.h 查看文件

@@ -0,0 +1,72 @@
/* Vector-of-polynomial types (length L and length K) and the operations
 * that apply the poly.h primitives to every entry. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H
#define PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H
#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
poly vec[L];
} polyvecl;

/* Samples entry i with poly_uniform_eta and nonce + i. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Samples entry i with poly_uniform_gamma1 and nonce L*nonce + i.
 * NOTE(review): poly_uniform_gamma1 is declared with seed[CRHBYTES] while
 * this prototype says SEEDBYTES; presumably callers pass CRHBYTES bytes --
 * confirm against the caller. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v);

/* w = u + v entry-wise, without modular reduction. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v);
/* r[i] = a * v[i] for every entry (NTT domain). */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
/* w = sum over i of u[i] * v[i] (NTT domain), a single polynomial. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v);

/* Returns 1 if any entry fails the norm check against B, else 0. */
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B);

/* Vectors of polynomials of length K */
typedef struct {
poly vec[K];
} polyveck;

/* Samples entry i with poly_uniform_eta and nonce + i. */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v);

/* Entry-wise add, subtract and shift by D bits, all without reduction. */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v);
/* r[i] = a * v[i] for every entry (NTT domain). */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

/* Returns 1 if any entry fails the norm check against B, else 0. */
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t B);

/* Rounding and hints, applied entry by entry. make_hint returns the
 * total number of hint entries produced. */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

/* Packs the K polynomials of w1 at a stride of POLYW1_PACKEDBYTES bytes. */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

/* Expands the full matrix from rho by calling the per-row helpers 0..3. */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

/* Per-row expansion helpers. In polyvec.c only row0..row3 are defined,
 * and rowb is unused by all of them.
 * NOTE(review): row4..row7 are declared without a definition in
 * polyvec.c; presumably left over from parameter sets with more rows --
 * calling them would fail at link time unless defined elsewhere. */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);

/* t[i] = accumulated pointwise product of matrix row i with v. */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif

+ 408
- 0
crypto_sign/dilithium/dilithium2/avx2/rejsample.c 查看文件

@@ -0,0 +1,408 @@
#include "params.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>

/*
 * Lane-compaction lookup table indexed by an 8-bit selection mask.
 * Row m lists, in ascending order, the positions (0..7) of the bits that are
 * set in m; the unused trailing entries are padded with 0. The rejection
 * samplers in this file load a row and use it as a permutation / shuffle
 * index that moves the selected lanes to the front of a vector.
 */
const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 1, 2, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 0, 0, 0, 0, 0},
{ 3, 0, 0, 0, 0, 0, 0, 0},
{ 0, 3, 0, 0, 0, 0, 0, 0},
{ 1, 3, 0, 0, 0, 0, 0, 0},
{ 0, 1, 3, 0, 0, 0, 0, 0},
{ 2, 3, 0, 0, 0, 0, 0, 0},
{ 0, 2, 3, 0, 0, 0, 0, 0},
{ 1, 2, 3, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 1, 4, 0, 0, 0, 0, 0, 0},
{ 0, 1, 4, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 1, 2, 4, 0, 0, 0, 0, 0},
{ 0, 1, 2, 4, 0, 0, 0, 0},
{ 3, 4, 0, 0, 0, 0, 0, 0},
{ 0, 3, 4, 0, 0, 0, 0, 0},
{ 1, 3, 4, 0, 0, 0, 0, 0},
{ 0, 1, 3, 4, 0, 0, 0, 0},
{ 2, 3, 4, 0, 0, 0, 0, 0},
{ 0, 2, 3, 4, 0, 0, 0, 0},
{ 1, 2, 3, 4, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 0, 0, 0},
{ 5, 0, 0, 0, 0, 0, 0, 0},
{ 0, 5, 0, 0, 0, 0, 0, 0},
{ 1, 5, 0, 0, 0, 0, 0, 0},
{ 0, 1, 5, 0, 0, 0, 0, 0},
{ 2, 5, 0, 0, 0, 0, 0, 0},
{ 0, 2, 5, 0, 0, 0, 0, 0},
{ 1, 2, 5, 0, 0, 0, 0, 0},
{ 0, 1, 2, 5, 0, 0, 0, 0},
{ 3, 5, 0, 0, 0, 0, 0, 0},
{ 0, 3, 5, 0, 0, 0, 0, 0},
{ 1, 3, 5, 0, 0, 0, 0, 0},
{ 0, 1, 3, 5, 0, 0, 0, 0},
{ 2, 3, 5, 0, 0, 0, 0, 0},
{ 0, 2, 3, 5, 0, 0, 0, 0},
{ 1, 2, 3, 5, 0, 0, 0, 0},
{ 0, 1, 2, 3, 5, 0, 0, 0},
{ 4, 5, 0, 0, 0, 0, 0, 0},
{ 0, 4, 5, 0, 0, 0, 0, 0},
{ 1, 4, 5, 0, 0, 0, 0, 0},
{ 0, 1, 4, 5, 0, 0, 0, 0},
{ 2, 4, 5, 0, 0, 0, 0, 0},
{ 0, 2, 4, 5, 0, 0, 0, 0},
{ 1, 2, 4, 5, 0, 0, 0, 0},
{ 0, 1, 2, 4, 5, 0, 0, 0},
{ 3, 4, 5, 0, 0, 0, 0, 0},
{ 0, 3, 4, 5, 0, 0, 0, 0},
{ 1, 3, 4, 5, 0, 0, 0, 0},
{ 0, 1, 3, 4, 5, 0, 0, 0},
{ 2, 3, 4, 5, 0, 0, 0, 0},
{ 0, 2, 3, 4, 5, 0, 0, 0},
{ 1, 2, 3, 4, 5, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 1, 6, 0, 0, 0, 0, 0, 0},
{ 0, 1, 6, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 1, 2, 6, 0, 0, 0, 0, 0},
{ 0, 1, 2, 6, 0, 0, 0, 0},
{ 3, 6, 0, 0, 0, 0, 0, 0},
{ 0, 3, 6, 0, 0, 0, 0, 0},
{ 1, 3, 6, 0, 0, 0, 0, 0},
{ 0, 1, 3, 6, 0, 0, 0, 0},
{ 2, 3, 6, 0, 0, 0, 0, 0},
{ 0, 2, 3, 6, 0, 0, 0, 0},
{ 1, 2, 3, 6, 0, 0, 0, 0},
{ 0, 1, 2, 3, 6, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 1, 4, 6, 0, 0, 0, 0, 0},
{ 0, 1, 4, 6, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 1, 2, 4, 6, 0, 0, 0, 0},
{ 0, 1, 2, 4, 6, 0, 0, 0},
{ 3, 4, 6, 0, 0, 0, 0, 0},
{ 0, 3, 4, 6, 0, 0, 0, 0},
{ 1, 3, 4, 6, 0, 0, 0, 0},
{ 0, 1, 3, 4, 6, 0, 0, 0},
{ 2, 3, 4, 6, 0, 0, 0, 0},
{ 0, 2, 3, 4, 6, 0, 0, 0},
{ 1, 2, 3, 4, 6, 0, 0, 0},
{ 0, 1, 2, 3, 4, 6, 0, 0},
{ 5, 6, 0, 0, 0, 0, 0, 0},
{ 0, 5, 6, 0, 0, 0, 0, 0},
{ 1, 5, 6, 0, 0, 0, 0, 0},
{ 0, 1, 5, 6, 0, 0, 0, 0},
{ 2, 5, 6, 0, 0, 0, 0, 0},
{ 0, 2, 5, 6, 0, 0, 0, 0},
{ 1, 2, 5, 6, 0, 0, 0, 0},
{ 0, 1, 2, 5, 6, 0, 0, 0},
{ 3, 5, 6, 0, 0, 0, 0, 0},
{ 0, 3, 5, 6, 0, 0, 0, 0},
{ 1, 3, 5, 6, 0, 0, 0, 0},
{ 0, 1, 3, 5, 6, 0, 0, 0},
{ 2, 3, 5, 6, 0, 0, 0, 0},
{ 0, 2, 3, 5, 6, 0, 0, 0},
{ 1, 2, 3, 5, 6, 0, 0, 0},
{ 0, 1, 2, 3, 5, 6, 0, 0},
{ 4, 5, 6, 0, 0, 0, 0, 0},
{ 0, 4, 5, 6, 0, 0, 0, 0},
{ 1, 4, 5, 6, 0, 0, 0, 0},
{ 0, 1, 4, 5, 6, 0, 0, 0},
{ 2, 4, 5, 6, 0, 0, 0, 0},
{ 0, 2, 4, 5, 6, 0, 0, 0},
{ 1, 2, 4, 5, 6, 0, 0, 0},
{ 0, 1, 2, 4, 5, 6, 0, 0},
{ 3, 4, 5, 6, 0, 0, 0, 0},
{ 0, 3, 4, 5, 6, 0, 0, 0},
{ 1, 3, 4, 5, 6, 0, 0, 0},
{ 0, 1, 3, 4, 5, 6, 0, 0},
{ 2, 3, 4, 5, 6, 0, 0, 0},
{ 0, 2, 3, 4, 5, 6, 0, 0},
{ 1, 2, 3, 4, 5, 6, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 0},
{ 7, 0, 0, 0, 0, 0, 0, 0},
{ 0, 7, 0, 0, 0, 0, 0, 0},
{ 1, 7, 0, 0, 0, 0, 0, 0},
{ 0, 1, 7, 0, 0, 0, 0, 0},
{ 2, 7, 0, 0, 0, 0, 0, 0},
{ 0, 2, 7, 0, 0, 0, 0, 0},
{ 1, 2, 7, 0, 0, 0, 0, 0},
{ 0, 1, 2, 7, 0, 0, 0, 0},
{ 3, 7, 0, 0, 0, 0, 0, 0},
{ 0, 3, 7, 0, 0, 0, 0, 0},
{ 1, 3, 7, 0, 0, 0, 0, 0},
{ 0, 1, 3, 7, 0, 0, 0, 0},
{ 2, 3, 7, 0, 0, 0, 0, 0},
{ 0, 2, 3, 7, 0, 0, 0, 0},
{ 1, 2, 3, 7, 0, 0, 0, 0},
{ 0, 1, 2, 3, 7, 0, 0, 0},
{ 4, 7, 0, 0, 0, 0, 0, 0},
{ 0, 4, 7, 0, 0, 0, 0, 0},
{ 1, 4, 7, 0, 0, 0, 0, 0},
{ 0, 1, 4, 7, 0, 0, 0, 0},
{ 2, 4, 7, 0, 0, 0, 0, 0},
{ 0, 2, 4, 7, 0, 0, 0, 0},
{ 1, 2, 4, 7, 0, 0, 0, 0},
{ 0, 1, 2, 4, 7, 0, 0, 0},
{ 3, 4, 7, 0, 0, 0, 0, 0},
{ 0, 3, 4, 7, 0, 0, 0, 0},
{ 1, 3, 4, 7, 0, 0, 0, 0},
{ 0, 1, 3, 4, 7, 0, 0, 0},
{ 2, 3, 4, 7, 0, 0, 0, 0},
{ 0, 2, 3, 4, 7, 0, 0, 0},
{ 1, 2, 3, 4, 7, 0, 0, 0},
{ 0, 1, 2, 3, 4, 7, 0, 0},
{ 5, 7, 0, 0, 0, 0, 0, 0},
{ 0, 5, 7, 0, 0, 0, 0, 0},
{ 1, 5, 7, 0, 0, 0, 0, 0},
{ 0, 1, 5, 7, 0, 0, 0, 0},
{ 2, 5, 7, 0, 0, 0, 0, 0},
{ 0, 2, 5, 7, 0, 0, 0, 0},
{ 1, 2, 5, 7, 0, 0, 0, 0},
{ 0, 1, 2, 5, 7, 0, 0, 0},
{ 3, 5, 7, 0, 0, 0, 0, 0},
{ 0, 3, 5, 7, 0, 0, 0, 0},
{ 1, 3, 5, 7, 0, 0, 0, 0},
{ 0, 1, 3, 5, 7, 0, 0, 0},
{ 2, 3, 5, 7, 0, 0, 0, 0},
{ 0, 2, 3, 5, 7, 0, 0, 0},
{ 1, 2, 3, 5, 7, 0, 0, 0},
{ 0, 1, 2, 3, 5, 7, 0, 0},
{ 4, 5, 7, 0, 0, 0, 0, 0},
{ 0, 4, 5, 7, 0, 0, 0, 0},
{ 1, 4, 5, 7, 0, 0, 0, 0},
{ 0, 1, 4, 5, 7, 0, 0, 0},
{ 2, 4, 5, 7, 0, 0, 0, 0},
{ 0, 2, 4, 5, 7, 0, 0, 0},
{ 1, 2, 4, 5, 7, 0, 0, 0},
{ 0, 1, 2, 4, 5, 7, 0, 0},
{ 3, 4, 5, 7, 0, 0, 0, 0},
{ 0, 3, 4, 5, 7, 0, 0, 0},
{ 1, 3, 4, 5, 7, 0, 0, 0},
{ 0, 1, 3, 4, 5, 7, 0, 0},
{ 2, 3, 4, 5, 7, 0, 0, 0},
{ 0, 2, 3, 4, 5, 7, 0, 0},
{ 1, 2, 3, 4, 5, 7, 0, 0},
{ 0, 1, 2, 3, 4, 5, 7, 0},
{ 6, 7, 0, 0, 0, 0, 0, 0},
{ 0, 6, 7, 0, 0, 0, 0, 0},
{ 1, 6, 7, 0, 0, 0, 0, 0},
{ 0, 1, 6, 7, 0, 0, 0, 0},
{ 2, 6, 7, 0, 0, 0, 0, 0},
{ 0, 2, 6, 7, 0, 0, 0, 0},
{ 1, 2, 6, 7, 0, 0, 0, 0},
{ 0, 1, 2, 6, 7, 0, 0, 0},
{ 3, 6, 7, 0, 0, 0, 0, 0},
{ 0, 3, 6, 7, 0, 0, 0, 0},
{ 1, 3, 6, 7, 0, 0, 0, 0},
{ 0, 1, 3, 6, 7, 0, 0, 0},
{ 2, 3, 6, 7, 0, 0, 0, 0},
{ 0, 2, 3, 6, 7, 0, 0, 0},
{ 1, 2, 3, 6, 7, 0, 0, 0},
{ 0, 1, 2, 3, 6, 7, 0, 0},
{ 4, 6, 7, 0, 0, 0, 0, 0},
{ 0, 4, 6, 7, 0, 0, 0, 0},
{ 1, 4, 6, 7, 0, 0, 0, 0},
{ 0, 1, 4, 6, 7, 0, 0, 0},
{ 2, 4, 6, 7, 0, 0, 0, 0},
{ 0, 2, 4, 6, 7, 0, 0, 0},
{ 1, 2, 4, 6, 7, 0, 0, 0},
{ 0, 1, 2, 4, 6, 7, 0, 0},
{ 3, 4, 6, 7, 0, 0, 0, 0},
{ 0, 3, 4, 6, 7, 0, 0, 0},
{ 1, 3, 4, 6, 7, 0, 0, 0},
{ 0, 1, 3, 4, 6, 7, 0, 0},
{ 2, 3, 4, 6, 7, 0, 0, 0},
{ 0, 2, 3, 4, 6, 7, 0, 0},
{ 1, 2, 3, 4, 6, 7, 0, 0},
{ 0, 1, 2, 3, 4, 6, 7, 0},
{ 5, 6, 7, 0, 0, 0, 0, 0},
{ 0, 5, 6, 7, 0, 0, 0, 0},
{ 1, 5, 6, 7, 0, 0, 0, 0},
{ 0, 1, 5, 6, 7, 0, 0, 0},
{ 2, 5, 6, 7, 0, 0, 0, 0},
{ 0, 2, 5, 6, 7, 0, 0, 0},
{ 1, 2, 5, 6, 7, 0, 0, 0},
{ 0, 1, 2, 5, 6, 7, 0, 0},
{ 3, 5, 6, 7, 0, 0, 0, 0},
{ 0, 3, 5, 6, 7, 0, 0, 0},
{ 1, 3, 5, 6, 7, 0, 0, 0},
{ 0, 1, 3, 5, 6, 7, 0, 0},
{ 2, 3, 5, 6, 7, 0, 0, 0},
{ 0, 2, 3, 5, 6, 7, 0, 0},
{ 1, 2, 3, 5, 6, 7, 0, 0},
{ 0, 1, 2, 3, 5, 6, 7, 0},
{ 4, 5, 6, 7, 0, 0, 0, 0},
{ 0, 4, 5, 6, 7, 0, 0, 0},
{ 1, 4, 5, 6, 7, 0, 0, 0},
{ 0, 1, 4, 5, 6, 7, 0, 0},
{ 2, 4, 5, 6, 7, 0, 0, 0},
{ 0, 2, 4, 5, 6, 7, 0, 0},
{ 1, 2, 4, 5, 6, 7, 0, 0},
{ 0, 1, 2, 4, 5, 6, 7, 0},
{ 3, 4, 5, 6, 7, 0, 0, 0},
{ 0, 3, 4, 5, 6, 7, 0, 0},
{ 1, 3, 4, 5, 6, 7, 0, 0},
{ 0, 1, 3, 4, 5, 6, 7, 0},
{ 2, 3, 4, 5, 6, 7, 0, 0},
{ 0, 2, 3, 4, 5, 6, 7, 0},
{ 1, 2, 3, 4, 5, 6, 7, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7}
};

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx
*
* Description: Rejection-samples coefficients from a byte stream. Every
*              3 input bytes form a little-endian value that is masked to
*              23 bits; values smaller than Q are written to r in input
*              order, all others are dropped.
*
* Arguments:   - int32_t *r: output array; at most N coefficients are written
*              - const uint8_t buf[]: REJ_UNIFORM_BUFLEN input bytes followed
*                by 8 readable padding bytes (the last 32-byte vector load
*                starts at REJ_UNIFORM_BUFLEN - 24)
*
* Returns the number of coefficients written (at most N).
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) {
    unsigned int ctr, pos;
    uint32_t good;
    uint32_t t;
    __m256i d, tmp;
    const __m256i bound = _mm256_set1_epi32(Q);
    const __m256i mask  = _mm256_set1_epi32(0x7FFFFF);
    /* Per 128-bit lane: spread four 3-byte groups into four 32-bit slots
     * (index -1 zeroes the top byte of every slot). */
    const __m256i idx8  = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10,
                                          -1,  9,  8,  7, -1,  6,  5,  4,
                                          -1, 11, 10,  9, -1,  8,  7,  6,
                                          -1,  5,  4,  3, -1,  2,  1,  0);

    ctr = pos = 0;
    /* Vector path: 24 input bytes -> 8 candidates per iteration. */
    while (pos <= REJ_UNIFORM_BUFLEN - 24) {
        d = _mm256_loadu_si256((const __m256i *)&buf[pos]);
        /* 0x94 selects quadwords {0,1,1,2}: bytes 0..15 go to the low lane,
         * bytes 8..23 to the high lane. */
        d = _mm256_permute4x64_epi64(d, 0x94);
        d = _mm256_shuffle_epi8(d, idx8);
        d = _mm256_and_si256(d, mask);
        pos += 24;

        /* d - Q is negative exactly for the accepted lanes; collect the
         * sign bits. Use the cast intrinsic instead of a compiler-specific
         * vector cast so the code builds on every AVX2 toolchain. */
        tmp = _mm256_sub_epi32(d, bound);
        good = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(tmp));

        /* Move the accepted lanes to the front. */
        tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]));
        d = _mm256_permutevar8x32_epi32(d, tmp);

        /* Always stores 8 lanes; only the first popcount(good) are kept. */
        _mm256_storeu_si256((__m256i *)&r[ctr], d);
        ctr += _mm_popcnt_u32(good);

        /* Stop before a further 8-lane store could run past r[N - 1]. */
        if (ctr > N - 8) {
            break;
        }
    }

    /* Scalar tail: one candidate at a time until r is full or buf is used up. */
    while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) {
        t  = buf[pos++];
        t |= (uint32_t)buf[pos++] << 8;
        t |= (uint32_t)buf[pos++] << 16;
        t &= 0x7FFFFF;

        if (t < Q) {
            r[ctr++] = (int32_t)t;
        }
    }

    return ctr;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx
*
* Description: Rejection-samples small coefficients from a byte stream.
* Each input byte supplies two 4-bit candidates (low nibble
* first). Nibbles equal to 15 are rejected; every accepted
* nibble t is stored, in input order, as 2 - (t mod 5) by the
* scalar tail and as ETA - t with the nearest multiple of 5
* removed by the vector path.
* NOTE(review): the two paths agree only if ETA == 2 - confirm
* against params.h.
*
* Arguments: - int32_t *r: output array; at most N coefficients are written
* - const uint8_t buf[]: REJ_UNIFORM_ETA_BUFLEN input bytes
*
* Returns the number of coefficients written (at most N).
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) {
unsigned int ctr, pos;
uint32_t good;
__m256i f0, f1, f2;
__m128i g0, g1;
const __m256i mask = _mm256_set1_epi8(15);
const __m256i eta = _mm256_set1_epi8(ETA);
const __m256i bound = mask;
/* mulhrs by v is a rounded multiply by -6560/32768 (about -1/5); scaling the
 * result by p = 5 and adding it back removes the nearest multiple of 5 */
const __m256i v = _mm256_set1_epi32(-6560);
const __m256i p = _mm256_set1_epi32(5);

ctr = pos = 0;
/* Vector path: one 16-byte load yields 32 nibbles, handled as four groups of
 * 8 nibbles (4 input bytes each) */
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
/* Split every byte into two nibbles: byte 2k holds the low nibble and byte
 * 2k+1 the high nibble of input byte k */
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos]));
f1 = _mm256_slli_epi16(f0, 4);
f0 = _mm256_or_si256(f0, f1);
f0 = _mm256_and_si256(f0, mask);

/* nibble - 15 is negative exactly when nibble < 15, so the byte sign bits
 * collected in good mark the accepted nibbles; f0 becomes ETA - nibble */
f1 = _mm256_sub_epi8(f0, bound);
f0 = _mm256_sub_epi8(eta, f0);
good = _mm256_movemask_epi8(f1);

/* Group 1: nibbles 0..7 (low 8 bytes of the low lane). Compact the accepted
 * bytes to the front, widen to 32 bit, reduce and store 8 lanes; only the
 * first popcount(good & 0xFF) lanes are kept */
g0 = _mm256_castsi256_si128(f0);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

/* Stop before another 8-lane store could run past r[N - 1] */
if (ctr > N - 8) {
break;
}
/* Group 2: nibbles 8..15 (high 8 bytes of the low lane) */
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

if (ctr > N - 8) {
break;
}
/* Group 3: nibbles 16..23 (low 8 bytes of the high lane) */
g0 = _mm256_extracti128_si256(f0, 1);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

if (ctr > N - 8) {
break;
}
/* Group 4: nibbles 24..31; after three shifts good holds only 8 bits, so no
 * further masking is needed */
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good);
pos += 4;
}

/* Scalar tail: one byte (two nibbles) at a time until r is full or buf is
 * used up */
uint32_t t0, t1;
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) {
t0 = buf[pos] & 0x0F;
t1 = buf[pos++] >> 4;

if (t0 < 15) {
/* (205 * t) >> 10 equals t / 5 for t < 15, so this is t mod 5 */
t0 = t0 - (205 * t0 >> 10) * 5;
r[ctr++] = 2 - t0;
}
if (t1 < 15 && ctr < N) {
t1 = t1 - (205 * t1 >> 10) * 5;
r[ctr++] = 2 - t1;
}
}

return ctr;
}

+ 19
- 0
crypto_sign/dilithium/dilithium2/avx2/rejsample.h 查看文件

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H
#define PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* Input buffer for rej_uniform_avx: at least 768 bytes, rounded up to a
 * whole number of STREAM128 blocks. */
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)

/* Input buffer for rej_eta_avx: at least 137 bytes, rounded up to a whole
 * number of STREAM128 blocks. */
#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)

/* Row m lists the positions of the bits set in the 8-bit mask m, zero-padded. */
extern const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8];

/* Returns the number of coefficients written to r. buf must provide 8
 * readable bytes beyond REJ_UNIFORM_BUFLEN for the vector loads. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);

/* Returns the number of coefficients written to r. The bound matches the
 * definition, which reads at most REJ_UNIFORM_ETA_BUFLEN bytes of buf. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]);

#endif

+ 157
- 0
crypto_sign/dilithium/dilithium2/avx2/rounding.c 查看文件

@@ -0,0 +1,157 @@
#include "consts.h"
#include "params.h"
#include "rejsample.h"
#include "rounding.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/*
 * Per-32-bit-lane select built on the float blend: for each lane the result
 * is b when the sign bit of the matching mask lane is set, otherwise a. The
 * casts only reinterpret the bit pattern; no value conversion takes place.
 * NOTE(review): this is a file-local helper despite the intrinsic-style
 * `_mm256_` prefix, which sits in the implementation-reserved namespace.
 */
#define _mm256_blendv_epi32(a,b,mask) \
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
_mm256_castsi256_ps(b), \
_mm256_castsi256_ps(mask)))

/*************************************************
* Name:        power2round
*
* Description: Splits every coefficient a into a high part a1 and a low part
*              a0 with a mod^+ Q = a1*2^D + a0 and -2^{D-1} < a0 <= 2^{D-1}.
*              The input must be the positive standard representative.
*
* Arguments:   - __m256i *a1: output array of length N/8 receiving the high bits
*              - __m256i *a0: output array of length N/8 receiving the low bits
*              - const __m256i *a: input array of length N/8
*
* All three arrays are accessed with aligned loads/stores. Each vector of a
* is loaded before the matching vectors of a1 and a0 are written.
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
    /* Adding 2^{D-1} - 1 before truncating rounds a / 2^D to nearest. */
    const __m256i round_off = _mm256_set1_epi32((1 << (D - 1)) - 1);
    /* Keeps everything above the low D bits, i.e. yields a1 * 2^D. */
    const __m256i high_mask = _mm256_set1_epi32(-(1 << D));
    unsigned int k = 0;

    while (k < N / 8) {
        const __m256i x      = _mm256_load_si256(&a[k]);
        const __m256i biased = _mm256_add_epi32(x, round_off);
        const __m256i hi     = _mm256_srli_epi32(biased, D);
        const __m256i lo     = _mm256_sub_epi32(x, _mm256_and_si256(biased, high_mask));

        _mm256_store_si256(&a1[k], hi);
        _mm256_store_si256(&a0[k], lo);
        ++k;
    }
}

/*************************************************
* Name: decompose
*
* Description: For finite field element a, compute high and low parts a0, a1 such
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard
* representative. Here ALPHA is 2*GAMMA2.
*
* Arguments: - __m256i *a1: output array of length N/8 with high parts
* - __m256i *a0: output array of length N/8 with low parts a0
* - const __m256i *a: input array of length N/8
*
* All arrays are accessed with aligned loads/stores; each vector of a is
* loaded before the matching vectors of a1 and a0 are written.
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
unsigned int i;
__m256i f, f0, f1, t;
/* q is read from the shared constant table; hq is q >> 1 */
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2_AVX2_qdata.vec[_8XQ / 8]);
const __m256i hq = _mm256_srli_epi32(q, 1);
/* Constants of the multiply-and-shift division below.
 * NOTE(review): presumably tuned so that f1 ends up as a / (2*GAMMA2)
 * rounded to nearest for this parameter set - confirm against params.h */
const __m256i v = _mm256_set1_epi32(11275);
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2);
const __m256i off = _mm256_set1_epi32(127);
const __m256i shift = _mm256_set1_epi32(128);
/* Largest high part that is kept; anything above is mapped to 0 */
const __m256i max = _mm256_set1_epi32(43);
const __m256i zero = _mm256_setzero_si256();

for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a[i]);
/* f1 = (a + 127) >> 7, then a 16-bit high multiply by 11275 (>> 16) and a
 * rounded multiply by 128 (a rounding >> 8) */
f1 = _mm256_add_epi32(f, off);
f1 = _mm256_srli_epi32(f1, 7);
f1 = _mm256_mulhi_epu16(f1, v);
f1 = _mm256_mulhrs_epi16(f1, shift);
/* 43 - f1 is negative only when f1 > 43; those lanes get a1 = 0 */
t = _mm256_sub_epi32(max, f1);
f1 = _mm256_blendv_epi32(f1, zero, t);
/* a0 = a - a1 * ALPHA */
f0 = _mm256_mullo_epi32(f1, alpha);
f0 = _mm256_sub_epi32(f, f0);
/* Lanes with a0 > q/2 (the wrapped a1 = 0 case) get q subtracted */
f = _mm256_cmpgt_epi32(f0, hq);
f = _mm256_and_si256(f, q);
f0 = _mm256_sub_epi32(f0, f);
_mm256_store_si256(&a1[i], f1);
_mm256_store_si256(&a0[i], f0);
}
}

/*************************************************
* Name:        make_hint
*
* Description: Compute indices of polynomial coefficients whose low bits
*              overflow into the high bits. A coefficient is flagged when
*              |a0| > GAMMA2, or when a0 == -GAMMA2 and a1 is positive.
*
* Arguments:   - uint8_t *hint: output array of N bytes; the first n entries
*                receive the flagged coefficient indices in ascending order.
*                Every iteration writes 8 bytes, so entries past the
*                returned count hold filler and must be ignored.
*              - const __m256i *a0: low bits of input elements (N/8 vectors)
*              - const __m256i *a1: high bits of input elements (N/8 vectors)
*
* Returns number of overflowing low bits
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) {
    unsigned int i, n = 0;
    __m256i f0, f1, g0, g1;
    uint32_t bad;
    uint64_t idx;
    const __m256i low  = _mm256_set1_epi32(-GAMMA2);
    const __m256i high = _mm256_set1_epi32(GAMMA2);

    for (i = 0; i < N / 8; ++i) {
        f0 = _mm256_load_si256(&a0[i]);
        f1 = _mm256_load_si256(&a1[i]);

        /* g0: all-ones where |a0| > GAMMA2 */
        g0 = _mm256_abs_epi32(f0);
        g0 = _mm256_cmpgt_epi32(g0, high);

        /* g1: all-ones where a0 == -GAMMA2; the sign operation keeps it for
         * a1 > 0, zeroes it for a1 == 0 and negates it for a1 < 0 */
        g1 = _mm256_cmpeq_epi32(f0, low);
        g1 = _mm256_sign_epi32(g1, f1);
        g0 = _mm256_or_si256(g0, g1);

        /* One bit per lane. Use the cast intrinsic instead of a
         * compiler-specific vector cast so the code builds on every AVX2
         * toolchain. */
        bad = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(g0));

        /* Turn lane numbers 0..7 into coefficient indices by adding 8*i to
         * every byte; 7 + 8*i never exceeds 255, so no byte carries. */
        memcpy(&idx, PQCLEAN_DILITHIUM2_AVX2_idxlut[bad], 8);
        idx += (uint64_t)0x0808080808080808 * i;
        memcpy(&hint[n], &idx, 8);
        n += _mm_popcnt_u32(bad);
    }

    return n;
}

/*************************************************
* Name: use_hint
*
* Description: Correct high parts according to hint. The input is decomposed
* first; where the hint is set the high part is moved up by one
* when the low part is non-negative and down by one when it is
* negative, wrapping around between 0 and 43.
*
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts
* - const __m256i *a: input array of length N/8
* - const __m256i *hint: input array of length N/8 with hint bits
*
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) {
unsigned int i;
__m256i a0[N / 8];
__m256i f, g, h, t;
const __m256i zero = _mm256_setzero_si256();
const __m256i max = _mm256_set1_epi32(43);

/* b receives the high parts, the local a0 the low parts */
PQCLEAN_DILITHIUM2_AVX2_decompose_avx(b, a0, a);
for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a0[i]);
g = _mm256_load_si256(&b[i]);
h = _mm256_load_si256(&hint[i]);
/* t = hint where the low part is negative, else 0; h - 2t then turns the
 * hint into -hint on exactly those lanes */
t = _mm256_blendv_epi32(zero, h, f);
t = _mm256_slli_epi32(t, 1);
h = _mm256_sub_epi32(h, t);
g = _mm256_add_epi32(g, h);
/* Wrap around: a negative result becomes 43, a result above 43 becomes 0 */
g = _mm256_blendv_epi32(g, max, g);
f = _mm256_cmpgt_epi32(g, max);
g = _mm256_blendv_epi32(g, zero, f);
_mm256_store_si256(&b[i], g);
}
}

+ 12
- 0
crypto_sign/dilithium/dilithium2/avx2/rounding.h 查看文件

@@ -0,0 +1,12 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H
#define PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H
#include "params.h"
#include <immintrin.h>
#include <stdint.h>

/* Vectorized rounding helpers. Every __m256i array argument holds N/8
 * vectors. */

/* Split a into high part a1 and low part a0 with respect to 2^D. */
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a);
/* Split a into high part a1 and low part a0 with respect to 2*GAMMA2. */
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a);
/* Write the indices of the flagged coefficients to hint; returns their count. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1);
/* Decompose a and correct the high parts according to hint; result in b. */
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint);

#endif

+ 54
- 0
crypto_sign/dilithium/dilithium2/avx2/shuffle.S 查看文件

@@ -0,0 +1,54 @@
#include "cdecl.h"
.include "shuffle.inc"

.text
#file-local helper: reorders one 256-byte chunk at (%rdi) in place.
#The chunk is read as eight 32-byte vectors, passed through three
#interleaving rounds (128-bit, 64-bit, then 32-bit granularity) and
#written back to the same location. Uses ymm3..ymm11 as scratch and
#leaves %rdi unchanged. All accesses are aligned moves (vmovdqa).
nttunpack128_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

#round 1: pair vector k with vector k+4 on 128-bit halves
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

#round 2: interleave 64-bit quadwords
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

#round 3: interleave 32-bit doublewords
shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

#store
vmovdqa %ymm9,(%rdi)
vmovdqa %ymm8,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm3,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

#exported entry point: runs nttunpack128_avx on four consecutive 256-byte
#chunks starting at the address in %rdi (1024 bytes in total, reordered in
#place). On return %rdi points at the last chunk.
#The symbol is exported under both the cdecl(...) and _cdecl(...) spellings.
#NOTE(review): both macros come from cdecl.h, presumably to cover platforms
#with and without a leading underscore on C symbols; confirm there.
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx):
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
ret

+ 25
- 0
crypto_sign/dilithium/dilithium2/avx2/shuffle.inc 查看文件

@@ -0,0 +1,25 @@
# shuffle8: regroup 128-bit halves of ymm\r0 and ymm\r1.
#   ymm\r2 = { low half of r0,  low half of r1 }
#   ymm\r3 = { high half of r0, high half of r1 }
# r2 must differ from r0 and r1 (it is written before r3 is computed).
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle4: interleave 64-bit quadwords of ymm\r0 and ymm\r1 within each
# 128-bit lane.
#   ymm\r2 = per lane { low quadword of r0,  low quadword of r1 }
#   ymm\r3 = per lane { high quadword of r0, high quadword of r1 }
# r2 must differ from r0 and r1 (it is written before r3 is computed).
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle2: interleave 32-bit doublewords of ymm\r0 and ymm\r1.
#   ymm\r2 = { r0[0], r1[0], r0[2], r1[2], ... }  (even doublewords)
#   ymm\r3 = { r0[1], r1[1], r0[3], r1[3], ... }  (odd doublewords)
# Clobbers ymm\r0 (it is shifted in place). The two commented-out
# instructions are alternative encodings that were left in the file.
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle1: interleave 16-bit words of ymm\r0 and ymm\r1.
#   ymm\r2 = { r0[0], r1[0], r0[2], r1[2], ... }  (even words)
#   ymm\r3 = { r0[1], r1[1], r0[3], r1[3], ... }  (odd words)
# Clobbers ymm\r0 (it is shifted in place).
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

+ 415
- 0
crypto_sign/dilithium/dilithium2/avx2/sign.c 查看文件

@@ -0,0 +1,415 @@
#include "align.h"
#include "fips202.h"
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"
#include <stdint.h>
#include <string.h>

/*
 * Expands matrix row i (0..3) from rho into the two-slot scratch buffer and
 * points *row at the slot that holds it. Even rows land in buf[0], odd rows
 * in buf[1]; the other slot is handed to the row expander as its second
 * output. For any other i nothing is called and *row is left untouched.
 */
static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) {
    polyvecl *const slot0 = &buf[0];
    polyvecl *const slot1 = &buf[1];

    if (i == 0) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(slot0, slot1, rho);
        *row = slot0;
    } else if (i == 1) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(slot1, slot0, rho);
        *row = slot1;
    } else if (i == 2) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(slot0, slot1, rho);
        *row = slot0;
    } else if (i == 3) {
        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(slot1, slot0, rho);
        *row = slot1;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair
*
* Description: Generates public and private key.
* Public key layout: rho | packed t1.
* Secret key layout: rho | key | CRH(pk) | packed s1 | packed s2 |
* packed t0.
*
* Arguments: - uint8_t *pk: pointer to output public key (allocated
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (allocated
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
*
* NOTE(review): s1 and s2 are each sampled with a single 4-way call and the
* row expander only handles rows 0..3, so this code relies on K == L == 4.
* NOTE(review): seedbuf, s1, s2 and t0 hold secret material and are not wiped
* before returning.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
unsigned int i;
uint8_t seedbuf[3 * SEEDBYTES];
const uint8_t *rho, *rhoprime, *key;
polyvecl rowbuf[2];
polyvecl s1, *row = rowbuf;
polyveck s2;
poly t1, t0;

/* Get randomness for rho, rhoprime and key */
/* SEEDBYTES random bytes are expanded in place to 3 * SEEDBYTES */
randombytes(seedbuf, SEEDBYTES);
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
rho = seedbuf;
rhoprime = seedbuf + SEEDBYTES;
key = seedbuf + 2 * SEEDBYTES;

/* Store rho, key */
memcpy(pk, rho, SEEDBYTES);
memcpy(sk, rho, SEEDBYTES);
memcpy(sk + SEEDBYTES, key, SEEDBYTES);

/* Sample short vectors s1 and s2 */
/* Nonces 0..3 are used for s1 and 4..7 for s2 */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3);
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime, 4, 5, 6, 7);

/* Pack secret vectors */
/* The CRHBYTES gap after rho and key is filled by crh() at the end */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]);
}
for (i = 0; i < K; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]);
}

/* Transform s1 */
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1);


/* Compute t = A*s1 + s2 one row at a time, so only two matrix rows are
 * held in memory at once */
for (i = 0; i < K; i++) {
/* Expand matrix row */
polyvec_matrix_expand_row(&row, rowbuf, rho, i);

/* Compute inner-product */
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&t1);

/* Add error polynomial */
PQCLEAN_DILITHIUM2_AVX2_poly_add(&t1, &t1, &s2.vec[i]);

/* Round t and pack t1, t0 */
/* t1 is both input and high-part output of power2round */
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&t1);
PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&t1, &t0, &t1);
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1);
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0);
}

/* Compute CRH(rho, t1) and store in secret key */
crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);

return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature
*
* Description: Computes signature.
* Signature layout: challenge seed (SEEDBYTES) | packed z
* (L * POLYZ_PACKEDBYTES) | hint section (OMEGA index bytes
* followed by K cumulative counts).
*
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES)
* - size_t *siglen: pointer to output length of signature
* - const uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
*
* NOTE(review): y is sampled with a single 4-way call, which relies on L == 4.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
unsigned int i, n, pos;
/* seedbuf layout: rho | tr | key | mu | rhoprime */
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint8_t hintbuf[N];
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
uint64_t nonce = 0;
polyvecl mat[K], s1, z;
polyveck t0, s2, w1;
poly c, tmp;
/* y is only needed until the matrix-vector product is done; its storage is
 * then reused for w0 */
union {
polyvecl y;
polyveck w0;
} tmpv;
shake256incctx state;

rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);

/* Compute CRH(tr, msg) */
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* rhoprime = CRH(key | mu); key and mu are adjacent in seedbuf */
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(mat, rho);
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t0);


/* Rejection loop: every failed check below jumps back here and retries with
 * the next four nonces */
rej:
/* Sample intermediate vector y */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3],
rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3);
nonce += 4;

/* Matrix-vector product */
/* z keeps y in normal domain; the copy in tmpv.y is transformed */
tmpv.y = z;
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&tmpv.y);
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y);
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(&w1);

/* Decompose w and call the random oracle */
/* sig is used as scratch space for the packed w1 before the challenge seed
 * overwrites its first SEEDBYTES bytes */
PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(sig, &w1);

shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(sig, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c);

/* Compute z, reject if it reveals secret */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&z.vec[i]);
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) {
goto rej;
}
}

/* Zero hint vector in signature */
pos = 0;
memset(hint, 0, OMEGA);

for (i = 0; i < K; i++) {
/* Check that subtracting cs2 does not change high bits of w and low bits
 * do not reveal secret information */
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmpv.w0.vec[i]);
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) {
goto rej;
}

/* Compute hints */
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmp);
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmp, GAMMA2)) {
goto rej;
}

PQCLEAN_DILITHIUM2_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
n = PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]);
/* At most OMEGA hint indices fit into the signature */
if (pos + n > OMEGA) {
goto rej;
}

/* Store hints in signature */
/* hint[OMEGA + i] records the running total after polynomial i */
memcpy(&hint[pos], hintbuf, n);
hint[OMEGA + i] = pos = pos + n;
}

/* Pack z into signature */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]);
}

*siglen = PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;
return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_crypto_sign
*
* Description: Compute signed message: the signature followed by a copy of
*              the message.
*
* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
*                array with PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen
*                bytes), can be equal to m
*              - size_t *smlen: pointer to output length of signed message
*              - const uint8_t *m: pointer to message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
    uint8_t *const msg = sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;
    size_t left;

    /* Copy the message behind the signature slot, last byte first, so the
     * copy stays correct when m == sm. */
    for (left = mlen; left > 0; --left) {
        msg[left - 1] = m[left - 1];
    }

    /* Sign the relocated copy; the signature fills the front of sm. */
    PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, msg, mlen, sk);
    *smlen += mlen;
    return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify
*
* Description: Verifies signature.
*
* Arguments: - const uint8_t *sig: pointer to input signature
* - size_t siglen: length of signature
* - const uint8_t *m: pointer to message
* - size_t mlen: length of message
* - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signature could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) {
unsigned int i, j, pos = 0;
/* PQCLEAN_DILITHIUM2_AVX2_polyw1_pack writes additional 14 bytes */
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf;
uint8_t mu[CRHBYTES];
/* Hint section: OMEGA index bytes followed by K cumulative counts */
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
polyvecl rowbuf[2];
polyvecl *row = rowbuf;
polyvecl z;
poly c, w1, h;
shake256incctx state;

if (siglen != PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) {
return -1;
}

/* Compute CRH(CRH(rho, t1), msg) */
crh(mu, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* Expand challenge polynomial from the seed at the start of the signature */
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c);

/* Unpack z; shortness follows from unpacking */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&z.vec[i]);
}


for (i = 0; i < K; i++) {
/* Expand matrix row */
/* rho is the first SEEDBYTES bytes of pk */
polyvec_matrix_expand_row(&row, rowbuf, pk, i);

/* Compute i-th row of Az - c2^Dt1 */
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z);

/* h temporarily holds t1 * 2^D and then c * t1 * 2^D */
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES);
PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&h);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&h);
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&h, &c, &h);

PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w1, &w1, &h);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w1);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&w1);

/* Get hint polynomial and reconstruct w1 */
memset(h.vec, 0, sizeof(poly));
/* Cumulative counts must be non-decreasing and at most OMEGA */
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) {
return -1;
}

for (j = pos; j < hint[OMEGA + i]; ++j) {
/* Coefficients are ordered for strong unforgeability */
if (j > pos && hint[j] <= hint[j - 1]) {
return -1;
}
h.coeffs[hint[j]] = 1;
}
pos = hint[OMEGA + i];

PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&w1);
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w1, &w1, &h);
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1);
}

/* Extra indices are zero for strong unforgeability */
for (j = pos; j < OMEGA; ++j) {
if (hint[j]) {
return -1;
}
}

/* Call random oracle and verify challenge */
/* The recomputed seed overwrites the start of buf and is compared with the
 * seed carried in the signature */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
for (i = 0; i < SEEDBYTES; ++i) {
if (buf.coeffs[i] != sig[i]) {
return -1;
}
}

return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open
*
* Description: Verify signed message and, on success, extract the message.
*
* Arguments:   - uint8_t *m: pointer to output message (allocated
*                array with smlen bytes), can be equal to sm
*              - size_t *mlen: pointer to output length of message
*              - const uint8_t *sm: pointer to signed message
*              - size_t smlen: length of signed message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise.
* On failure *mlen is set to (size_t)-1 and the first smlen bytes of m are
* zeroed.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) {
    size_t k;
    int verified = 0;

    /* A signed message must at least contain a full signature. */
    if (smlen >= PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) {
        *mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;
        verified = (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk) == 0);
    }

    if (!verified) {
        /* Signature verification failed */
        *mlen = -1;
        for (k = 0; k < smlen; ++k) {
            m[k] = 0;
        }
        return -1;
    }

    /* All good, copy msg, return 0 */
    for (k = 0; k < *mlen; ++k) {
        m[k] = sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + k];
    }
    return 0;
}

+ 29
- 0
crypto_sign/dilithium/dilithium2/avx2/sign.h 查看文件

@@ -0,0 +1,29 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_SIGN_H
#define PQCLEAN_DILITHIUM2_AVX2_SIGN_H
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* Fills polynomial c from a SEEDBYTES-byte seed.
 * NOTE(review): the definition is not in this header; presumably this is the
 * challenge sampler used by sign/verify -- confirm in sign.c. */
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Key generation: writes a public key to pk and a secret key to sk.
 * NOTE(review): required buffer sizes are presumably the
 * CRYPTO_PUBLICKEYBYTES / CRYPTO_SECRETKEYBYTES constants -- confirm. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature over the mlen-byte message m using secret key sk;
 * the signature goes to sig and its length to *siglen. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Produces a signed message in sm (length in *smlen) from message m.
 * crypto_sign_open reads sm as signature followed by message. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

/* Verifies detached signature sig over message m under public key pk.
 * Returns 0 when the signature is accepted and -1 otherwise. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

/* Verifies signed message sm; on success copies the message to m, stores its
 * length in *mlen and returns 0. On failure zeroes m[0..smlen), sets *mlen to
 * (size_t) -1 and returns -1. m may be equal to sm. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

#endif

+ 26
- 0
crypto_sign/dilithium/dilithium2/avx2/symmetric-shake.c 查看文件

@@ -0,0 +1,26 @@
#include "fips202.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/*
 * Initialise an incremental SHAKE128 state and absorb seed (SEEDBYTES bytes)
 * followed by the 16-bit nonce encoded as two bytes, low byte first. The
 * absorb phase is finalized before returning, so callers only squeeze.
 */
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake128_inc_finalize(state);
}

/*
 * Initialise an incremental SHAKE256 state and absorb seed (CRHBYTES bytes)
 * followed by the 16-bit nonce encoded as two bytes, low byte first. The
 * absorb phase is finalized before returning, so callers only squeeze.
 */
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake256_inc_finalize(state);
}

+ 36
- 0
crypto_sign/dilithium/dilithium2/avx2/symmetric.h 查看文件

@@ -0,0 +1,36 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
#include "fips202.h"
#include "params.h"
#include <stdint.h>



/* The generic stream128/stream256 state types are the incremental SHAKE
 * contexts from fips202.h. */
typedef shake128incctx stream128_state;
typedef shake256incctx stream256_state;

/* Absorb seed (SEEDBYTES bytes) and the two-byte little-endian nonce into a
 * fresh SHAKE128 state and finalize it. */
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);

/* Absorb seed (CRHBYTES bytes) and the two-byte little-endian nonce into a
 * fresh SHAKE256 state and finalize it. */
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state,
const uint8_t seed[CRHBYTES],
uint16_t nonce);

/* Size in bytes of one output block of each stream. */
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

/* crh: one-shot SHAKE256 producing CRHBYTES bytes of output. */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
/* stream128_*: init from seed and nonce, squeeze OUTBLOCKS whole blocks,
 * release the state. */
#define stream128_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE)
#define stream128_release(STATE) shake128_inc_ctx_release(STATE)
/* stream256_*: same interface backed by SHAKE256. */
#define stream256_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE)
#define stream256_release(STATE) shake256_inc_ctx_release(STATE)


#endif

+ 5
- 0
crypto_sign/dilithium/dilithium2/clean/LICENSE 查看文件

@@ -0,0 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 23
- 0
crypto_sign/dilithium/dilithium2/clean/Makefile.Microsoft_nmake 查看文件

@@ -0,0 +1,23 @@
# This Makefile can be used with Microsoft Visual Studio's nmake using the command:
# nmake /f Makefile.Microsoft_nmake

# Name of the static library produced and the object files that go into it.
LIBRARY=libdilithium2_clean.lib
OBJECTS=ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as
# long as there has been a C standard, and we need it for constant-time
# computations. Thus, we disable that spurious warning.
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146

# Default target: build the static library.
all: $(LIBRARY)

# Make sure objects are recompiled if headers change.
$(OBJECTS): *.h

# Archive all object files into the library; /WX makes LIB warnings fatal.
$(LIBRARY): $(OBJECTS)
LIB.EXE /NOLOGO /WX /OUT:$@ $**

# Remove build outputs; the leading '-' lets nmake continue if DEL fails.
clean:
-DEL $(OBJECTS)
-DEL $(LIBRARY)

+ 31
- 0
crypto_sign/dilithium/dilithium2/clean/api.h 查看文件

@@ -0,0 +1,31 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_API_H
#define PQCLEAN_DILITHIUM2_CLEAN_API_H

#include <stddef.h>
#include <stdint.h>

/* Sizes in bytes of the public key, secret key and signature. These literal
 * values equal the expressions params.h derives for the same names. */
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1312
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2544
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2420
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2"


/* Key generation: writes a public key to pk and a secret key to sk. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature over the mlen-byte message m; the signature goes to sig
 * and its length to *siglen. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Verifies detached signature sig over message m under public key pk.
 * NOTE(review): presumably returns 0 on success and non-zero otherwise, like
 * the AVX2 variant -- confirm in sign.c. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

/* Produces a signed message in sm (length in *smlen) from message m. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Verifies signed message sm and, if valid, recovers the message into m with
 * its length in *mlen. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif

+ 98
- 0
crypto_sign/dilithium/dilithium2/clean/ntt.c 查看文件

@@ -0,0 +1,98 @@
#include "ntt.h"
#include "params.h"
#include "reduce.h"
#include <stdint.h>

/*
 * Precomputed twiddle factors consumed by the forward and inverse NTT in this
 * file. The forward transform reads entries 1..255 in ascending order
 * (zetas[++k] starting from k = 0); the inverse transform reads the same
 * entries in descending order and negates them (-zetas[--k] starting from
 * k = 256). Entry 0 is never read by either transform.
 * NOTE(review): values are presumably powers of ROOT_OF_UNITY in Montgomery
 * form, in bit-reversed order -- confirm against the reference generator.
 */
static const int32_t zetas[N] = {
0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103,
2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549,
-2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005,
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
-3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221,
-1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922,
3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047,
-671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430,
-3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618,
-3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856,
189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330,
1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961,
2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462,
266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378,
900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500,
-655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838,
342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044,
2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974,
-3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970,
-1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642,
-1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031,
-542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993,
-2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385,
-3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107,
-3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078,
-426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893,
-2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687,
-554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782
};

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_ntt
*
* Description: Forward NTT, in-place. No modular reduction is performed after
* additions or subtractions. Output vector is in bitreversed order.
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]) {
unsigned int len, start, j, k;
int32_t zeta, t;

k = 0;
for (len = 128; len > 0; len >>= 1) {
for (start = 0; start < N; start = j + len) {
zeta = zetas[++k];
for (j = start; j < start + len; ++j) {
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]);
a[j + len] = a[j] - t;
a[j] = a[j] + t;
}
}
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont
*
* Description: Inverse NTT and multiplication by Montgomery factor 2^32.
* In-place. No modular reductions after additions or
* subtractions; input coefficients need to be smaller than
* Q in absolute value. Output coefficient are smaller than Q in
* absolute value.
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]) {
unsigned int start, len, j, k;
int32_t t, zeta;
const int32_t f = 41978; // mont^2/256

k = 256;
for (len = 1; len < N; len <<= 1) {
for (start = 0; start < N; start = j + len) {
zeta = -zetas[--k];
for (j = start; j < start + len; ++j) {
t = a[j];
a[j] = t + a[j + len];
a[j + len] = t - a[j + len];
a[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]);
}
}
}

for (j = 0; j < N; ++j) {
a[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)f * a[j]);
}
}

+ 10
- 0
crypto_sign/dilithium/dilithium2/clean/ntt.h 查看文件

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_NTT_H
#define PQCLEAN_DILITHIUM2_CLEAN_NTT_H
#include "params.h"
#include <stdint.h>

/* In-place forward NTT over the N coefficients of a; no modular reduction is
 * applied after the additions and subtractions. */
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]);

/* In-place inverse NTT over the N coefficients of a, followed by a
 * Montgomery multiplication of every coefficient by a fixed scaling
 * constant. */
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]);

#endif

+ 261
- 0
crypto_sign/dilithium/dilithium2/clean/packing.c 查看文件

@@ -0,0 +1,261 @@
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"


/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_pack_pk
*
* Description: Bit-pack public key pk = (rho, t1): SEEDBYTES bytes of rho
*              followed by K packed t1 polynomials of POLYT1_PACKEDBYTES
*              bytes each.
*
* Arguments:   - uint8_t pk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES],
                                      const uint8_t rho[SEEDBYTES],
                                      const polyveck *t1) {
    uint8_t *out = pk;
    unsigned int n;

    for (n = 0; n < SEEDBYTES; ++n) {
        *out++ = rho[n];
    }

    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(out, &t1->vec[n]);
        out += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_unpack_pk
*
* Description: Unpack public key pk = (rho, t1); inverse of the layout
*              written by PQCLEAN_DILITHIUM2_CLEAN_pack_pk.
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - polyveck *t1: pointer to output vector t1
*              - const uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES],
                                        polyveck *t1,
                                        const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) {
    const uint8_t *in = pk;
    unsigned int n;

    for (n = 0; n < SEEDBYTES; ++n) {
        rho[n] = *in++;
    }

    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[n], in);
        in += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_pack_sk
*
* Description: Bit-pack the secret key. The byte layout written is
*              rho | key | tr | s1 | s2 | t0, where s1 and s2 use the eta
*              packing and t0 uses the t0 packing.
*
* Arguments:   - uint8_t sk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const uint8_t tr[]: byte array containing tr
*              - const uint8_t key[]: byte array containing key
*              - const polyveck *t0: pointer to vector t0
*              - const polyvecl *s1: pointer to vector s1
*              - const polyveck *s2: pointer to vector s2
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
                                      const uint8_t rho[SEEDBYTES],
                                      const uint8_t tr[CRHBYTES],
                                      const uint8_t key[SEEDBYTES],
                                      const polyveck *t0,
                                      const polyvecl *s1,
                                      const polyveck *s2) {
    uint8_t *out = sk;
    unsigned int n;

    /* Fixed-size byte strings first: rho, key, tr. */
    for (n = 0; n < SEEDBYTES; ++n) {
        *out++ = rho[n];
    }
    for (n = 0; n < SEEDBYTES; ++n) {
        *out++ = key[n];
    }
    for (n = 0; n < CRHBYTES; ++n) {
        *out++ = tr[n];
    }

    /* Then the polynomial vectors: s1 (L), s2 (K), t0 (K). */
    for (n = 0; n < L; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(out, &s1->vec[n]);
        out += POLYETA_PACKEDBYTES;
    }
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(out, &s2->vec[n]);
        out += POLYETA_PACKEDBYTES;
    }
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(out, &t0->vec[n]);
        out += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_unpack_sk
*
* Description: Unpack the secret key. The byte layout read is
*              rho | key | tr | s1 | s2 | t0, matching
*              PQCLEAN_DILITHIUM2_CLEAN_pack_sk.
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - uint8_t tr[]: output byte array for tr
*              - uint8_t key[]: output byte array for key
*              - polyveck *t0: pointer to output vector t0
*              - polyvecl *s1: pointer to output vector s1
*              - polyveck *s2: pointer to output vector s2
*              - const uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
                                        uint8_t tr[CRHBYTES],
                                        uint8_t key[SEEDBYTES],
                                        polyveck *t0,
                                        polyvecl *s1,
                                        polyveck *s2,
                                        const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) {
    const uint8_t *in = sk;
    unsigned int n;

    /* Fixed-size byte strings first: rho, key, tr. */
    for (n = 0; n < SEEDBYTES; ++n) {
        rho[n] = *in++;
    }
    for (n = 0; n < SEEDBYTES; ++n) {
        key[n] = *in++;
    }
    for (n = 0; n < CRHBYTES; ++n) {
        tr[n] = *in++;
    }

    /* Then the polynomial vectors: s1 (L), s2 (K), t0 (K). */
    for (n = 0; n < L; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[n], in);
        in += POLYETA_PACKEDBYTES;
    }
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[n], in);
        in += POLYETA_PACKEDBYTES;
    }
    for (n = 0; n < K; ++n) {
        PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[n], in);
        in += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_pack_sig
*
* Description: Bit-pack signature sig = (c, z, h). The hint h is encoded in
*              OMEGA + K bytes: the positions of the non-zero coefficients
*              of all K polynomials are listed back to back, and byte
*              OMEGA + i holds the running count of positions after
*              polynomial i. Unused position bytes stay zero. No bound
*              check is made on the number of non-zero hint coefficients.
*
* Arguments:   - uint8_t sig[]: output byte array
*              - const uint8_t *c: pointer to challenge hash of length SEEDBYTES
*              - const polyvecl *z: pointer to vector z
*              - const polyveck *h: pointer to hint vector h
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES],
                                       const uint8_t c[SEEDBYTES],
                                       const polyvecl *z,
                                       const polyveck *h) {
    uint8_t *out = sig;
    uint8_t *hint;
    unsigned int row, col;
    unsigned int used = 0;

    for (col = 0; col < SEEDBYTES; ++col) {
        *out++ = c[col];
    }

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(out, &z->vec[row]);
        out += POLYZ_PACKEDBYTES;
    }

    /* Encode h */
    hint = out;
    for (col = 0; col < OMEGA + K; ++col) {
        hint[col] = 0;
    }

    for (row = 0; row < K; ++row) {
        for (col = 0; col < N; ++col) {
            if (h->vec[row].coeffs[col] != 0) {
                hint[used++] = (uint8_t) col;
            }
        }

        hint[OMEGA + row] = (uint8_t) used;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_unpack_sig
*
* Description: Unpack signature sig = (c, z, h) and validate the hint
*              encoding: per-polynomial counts must be non-decreasing and
*              at most OMEGA, positions inside one polynomial must be
*              strictly increasing, and unused position bytes must be zero.
*
* Arguments:   - uint8_t *c: pointer to output challenge hash
*              - polyvecl *z: pointer to output vector z
*              - polyveck *h: pointer to output hint vector h
*              - const uint8_t sig[]: byte array containing
*                bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES],
                                        polyvecl *z,
                                        polyveck *h,
                                        const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) {
    const uint8_t *in = sig;
    const uint8_t *hint;
    unsigned int row, col;
    unsigned int first = 0;
    unsigned int last;

    for (col = 0; col < SEEDBYTES; ++col) {
        c[col] = *in++;
    }

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[row], in);
        in += POLYZ_PACKEDBYTES;
    }

    /* Decode h */
    hint = in;
    for (row = 0; row < K; ++row) {
        for (col = 0; col < N; ++col) {
            h->vec[row].coeffs[col] = 0;
        }

        last = hint[OMEGA + row];
        if (last < first || last > OMEGA) {
            return 1;
        }

        for (col = first; col < last; ++col) {
            /* Coefficients are ordered for strong unforgeability */
            if (col > first && hint[col] <= hint[col - 1]) {
                return 1;
            }
            h->vec[row].coeffs[hint[col]] = 1;
        }

        first = last;
    }

    /* Extra indices are zero for strong unforgeability */
    for (col = first; col < OMEGA; ++col) {
        if (hint[col] != 0) {
            return 1;
        }
    }

    return 0;
}

+ 31
- 0
crypto_sign/dilithium/dilithium2/clean/packing.h 查看文件

@@ -0,0 +1,31 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PACKING_H
#define PQCLEAN_DILITHIUM2_CLEAN_PACKING_H
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* Serialize the public key as rho followed by the K packed t1 polynomials. */
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);

/* Serialize the secret key; the byte layout is rho | key | tr | s1 | s2 | t0
 * (note that this differs from the parameter order). */
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2);

/* Serialize a signature as c | z | encoded hint h (OMEGA + K bytes). */
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);

/* Inverse of pack_pk: fills rho and t1 from pk. */
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]);

/* Inverse of pack_sk: fills rho, key, tr, s1, s2 and t0 from sk. */
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]);

/* Inverse of pack_sig with validation of the hint encoding.
 * Returns 1 if the signature is malformed and 0 otherwise. */
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]);

#endif

+ 41
- 0
crypto_sign/dilithium/dilithium2/clean/params.h 查看文件

@@ -0,0 +1,41 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H
#define PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H

/* Compile-time parameters of this Dilithium2 build. */


/* Byte lengths of seeds and of the CRH output. */
#define SEEDBYTES 32
#define CRHBYTES 48
/* Number of coefficients per polynomial and the modulus. */
#define N 256
#define Q 8380417
/* D is the shift used by poly_shiftl / power2round (2^D). */
#define D 13
#define ROOT_OF_UNITY 1753

/* K and L are the lengths of polyveck and polyvecl. ETA bounds the
 * coefficients sampled by poly_uniform_eta to [-ETA, ETA]. OMEGA is the
 * maximum number of hint positions a signature can encode. */
#define K 4
#define L 4
#define ETA 2
#define TAU 39
#define BETA 78
#define GAMMA1 (1 << 17)
/* (Q-1)/88 evaluates to 95232. */
#define GAMMA2 ((Q-1)/88)
#define OMEGA 80
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2"


/* Packed sizes in bytes of a single polynomial for each encoding. */
#define POLYT1_PACKEDBYTES 320
#define POLYT0_PACKEDBYTES 416
/* Hint vector: OMEGA position bytes plus one count byte per polynomial. */
#define POLYVECH_PACKEDBYTES (OMEGA + K)

#define POLYZ_PACKEDBYTES 576

#define POLYW1_PACKEDBYTES 192

#define POLYETA_PACKEDBYTES 96

/* Derived key and signature sizes; with the values above these evaluate to
 * 1312, 2544 and 2420 bytes respectively. */
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+ L*POLYETA_PACKEDBYTES \
+ K*POLYETA_PACKEDBYTES \
+ K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif

+ 867
- 0
crypto_sign/dilithium/dilithium2/clean/poly.c 查看文件

@@ -0,0 +1,867 @@
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "rounding.h"
#include "symmetric.h"
#include <stdint.h>

/* Benchmark hooks used throughout this file. Both expand to nothing here, so
 * the argument of DBENCH_STOP (e.g. *tred) is never evaluated. */
#define DBENCH_START()
#define DBENCH_STOP(t)

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_reduce
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_reduce32 to every coefficient
*              in place. The documented result range of that reduction is
*              [-6283009,6283007].
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        a->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a->coeffs[idx]);
        ++idx;
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_caddq
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_caddq to every coefficient in
*              place, i.e. add Q to each coefficient that is negative.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        a->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_caddq(a->coeffs[idx]);
        ++idx;
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_freeze
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_freeze to every coefficient in
*              place, reducing each one to its standard representative.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        a->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_freeze(a->coeffs[idx]);
        ++idx;
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_add
*
* Description: Coefficient-wise sum c = a + b. The sums are stored as
*              computed; no modular reduction is performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first summand
*              - const poly *b: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        c->coeffs[idx] = a->coeffs[idx] + b->coeffs[idx];
        ++idx;
    }

    DBENCH_STOP(*tadd);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_sub
*
* Description: Coefficient-wise difference c = a - b. The differences are
*              stored as computed; no modular reduction is performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial, which is
*                               subtracted from the first
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        c->coeffs[idx] = a->coeffs[idx] - b->coeffs[idx];
        ++idx;
    }

    DBENCH_STOP(*tadd);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl
*
* Description: Multiply polynomial by 2^D without modular reduction. Assumes
*              input coefficients to be less than 2^{31-D} in absolute value.
*
*              The scaling is written as a multiplication rather than a
*              left shift: shifting a negative signed value left is
*              undefined behaviour in C, whereas the product is well defined
*              for every input that satisfies the bound above.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N; ++i) {
        a->coeffs[i] = a->coeffs[i] * (1 << D);
    }

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_ntt
*
* Description: In-place forward NTT of a polynomial; forwards the
*              coefficient array to PQCLEAN_DILITHIUM2_CLEAN_ntt.
*              Coefficients can grow by 8*Q in absolute value.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a) {
    int32_t *coeffs = a->coeffs;
    DBENCH_START();

    PQCLEAN_DILITHIUM2_CLEAN_ntt(coeffs);

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont
*
* Description: In-place inverse NTT and multiplication by 2^{32}; forwards
*              the coefficient array to
*              PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont. Input coefficients
*              need to be less than Q in absolute value and output
*              coefficients are again bounded by Q.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a) {
    int32_t *coeffs = a->coeffs;
    DBENCH_START();

    PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(coeffs);

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery
*
* Description: Coefficient-wise product of two polynomials in NTT domain
*              representation. Each 64-bit product goes through Montgomery
*              reduction, so the result carries a factor of 2^{-32}.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
    unsigned int idx;
    int64_t wide;
    DBENCH_START();

    for (idx = 0; idx < N; ++idx) {
        /* Widen before multiplying so the product cannot overflow 32 bits. */
        wide = (int64_t)a->coeffs[idx] * b->coeffs[idx];
        c->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(wide);
    }

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_power2round
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_power2round to every
*              coefficient. For each coefficient c this yields c0, c1 such
*              that c mod Q = c1*2^D + c0 with -2^{D-1} < c0 <= 2^{D-1}.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        /* The helper returns the high part and stores the low part via pointer. */
        a1->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_power2round(&a0->coeffs[idx], a->coeffs[idx]);
        ++idx;
    }

    DBENCH_STOP(*tround);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_decompose
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_decompose to every
*              coefficient. For each coefficient c this yields high and low
*              bits c1, c0 such that c mod Q = c1*ALPHA + c0 with
*              -ALPHA/2 < c0 <= ALPHA/2, except when c1 = (Q-1)/ALPHA, where
*              c1 is set to 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        /* The helper returns the high part and stores the low part via pointer. */
        a1->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_decompose(&a0->coeffs[idx], a->coeffs[idx]);
        ++idx;
    }

    DBENCH_STOP(*tround);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint
*
* Description: Compute the hint polynomial by applying
*              PQCLEAN_DILITHIUM2_CLEAN_make_hint to each pair of low/high
*              coefficients. A hint coefficient indicates whether the low
*              bits of the corresponding input coefficient overflow into
*              the high bits.
*
* Arguments:   - poly *h: pointer to output hint polynomial
*              - const poly *a0: pointer to low part of input polynomial
*              - const poly *a1: pointer to high part of input polynomial
*
* Returns the sum of all hint coefficients (the number of 1 bits).
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) {
    unsigned int idx;
    unsigned int ones = 0;
    DBENCH_START();

    for (idx = 0; idx < N; ++idx) {
        h->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_make_hint(a0->coeffs[idx], a1->coeffs[idx]);
        ones += h->coeffs[idx];
    }

    DBENCH_STOP(*tround);
    return ones;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint
*
* Description: Correct the high bits of a polynomial by applying
*              PQCLEAN_DILITHIUM2_CLEAN_use_hint to each coefficient
*              together with its hint coefficient.
*
* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
*              - const poly *a: pointer to input polynomial
*              - const poly *h: pointer to input hint polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        b->coeffs[idx] = PQCLEAN_DILITHIUM2_CLEAN_use_hint(a->coeffs[idx], h->coeffs[idx]);
        ++idx;
    }

    DBENCH_STOP(*tround);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm
*
* Description: Check infinity norm of polynomial against given bound.
*              Assumes input coefficients were reduced by
*              PQCLEAN_DILITHIUM2_CLEAN_reduce32().
*
* Arguments:   - const poly *a: pointer to polynomial
*              - int32_t B: norm bound
*
* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise.
* A bound B larger than (Q-1)/8 is rejected up front with return value 1.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B) {
    unsigned int idx;
    int32_t coeff, sign, magnitude;
    int exceeded = 0;
    DBENCH_START();

    if (B > (Q - 1) / 8) {
        return 1;
    }

    /* It is ok to leak which coefficient violates the bound since
       the probability for each coefficient is independent of secret
       data but we must not leak the sign of the centralized representative. */
    for (idx = 0; idx < N; ++idx) {
        coeff = a->coeffs[idx];

        /* Branch-free absolute value: sign is expected to be all ones for a
           negative coefficient (arithmetic right shift) and zero otherwise,
           so the second line computes coeff - 2*coeff only for negatives. */
        sign = coeff >> 31;
        magnitude = coeff - (sign & 2 * coeff);

        if (magnitude >= B) {
            exceeded = 1;
            break;
        }
    }

    DBENCH_STOP(*tsample);
    return exceeded;
}

/*************************************************
* Name:        rej_uniform
*
* Description: Sample uniformly random coefficients in [0, Q-1] by
*              rejection sampling. Input bytes are consumed three at a
*              time as a little-endian value, the top bit is cleared to
*              give a 23-bit candidate, and candidates >= Q are dropped.
*
* Arguments:   - int32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const uint8_t *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_uniform(int32_t *a,
                                unsigned int len,
                                const uint8_t *buf,
                                unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int rd = 0;
    uint32_t cand;
    DBENCH_START();

    while (accepted < len && rd + 3 <= buflen) {
        cand = (uint32_t)buf[rd]
               | ((uint32_t)buf[rd + 1] << 8)
               | ((uint32_t)buf[rd + 2] << 16);
        rd += 3;
        cand &= 0x7FFFFF;

        if (cand < Q) {
            a[accepted++] = (int32_t)cand;
        }
    }

    DBENCH_STOP(*tsample);
    return accepted;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform
*
* Description: Sample polynomial with uniformly random coefficients
*              in [0,Q-1] by rejection sampling on the output of the
*              stream128 generator initialised with (seed, nonce). An
*              initial batch of POLY_UNIFORM_NBLOCKS blocks is squeezed;
*              if that does not yield N coefficients, one more block is
*              squeezed at a time, carrying over the bytes that did not
*              form a complete 3-byte group.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce) {
    /* Two spare bytes hold the leftover (at most buflen % 3) of a previous round. */
    uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2];
    unsigned int avail = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES;
    unsigned int filled, leftover, n;
    stream128_state state;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);

    filled = rej_uniform(a->coeffs, N, buf, avail);

    while (filled < N) {
        /* Move the unconsumed tail bytes to the front of the buffer. */
        leftover = avail % 3;
        for (n = 0; n < leftover; ++n) {
            buf[n] = buf[avail - leftover + n];
        }

        stream128_squeezeblocks(buf + leftover, 1, &state);
        avail = STREAM128_BLOCKBYTES + leftover;
        filled += rej_uniform(a->coeffs + filled, N - filled, buf, avail);
    }
    stream128_release(&state);
}

/*************************************************
* Name:        rej_eta
*
* Description: Sample uniformly random coefficients in [-ETA, ETA] by
*              performing rejection sampling on array of random bytes.
*              Each byte supplies two 4-bit candidates (low nibble first).
*              A nibble equal to 15 is rejected; an accepted nibble is
*              reduced modulo 5 and mapped to 2 - (nibble mod 5). The
*              mapping is hard-wired for ETA = 2.
*
*              The final subtraction is carried out in signed arithmetic.
*              Computing 2 - t in uint32_t would wrap for t > 2 and then
*              depend on an implementation-defined unsigned-to-signed
*              conversion to yield the negative coefficients.
*
* Arguments:   - int32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const uint8_t *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_eta(int32_t *a,
                            unsigned int len,
                            const uint8_t *buf,
                            unsigned int buflen) {
    unsigned int ctr, pos;
    uint32_t t0, t1;
    DBENCH_START();

    ctr = pos = 0;
    while (ctr < len && pos < buflen) {
        t0 = buf[pos] & 0x0F;
        t1 = buf[pos++] >> 4;

        if (t0 < 15) {
            /* (205 * t >> 10) equals t / 5 for t in [0, 14], so this is t mod 5. */
            t0 = t0 - (205 * t0 >> 10) * 5;
            a[ctr++] = 2 - (int32_t)t0;
        }
        if (t1 < 15 && ctr < len) {
            t1 = t1 - (205 * t1 >> 10) * 5;
            a[ctr++] = 2 - (int32_t)t1;
        }
    }

    DBENCH_STOP(*tsample);
    return ctr;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta
*
* Description: Fill all N coefficients of a polynomial with small values
*              produced by rej_eta, fed with bytes squeezed from the
*              stream128 interface initialised from seed and nonce.
*              Additional single blocks are squeezed until N coefficients
*              have been accepted.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce) {
    uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES];
    unsigned int filled;
    stream128_state state;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);

    /* First pass over the initial batch of blocks. */
    filled = rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES);

    /* Top up one block at a time; the front of buf is reused. */
    while (filled < N) {
        stream128_squeezeblocks(buf, 1, &state);
        filled += rej_eta(&a->coeffs[filled], N - filled, buf, STREAM128_BLOCKBYTES);
    }
    stream128_release(&state);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1
*
* Description: Sample polynomial with coefficients in
*              [-(GAMMA1 - 1), GAMMA1] by squeezing enough whole blocks
*              from the stream256 interface (initialised from seed and
*              nonce) to cover POLYZ_PACKEDBYTES bytes and unpacking
*              them with polyz_unpack. No rejection step is involved.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
*              - uint16_t nonce: 16-bit nonce
**************************************************/
#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a,
        const uint8_t seed[CRHBYTES],
        uint16_t nonce) {
    stream256_state xof;
    uint8_t packed[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES];

    stream256_init(&xof, seed, nonce);
    stream256_squeezeblocks(packed, POLY_UNIFORM_GAMMA1_NBLOCKS, &xof);
    /* The stream is no longer needed once the buffer is filled. */
    stream256_release(&xof);

    PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(a, packed);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_poly_challenge
*
* Description: Implementation of H. Samples polynomial with TAU nonzero
*              coefficients in {-1,1} using the output stream of
*              SHAKE256(seed). The first 8 stream bytes (little-endian)
*              supply the sign bits; the following bytes supply
*              positions, each accepted only if it does not exceed the
*              current index i.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) {
    unsigned int i, b, pos;
    uint64_t signs;
    uint8_t buf[SHAKE256_RATE];
    shake256incctx state;

    shake256_inc_init(&state);
    shake256_inc_absorb(&state, seed, SEEDBYTES);
    shake256_inc_finalize(&state);
    shake256_inc_squeeze(buf, sizeof buf, &state);

    /* Assemble the 64-bit sign word from the first eight bytes. */
    signs = 0;
    for (i = 0; i < 8; ++i) {
        signs |= (uint64_t)buf[i] << 8 * i;
    }
    pos = 8;

    for (i = 0; i < N; ++i) {
        c->coeffs[i] = 0;
    }
    for (i = N - TAU; i < N; ++i) {
        /* Draw bytes until one is a valid position b <= i, refilling
         * the buffer from the stream whenever it runs out. */
        do {
            if (pos >= SHAKE256_RATE) {
                shake256_inc_squeeze(buf, sizeof buf, &state);
                pos = 0;
            }

            b = buf[pos++];
        } while (b > i);

        /* Move whatever sits at b to slot i, then place the new +/-1 at b. */
        c->coeffs[i] = c->coeffs[b];
        /* Compute the sign in signed arithmetic; in uint64_t, 1 - 2 would wrap
         * and rely on implementation-defined narrowing to int32_t. */
        c->coeffs[b] = 1 - 2 * (int32_t) (signs & 1);
        signs >>= 1;
    }
    shake256_inc_ctx_release(&state);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack
*
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
*              Each coefficient c is stored as the 3-bit value ETA - c;
*              eight coefficients occupy three output bytes.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYETA_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a) {
    unsigned int blk, j;
    uint8_t t[8];
    DBENCH_START();

    for (blk = 0; blk < N / 8; ++blk) {
        const int32_t *src = &a->coeffs[8 * blk];
        uint8_t *dst = &r[3 * blk];

        /* Map the eight coefficients of this group to ETA - c. */
        for (j = 0; j < 8; ++j) {
            t[j] = (uint8_t) (ETA - src[j]);
        }

        dst[0] = (uint8_t) ((t[0] >> 0) | (t[1] << 3) | (t[2] << 6));
        dst[1] = (uint8_t) ((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7));
        dst[2] = (uint8_t) ((t[5] >> 1) | (t[6] << 2) | (t[7] << 5));
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack
*
* Description: Unpack polynomial with coefficients in [-ETA,ETA].
*              Every three input bytes hold eight 3-bit fields; each
*              field f becomes the coefficient ETA - f.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) {
    unsigned int blk, j;
    DBENCH_START();

    for (blk = 0; blk < N / 8; ++blk) {
        const uint8_t *in = &a[3 * blk];
        int32_t *out = &r->coeffs[8 * blk];

        /* Extract the eight 3-bit fields. */
        out[0] = (in[0] >> 0) & 7;
        out[1] = (in[0] >> 3) & 7;
        out[2] = ((in[0] >> 6) | (in[1] << 2)) & 7;
        out[3] = (in[1] >> 1) & 7;
        out[4] = (in[1] >> 4) & 7;
        out[5] = ((in[1] >> 7) | (in[2] << 1)) & 7;
        out[6] = (in[2] >> 2) & 7;
        out[7] = (in[2] >> 5) & 7;

        /* Undo the ETA - c mapping applied when packing. */
        for (j = 0; j < 8; ++j) {
            out[j] = ETA - out[j];
        }
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack
*
* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
*              Four coefficients are written into five output bytes.
*              Input coefficients are assumed to be standard representatives.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYT1_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a) {
    unsigned int grp;
    DBENCH_START();

    for (grp = 0; grp < N / 4; ++grp) {
        const int32_t *c = &a->coeffs[4 * grp];
        uint8_t *out = &r[5 * grp];

        out[0] = (uint8_t) (c[0] >> 0);
        out[1] = (uint8_t) ((c[0] >> 8) | (c[1] << 2));
        out[2] = (uint8_t) ((c[1] >> 6) | (c[2] << 4));
        out[3] = (uint8_t) ((c[2] >> 4) | (c[3] << 6));
        out[4] = (uint8_t) (c[3] >> 2);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack
*
* Description: Unpack polynomial t1 with 10-bit coefficients: every five
*              input bytes yield four coefficients, each masked to 10 bits.
*              Output coefficients are standard representatives.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) {
    unsigned int grp;
    DBENCH_START();

    for (grp = 0; grp < N / 4; ++grp) {
        const uint8_t *in = &a[5 * grp];
        int32_t *c = &r->coeffs[4 * grp];

        c[0] = ((in[0] >> 0) | ((uint32_t)in[1] << 8)) & 0x3FF;
        c[1] = ((in[1] >> 2) | ((uint32_t)in[2] << 6)) & 0x3FF;
        c[2] = ((in[2] >> 4) | ((uint32_t)in[3] << 4)) & 0x3FF;
        c[3] = ((in[3] >> 6) | ((uint32_t)in[4] << 2)) & 0x3FF;
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack
*
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
*              Each coefficient c is stored as 2^{D-1} - c; the byte
*              layout below places eight such values, 13 bits apart,
*              into 13 output bytes.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYT0_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a) {
    unsigned int blk, j;
    uint32_t t[8];
    DBENCH_START();

    for (blk = 0; blk < N / 8; ++blk) {
        const int32_t *src = &a->coeffs[8 * blk];
        uint8_t *dst = &r[13 * blk];

        /* Shift the group into the non-negative range. */
        for (j = 0; j < 8; ++j) {
            t[j] = (1 << (D - 1)) - src[j];
        }

        /* Bytes shared by two values are assembled in one expression. */
        dst[0]  = (uint8_t) t[0];
        dst[1]  = (uint8_t) ((t[0] >> 8) | (t[1] << 5));
        dst[2]  = (uint8_t) (t[1] >> 3);
        dst[3]  = (uint8_t) ((t[1] >> 11) | (t[2] << 2));
        dst[4]  = (uint8_t) ((t[2] >> 6) | (t[3] << 7));
        dst[5]  = (uint8_t) (t[3] >> 1);
        dst[6]  = (uint8_t) ((t[3] >> 9) | (t[4] << 4));
        dst[7]  = (uint8_t) (t[4] >> 4);
        dst[8]  = (uint8_t) ((t[4] >> 12) | (t[5] << 1));
        dst[9]  = (uint8_t) ((t[5] >> 7) | (t[6] << 6));
        dst[10] = (uint8_t) (t[6] >> 2);
        dst[11] = (uint8_t) ((t[6] >> 10) | (t[7] << 3));
        dst[12] = (uint8_t) (t[7] >> 5);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack
*
* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
*              Every 13 input bytes hold eight 13-bit fields; each field f
*              (masked with 0x1FFF) becomes the coefficient 2^{D-1} - f.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) {
    unsigned int blk, j;
    uint32_t f[8];
    DBENCH_START();

    for (blk = 0; blk < N / 8; ++blk) {
        const uint8_t *in = &a[13 * blk];
        int32_t *out = &r->coeffs[8 * blk];

        /* Gather the raw bits of the eight fields. */
        f[0] = in[0] | ((uint32_t)in[1] << 8);
        f[1] = (in[1] >> 5) | ((uint32_t)in[2] << 3) | ((uint32_t)in[3] << 11);
        f[2] = (in[3] >> 2) | ((uint32_t)in[4] << 6);
        f[3] = (in[4] >> 7) | ((uint32_t)in[5] << 1) | ((uint32_t)in[6] << 9);
        f[4] = (in[6] >> 4) | ((uint32_t)in[7] << 4) | ((uint32_t)in[8] << 12);
        f[5] = (in[8] >> 1) | ((uint32_t)in[9] << 7);
        f[6] = (in[9] >> 6) | ((uint32_t)in[10] << 2) | ((uint32_t)in[11] << 10);
        f[7] = (in[11] >> 3) | ((uint32_t)in[12] << 5);

        /* Keep 13 bits per field and undo the 2^{D-1} - c mapping. */
        for (j = 0; j < 8; ++j) {
            out[j] = (1 << (D - 1)) - (int32_t) (f[j] & 0x1FFF);
        }
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyz_pack
*
* Description: Bit-pack polynomial with coefficients
*              in [-(GAMMA1 - 1), GAMMA1]. Each coefficient c is stored
*              as GAMMA1 - c; four such values, 18 bits apart, fill
*              nine output bytes.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYZ_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a) {
    unsigned int grp, j;
    uint32_t t[4];
    DBENCH_START();

    for (grp = 0; grp < N / 4; ++grp) {
        const int32_t *src = &a->coeffs[4 * grp];
        uint8_t *dst = &r[9 * grp];

        for (j = 0; j < 4; ++j) {
            t[j] = GAMMA1 - src[j];
        }

        /* Bytes shared by two values are assembled in one expression. */
        dst[0] = (uint8_t) t[0];
        dst[1] = (uint8_t) (t[0] >> 8);
        dst[2] = (uint8_t) ((t[0] >> 16) | (t[1] << 2));
        dst[3] = (uint8_t) (t[1] >> 6);
        dst[4] = (uint8_t) ((t[1] >> 14) | (t[2] << 4));
        dst[5] = (uint8_t) (t[2] >> 4);
        dst[6] = (uint8_t) ((t[2] >> 12) | (t[3] << 6));
        dst[7] = (uint8_t) (t[3] >> 2);
        dst[8] = (uint8_t) (t[3] >> 10);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack
*
* Description: Unpack polynomial z with coefficients
*              in [-(GAMMA1 - 1), GAMMA1]. Every nine input bytes hold
*              four 18-bit fields; each field f (masked with 0x3FFFF)
*              becomes the coefficient GAMMA1 - f.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a) {
    unsigned int grp, j;
    uint32_t f[4];
    DBENCH_START();

    for (grp = 0; grp < N / 4; ++grp) {
        const uint8_t *in = &a[9 * grp];
        int32_t *out = &r->coeffs[4 * grp];

        /* Gather the raw bits of the four fields. */
        f[0] = in[0] | ((uint32_t)in[1] << 8) | ((uint32_t)in[2] << 16);
        f[1] = (in[2] >> 2) | ((uint32_t)in[3] << 6) | ((uint32_t)in[4] << 14);
        f[2] = (in[4] >> 4) | ((uint32_t)in[5] << 4) | ((uint32_t)in[6] << 12);
        f[3] = (in[6] >> 6) | ((uint32_t)in[7] << 2) | ((uint32_t)in[8] << 10);

        /* Keep 18 bits per field and undo the GAMMA1 - c mapping. */
        for (j = 0; j < 4; ++j) {
            out[j] = GAMMA1 - (int32_t) (f[j] & 0x3FFFF);
        }
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack
*
* Description: Bit-pack polynomial w1. This variant writes four
*              coefficients, 6 bits apart, into three output bytes.
*              Input coefficients are assumed to be standard representatives.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYW1_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a) {
    unsigned int grp;
    DBENCH_START();

    for (grp = 0; grp < N / 4; ++grp) {
        const int32_t *c = &a->coeffs[4 * grp];
        uint8_t *out = &r[3 * grp];

        out[0] = (uint8_t) (c[0] | (c[1] << 6));
        out[1] = (uint8_t) ((c[1] >> 2) | (c[2] << 4));
        out[2] = (uint8_t) ((c[2] >> 4) | (c[3] << 2));
    }

    DBENCH_STOP(*tpack);
}

+ 53
- 0
crypto_sign/dilithium/dilithium2/clean/poly.h 查看文件

@@ -0,0 +1,53 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLY_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLY_H
#include "params.h"
#include <stdint.h>

/* Polynomial represented by its N signed 32-bit coefficients. */
typedef struct {
int32_t coeffs[N];
} poly;

/* Coefficient reduction / normalisation (in place). */
void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a);

/* Coefficient-wise arithmetic; c receives the result. */
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a);

/* NTT-domain operations. */
void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

/* Rounding, decomposition and hints. */
void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h);

/* Norm check and sampling from a seed (plus a 16-bit nonce where given). */
int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a,
const uint8_t seed[CRHBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Bit-packing of coefficients in [-ETA, ETA]. */
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);

/* Bit-packing of t1 (10-bit coefficients). */
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a);

/* Bit-packing of t0 (coefficients in ]-2^{D-1}, 2^{D-1}]). */
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a);

/* Bit-packing of z (coefficients in [-(GAMMA1 - 1), GAMMA1]). */
void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a);

/* Bit-packing of w1; only a pack routine is declared. */
void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a);

#endif

+ 448
- 0
crypto_sign/dilithium/dilithium2/clean/polyvec.c 查看文件

@@ -0,0 +1,448 @@
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
*              random coefficients a_{i,j} by calling poly_uniform for
*              every entry with seed rho and nonce (i << 8) + j, where
*              i is the row and j the column.
*
* Arguments:   - polyvecl mat[K]: output matrix
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
    for (unsigned int row = 0; row < K; ++row) {
        for (unsigned int col = 0; col < L; ++col) {
            /* Row index in the high byte, column index in the low byte. */
            const uint16_t nonce = (uint16_t) ((row << 8) + col);
            PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&mat[row].vec[col], rho, nonce);
        }
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery
*
* Description: Matrix-vector product: entry i of t is the accumulated
*              pointwise product of matrix row i with v, computed by
*              polyvecl_pointwise_acc_montgomery.
*
* Arguments:   - polyveck *t: pointer to output vector
*              - const polyvecl mat[K]: input matrix (K rows)
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    for (unsigned int row = 0; row < K; ++row) {
        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[row], &mat[row], v);
    }
}

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta
*
* Description: Sample every polynomial of a length-L vector with
*              poly_uniform_eta, using consecutive nonces starting at
*              the given one (wrapping at 16 bits).
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
*              - uint16_t nonce: nonce used for the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[idx], seed, (uint16_t) (nonce + idx));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1
*
* Description: Sample every polynomial of a length-L vector with
*              poly_uniform_gamma1. Polynomial i uses the 16-bit nonce
*              L * nonce + i.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
*              - uint16_t nonce: base nonce for the whole vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        const uint16_t sub_nonce = (uint16_t) (L * nonce + idx);
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(&v->vec[idx], seed, sub_nonce);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce
*
* Description: Apply poly_reduce to every polynomial in a vector of
*              length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v) {
    for (poly *p = v->vec; p != v->vec + L; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze
*
* Description: Bring the coefficients of every polynomial in a vector of
*              length L to standard representatives (via poly_freeze).
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) {
    for (poly *p = v->vec; p != v->vec + L; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add
*
* Description: Component-wise sum of two vectors of polynomials of
*              length L. No modular reduction is performed.
*
* Arguments:   - polyvecl *w: pointer to output vector
*              - const polyvecl *u: pointer to first summand
*              - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        poly *sum = &w->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_add(sum, &u->vec[idx], &v->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt
*
* Description: Forward NTT of all polynomials in vector of length L
*              (one poly_ntt call per entry). Output coefficients can be
*              up to 16*Q larger than input coefficients.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v) {
    for (poly *p = v->vec; p != v->vec + L; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont
*
* Description: Apply poly_invntt_tomont to every polynomial in a vector
*              of length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v) {
    for (poly *p = v->vec; p != v->vec + L; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-L vector by the same
*              polynomial a using poly_pointwise_montgomery.
*
* Arguments:   - polyvecl *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        poly *dst = &r->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(dst, a, &v->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
*              resulting vector by 2^{-32} and add (accumulate) polynomials
*              in it. Input/output vectors are in NTT domain representation.
*              The first product is written straight into w; every further
*              product goes through a temporary and is added with poly_add.
*
* Arguments:   - poly *w: output polynomial
*              - const polyvecl *u: pointer to first input vector
*              - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
        const polyvecl *u,
        const polyvecl *v) {
    poly product;

    PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]);
    for (unsigned int idx = 1; idx < L; ++idx) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&product, &u->vec[idx], &v->vec[idx]);
        PQCLEAN_DILITHIUM2_CLEAN_poly_add(w, w, &product);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
*              Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce().
*              Stops at the first polynomial for which poly_chknorm
*              reports a violation.
*
* Arguments:   - const polyvecl *v: pointer to vector
*              - int32_t bound: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
    unsigned int idx = 0;

    while (idx < L) {
        if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[idx], bound)) {
            return 1;
        }
        ++idx;
    }

    return 0;
}

/**************************************************************/
/************ Vectors of polynomials of length K **************/
/**************************************************************/

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta
*
* Description: Sample every polynomial of a length-K vector with
*              poly_uniform_eta, using consecutive nonces starting at
*              the given one (wrapping at 16 bits).
*
* Arguments:   - polyveck *v: pointer to output vector
*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
*              - uint16_t nonce: nonce used for the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[idx], seed, (uint16_t) (nonce + idx));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
*              to representatives in [-6283009,6283007] (one poly_reduce
*              call per entry).
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v) {
    for (poly *p = v->vec; p != v->vec + K; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq
*
* Description: For all coefficients of polynomials in vector of length K
*              add Q if coefficient is negative (one poly_caddq call per
*              entry).
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v) {
    for (poly *p = v->vec; p != v->vec + K; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze
*
* Description: Bring the coefficients of every polynomial in a vector of
*              length K to standard representatives (via poly_freeze).
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) {
    for (poly *p = v->vec; p != v->vec + K; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_add
*
* Description: Component-wise sum of two vectors of polynomials of
*              length K. No modular reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first summand
*              - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        poly *sum = &w->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_add(sum, &u->vec[idx], &v->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub
*
* Description: Component-wise difference u - v of two vectors of
*              polynomials of length K. No modular reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first input vector
*              - const polyveck *v: pointer to second input vector to be
*                                   subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        poly *diff = &w->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_sub(diff, &u->vec[idx], &v->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl
*
* Description: Multiply vector of polynomials of length K by 2^D without
*              modular reduction (one poly_shiftl call per entry).
*              Assumes input coefficients to be less than 2^{31-D}.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) {
    for (poly *p = v->vec; p != v->vec + K; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt
*
* Description: Forward NTT of all polynomials in vector of length K
*              (one poly_ntt call per entry). Output coefficients can be
*              up to 16*Q larger than input coefficients.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) {
    for (poly *p = v->vec; p != v->vec + K; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
*              in vector of length K (one poly_invntt_tomont call per
*              entry). Input coefficients need to be less than 2*Q.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v) {
    for (poly *p = v->vec; p != v->vec + K; ++p) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(p);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-K vector by the same
*              polynomial a using poly_pointwise_montgomery.
*
* Arguments:   - polyveck *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        poly *dst = &r->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(dst, a, &v->vec[idx]);
    }
}


/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
*              Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce().
*              Stops at the first polynomial for which poly_chknorm
*              reports a violation.
*
* Arguments:   - const polyveck *v: pointer to vector
*              - int32_t bound: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) {
    unsigned int idx = 0;

    while (idx < K) {
        if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[idx], bound)) {
            return 1;
        }
        ++idx;
    }

    return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
*              standard representatives. Delegates to poly_power2round
*              for each entry.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        poly *high = &v1->vec[idx];
        poly *low = &v0->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(high, low, &v->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute high and low bits a0, a1 such that
*              a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2,
*              except when a1 = (Q-1)/ALPHA, where a1 is set to 0 and
*              -ALPHA/2 <= a0 = a mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*              Delegates to poly_decompose for each entry.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        poly *high = &v1->vec[idx];
        poly *low = &v0->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(high, low, &v->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint
*
* Description: Compute hint vector by calling poly_make_hint on each of
*              the K entries and summing the values it returns.
*
* Arguments:   - polyveck *h: pointer to output vector
*              - const polyveck *v0: pointer to low part of input vector
*              - const polyveck *v1: pointer to high part of input vector
*
* Returns number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h,
        const polyveck *v0,
        const polyveck *v1) {
    unsigned int ones = 0;

    for (unsigned int idx = 0; idx < K; ++idx) {
        ones += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(&h->vec[idx], &v0->vec[idx], &v1->vec[idx]);
    }

    return ones;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector
*              (one poly_use_hint call per entry).
*
* Arguments:   - polyveck *w: pointer to output vector of polynomials with
*                             corrected high bits
*              - const polyveck *u: pointer to input vector
*              - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        poly *corrected = &w->vec[idx];
        PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(corrected, &u->vec[idx], &h->vec[idx]);
    }
}

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) {
    /* Each of the K polynomials is packed into its own consecutive
     * POLYW1_PACKEDBYTES-byte slice of r. */
    uint8_t *out = r;
    unsigned int idx;

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(out, &w1->vec[idx]);
        out += POLYW1_PACKEDBYTES;
    }
}

+ 68
- 0
crypto_sign/dilithium/dilithium2/clean/polyvec.h 查看文件

@@ -0,0 +1,68 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H
/* Vector-of-polynomial types (polyvecl with L entries, polyveck with K
 * entries) and prototypes of the routines that operate on them. */
#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
poly vec[L];
} polyvecl;

/* Samplers: fill v from a seed and a 16-bit nonce. */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v);


/* Norm check against bound B; callers treat a non-zero return as failure. */
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B);



/* Vectors of polynomials of length K */
typedef struct {
poly vec[K];
} polyveck;

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

/* Norm check against bound B; callers treat a non-zero return as failure. */
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
/* decompose, make_hint and use_hint each apply the matching poly_* routine
 * to all K polynomials; make_hint returns the summed per-polynomial count. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

/* Packs polynomial i of w1 into the slice of r starting at
 * i * POLYW1_PACKEDBYTES. */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

/* Matrix helpers: mat is an array of K row vectors, each of type polyvecl. */
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif

+ 69
- 0
crypto_sign/dilithium/dilithium2/clean/reduce.c 查看文件

@@ -0,0 +1,69 @@
#include "params.h"
#include "reduce.h"
#include <stdint.h>

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce
*
* Description: Montgomery reduction. For a finite field element a with
*              -2^{31}Q <= a <= Q*2^31, compute r \equiv a*2^{-32} (mod Q)
*              such that -Q < r < Q.
*
* Arguments:   - int64_t a: finite field element to reduce
*
* Returns r.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a) {
    /* Low 32 bits of a * QINV, reinterpreted as a signed value. */
    const int32_t factor = (int32_t)((uint64_t)a * (uint64_t)QINV);
    /* Subtracting factor * Q is meant to cancel the low 32 bits of a. */
    const int64_t cleared = a - (int64_t)factor * Q;

    return (int32_t)(cleared >> 32);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_reduce32
*
* Description: Partial reduction. For a finite field element a with
*              a <= 2^{31} - 2^{22} - 1, compute r \equiv a (mod Q) such
*              that -6283009 <= r <= 6283007.
*
* Arguments:   - int32_t a: finite field element to reduce
*
* Returns r.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a) {
    /* Rounded quotient estimate: a / 2^23 rounded to nearest. */
    const int32_t quotient = (a + (1 << 22)) >> 23;

    return a - quotient * Q;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_caddq
*
* Description: Conditionally add Q: the input is increased by Q when it is
*              negative and returned unchanged otherwise. Implemented
*              without a branch.
*
* Arguments:   - int32_t a: finite field element
*
* Returns r.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a) {
    /* Sign mask: a >> 31 is expected to be all ones for negative a, else 0. */
    const int32_t sign_mask = a >> 31;

    return a + (sign_mask & Q);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_freeze
*
* Description: For a finite field element a, compute the standard
*              representative r = a mod^+ Q by a partial reduction
*              (reduce32) followed by a conditional addition of Q (caddq).
*
* Arguments:   - int32_t a: finite field element
*
* Returns r.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a) {
    return PQCLEAN_DILITHIUM2_CLEAN_caddq(PQCLEAN_DILITHIUM2_CLEAN_reduce32(a));
}

+ 17
- 0
crypto_sign/dilithium/dilithium2/clean/reduce.h 查看文件

@@ -0,0 +1,17 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H
#define PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H
/* Scalar reduction helpers for arithmetic modulo Q. */
#include "params.h"
#include <stdint.h>

/* NOTE(review): the value is negative, so this is presumably the signed
 * representative of 2^32 modulo Q rather than the non-negative result the
 * C `%` operator would give - confirm against params.h. */
#define MONT (-4186625) // 2^32 % Q
#define QINV 58728449 // q^(-1) mod 2^32

/* Returns r congruent to a * 2^{-32} (mod Q) with -Q < r < Q, for
 * -2^{31}Q <= a <= Q*2^31. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a);

/* Returns r congruent to a (mod Q) with -6283009 <= r <= 6283007, for
 * a <= 2^{31} - 2^{22} - 1. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a);

/* Adds Q to a if a is negative. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a);

/* Returns the standard representative a mod^+ Q (reduce32 then caddq). */
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a);

#endif

+ 98
- 0
crypto_sign/dilithium/dilithium2/clean/rounding.c 查看文件

@@ -0,0 +1,98 @@
#include "params.h"
#include "rounding.h"
#include <stdint.h>

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_power2round
*
* Description: Split a finite field element a into a1 and a0 such that
*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
*              Assumes a to be standard representative.
*
* Arguments:   - int32_t *a0: pointer to output element a0
*              - int32_t a: input element
*
* Returns a1.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a) {
    /* Adding 2^(D-1) - 1 before the shift rounds the quotient so that the
     * remainder falls into the documented range. */
    const int32_t high = (a + (1 << (D - 1)) - 1) >> D;

    *a0 = a - (high << D);
    return high;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_decompose
*
* Description: Split a finite field element a into high bits a1 and low
*              bits a0 such that a mod^+ Q = a1*ALPHA + a0 with
*              -ALPHA/2 < a0 <= ALPHA/2, except if a1 = (Q-1)/ALPHA where
*              a1 is set to 0 and -ALPHA/2 <= a0 = a mod^+ Q - Q < 0.
*              Assumes a to be standard representative.
*
* Arguments:   - int32_t *a0: pointer to output element a0
*              - int32_t a: input element
*
* Returns a1.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a) {
    int32_t high;
    int32_t low;

    /* Fixed-point division: pre-shift by 7 bits, then multiply by 11275 and
     * shift by 24 with rounding (presumably 11275 ~ 2^24 / (2*GAMMA2/128);
     * confirm against params.h). */
    high = (a + 127) >> 7;
    high = (high * 11275 + (1 << 23)) >> 24;
    /* When high exceeds 43 the mask is all ones, so high ^= high gives 0. */
    high ^= ((43 - high) >> 31) & high;

    low = a - high * 2 * GAMMA2;
    /* Subtract Q whenever low is larger than (Q - 1) / 2. */
    low -= (((Q - 1) / 2 - low) >> 31) & Q;

    *a0 = low;
    return high;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_make_hint
*
* Description: Compute the hint bit that signals whether the low bits of
*              the input element overflow into the high bits.
*
* Arguments:   - int32_t a0: low bits of input element
*              - int32_t a1: high bits of input element
*
* Returns 1 if overflow, 0 otherwise.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1) {
    const int above = a0 > GAMMA2;
    const int below = a0 < -GAMMA2;
    /* The lower boundary itself only counts when the high part is non-zero. */
    const int on_lower_edge = (a0 == -GAMMA2) && (a1 != 0);

    return (above || below || on_lower_edge) ? 1u : 0u;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_use_hint
*
* Description: Correct the high bits of an element according to a hint bit.
*              With hint == 0 the high bits from decompose are returned as
*              is; otherwise they are moved one step up (low bits > 0) or
*              one step down, wrapping around within 0..43.
*
* Arguments:   - int32_t a: input element
*              - unsigned int hint: hint bit
*
* Returns corrected high bits.
**************************************************/
int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint) {
    int32_t low;
    const int32_t high = PQCLEAN_DILITHIUM2_CLEAN_decompose(&low, a);

    if (hint == 0) {
        return high;
    }

    if (low > 0) {
        /* Step up, wrapping 43 -> 0. */
        return (high == 43) ? 0 : high + 1;
    }

    /* Step down, wrapping 0 -> 43. */
    return (high == 0) ? 43 : high - 1;
}

+ 14
- 0
crypto_sign/dilithium/dilithium2/clean/rounding.h 查看文件

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H
#define PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H
/* Scalar rounding helpers: high/low-bit decomposition and hint handling. */
#include "params.h"
#include <stdint.h>

/* Splits a as a1*2^D + a0; stores a0 through the pointer, returns a1. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a);

/* Splits a as a1*ALPHA + a0; stores a0 through the pointer, returns a1. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a);

/* Returns 1 if the low bits a0 overflow into the high bits, else 0. */
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1);

/* Returns the high bits of a, corrected according to the hint bit. */
int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint);

#endif

+ 343
- 0
crypto_sign/dilithium/dilithium2/clean/sign.c 查看文件

@@ -0,0 +1,343 @@
#include "fips202.h"
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair
*
* Description: Generates public and private key.
*
* Arguments: - uint8_t *pk: pointer to output public key (allocated
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (allocated
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
/* seedbuf holds rho | rhoprime | key, SEEDBYTES each */
uint8_t seedbuf[3 * SEEDBYTES];
uint8_t tr[CRHBYTES];
const uint8_t *rho, *rhoprime, *key;
polyvecl mat[K];
polyvecl s1, s1hat;
polyveck s2, t1, t0;

/* Get randomness for rho, rhoprime and key */
/* SEEDBYTES random bytes are expanded in place to 3 * SEEDBYTES by SHAKE256 */
randombytes(seedbuf, SEEDBYTES);
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
rho = seedbuf;
rhoprime = seedbuf + SEEDBYTES;
key = seedbuf + 2 * SEEDBYTES;

/* Expand matrix */
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho);

/* Sample short vectors s1 and s2 */
/* Both use rhoprime; s1 is sampled with nonce 0 and s2 with nonce L */
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L);

/* Matrix-vector multiplication */
/* Work on a copy: the untransformed s1 is still needed for pack_sk below */
s1hat = s1;
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1hat);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&t1);

/* Add error vector s2 */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t1, &t1, &s2);

/* Extract t1 and write public key */
/* power2round is called with t1 as both input and high-part output */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t1);
PQCLEAN_DILITHIUM2_CLEAN_pack_pk(pk, rho, &t1);

/* Compute CRH(rho, t1) and write secret key */
/* The hash is taken over the packed public key bytes */
crh(tr, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES);
PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2);

return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature
*
* Description: Computes signature. Candidate signatures are generated in a
* loop (label rej) and rejected until all norm and hint-weight
* checks pass.
*
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES);
* also used as scratch space while signing
* - size_t *siglen: pointer to output length of signature
* - const uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig,
size_t *siglen,
const uint8_t *m,
size_t mlen,
const uint8_t *sk) {
unsigned int n;
/* Layout: rho (SEEDBYTES) | tr (CRHBYTES) | key (SEEDBYTES) |
 * mu (CRHBYTES) | rhoprime (CRHBYTES) */
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint16_t nonce = 0;
polyvecl mat[K], s1, y, z;
polyveck t0, s2, w1, w0, h;
poly cp;
shake256incctx state;

rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);

/* Compute CRH(tr, msg) */
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* key and mu are adjacent in seedbuf, so this hashes key followed by mu */
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t0);

/* Every rejection jumps back here; nonce is incremented on each attempt */
rej:
/* Sample intermediate vector y */
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++);

/* Matrix-vector multiplication */
/* z is a scratch copy of y here; y itself is kept for the addition below */
z = y;
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1);

/* Decompose w and call the random oracle */
/* w1 is decomposed in place (high part back into w1, low part into w0);
 * the packed w1 is staged in the output buffer sig */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(sig, &w1);

/* Hash mu and packed w1; the SEEDBYTES-byte result overwrites the start of sig */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(sig, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, sig);
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp);

/* Compute z, reject if it reveals secret */
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(&z);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(&z, &z, &y);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(&z);
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
goto rej;
}

/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w0);
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
goto rej;
}

/* Compute hints for w1 */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&h);
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&h, GAMMA2)) {
goto rej;
}

/* Reject when the hint vector has more than OMEGA one-bits */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &h);
n = PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(&h, &w0, &w1);
if (n > OMEGA) {
goto rej;
}

/* Write signature */
/* The challenge seed is read from the start of sig, where it was squeezed above */
PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, sig, &z, &h);
*siglen = PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;
return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_crypto_sign
*
* Description: Compute signed message: the signature occupies the first
*              PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES bytes of sm and the
*              message follows it.
*
* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
*                             array with PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen bytes),
*                             can be equal to m
*              - size_t *smlen: pointer to output length of signed
*                               message
*              - const uint8_t *m: pointer to message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm,
        size_t *smlen,
        const uint8_t *m,
        size_t mlen,
        const uint8_t *sk) {
    uint8_t *msg_out = sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;
    size_t remaining = mlen;

    /* Copy from the last byte down to the first so that sm == m works. */
    while (remaining > 0) {
        --remaining;
        msg_out[remaining] = m[remaining];
    }

    PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, msg_out, mlen, sk);
    *smlen += mlen;
    return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify
*
* Description: Verifies signature.
*
* Arguments: - const uint8_t *sig: pointer to input signature
* - size_t siglen: length of signature
* - const uint8_t *m: pointer to message
* - size_t mlen: length of message
* - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signature could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig,
size_t siglen,
const uint8_t *m,
size_t mlen,
const uint8_t *pk) {
unsigned int i;
uint8_t buf[K * POLYW1_PACKEDBYTES];
uint8_t rho[SEEDBYTES];
uint8_t mu[CRHBYTES];
/* c is the challenge seed taken from the signature, c2 the recomputed one */
uint8_t c[SEEDBYTES];
uint8_t c2[SEEDBYTES];
poly cp;
polyvecl mat[K], z;
polyveck t1, w1, h;
shake256incctx state;

/* Only signatures of exactly the fixed length are accepted */
if (siglen != PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) {
return -1;
}

/* Reject if unpacking reports an error or z fails the norm check */
PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(rho, &t1, pk);
if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(c, &z, &h, sig)) {
return -1;
}
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
return -1;
}

/* Compute CRH(CRH(rho, t1), msg) */
/* mu first holds the hash of the packed public key, then is overwritten
 * with the hash of that value followed by the message */
crh(mu, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES);
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* Matrix-vector multiplication; compute Az - c2^dt1 */
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, c);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho);

PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);

PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(&t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);

PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w1, &w1, &t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1);

/* Reconstruct w1 */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &w1, &h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(buf, &w1);

/* Call random oracle and verify challenge */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(c2, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
/* Accept only if the recomputed seed matches the one in the signature;
 * the comparison exits at the first differing byte */
for (i = 0; i < SEEDBYTES; ++i) {
if (c[i] != c2[i]) {
return -1;
}
}

return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open
*
* Description: Verify signed message. On success the message part of sm is
*              copied to m; on failure *mlen is set to (size_t) -1 and the
*              first smlen bytes of m are zeroed.
*
* Arguments:   - uint8_t *m: pointer to output message (allocated
*                            array with smlen bytes), can be equal to sm
*              - size_t *mlen: pointer to output length of message
*              - const uint8_t *sm: pointer to signed message
*              - size_t smlen: length of signed message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m,
        size_t *mlen,
        const uint8_t *sm,
        size_t smlen,
        const uint8_t *pk) {
    size_t pos;
    int verified = 0;

    /* A signed message must at least contain a full signature. */
    if (smlen >= PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) {
        *mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;
        verified = (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk) == 0);
    }

    if (verified) {
        /* All good, copy msg, return 0 */
        for (pos = 0; pos < *mlen; ++pos) {
            m[pos] = sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + pos];
        }
        return 0;
    }

    /* Signature verification failed */
    *mlen = (size_t) -1;
    for (pos = 0; pos < smlen; ++pos) {
        m[pos] = 0;
    }

    return -1;
}

+ 29
- 0
crypto_sign/dilithium/dilithium2/clean/sign.h 查看文件

@@ -0,0 +1,29 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SIGN_H
#define PQCLEAN_DILITHIUM2_CLEAN_SIGN_H
/* Key generation, signing and verification entry points. */
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* NOTE(review): sign.c neither defines nor calls this function (it uses
 * PQCLEAN_DILITHIUM2_CLEAN_poly_challenge instead) - confirm whether this
 * prototype is a stale leftover. */
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Generates a key pair into pk and sk; returns 0. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Writes a detached signature of m to sig and its length to *siglen; returns 0. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Writes signature followed by message to sm (sm may equal m); returns 0. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Returns 0 if sig is a valid signature of m under pk, -1 otherwise. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

/* Verifies sm and, on success, copies the message to m; returns 0 or -1. */
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

#endif

+ 26
- 0
crypto_sign/dilithium/dilithium2/clean/symmetric-shake.c 查看文件

@@ -0,0 +1,26 @@
#include "fips202.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    /* Initialise a SHAKE128 stream over seed followed by the 16-bit nonce,
     * with the nonce serialised low byte first. */
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, 2);
    shake128_inc_finalize(state);
}

void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    /* Initialise a SHAKE256 stream over the CRHBYTES-byte seed followed by
     * the 16-bit nonce, with the nonce serialised low byte first. */
    uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, 2);
    shake256_inc_finalize(state);
}

+ 36
- 0
crypto_sign/dilithium/dilithium2/clean/symmetric.h 查看文件

@@ -0,0 +1,36 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H
/* Maps the generic crh / stream128 / stream256 names onto SHAKE128 and
 * SHAKE256 from fips202.h. */
#include "fips202.h"
#include "params.h"
#include <stdint.h>



/* Stream states are the incremental SHAKE contexts. */
typedef shake128incctx stream128_state;
typedef shake256incctx stream256_state;

/* Absorbs seed (SEEDBYTES) followed by the 2-byte nonce and finalizes. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);

/* Absorbs seed (CRHBYTES) followed by the 2-byte nonce and finalizes. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state,
const uint8_t seed[CRHBYTES],
uint16_t nonce);

/* One stream block is one SHAKE rate block. */
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

/* crh: SHAKE256 of IN (INBYTES bytes) with a CRHBYTES-byte output. */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
#define stream128_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(STATE, SEED, NONCE)
/* Squeezes OUTBLOCKS whole rate blocks, i.e. OUTBLOCKS * SHAKE128_RATE bytes. */
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE)
#define stream128_release(STATE) shake128_inc_ctx_release(STATE)
#define stream256_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(STATE, SEED, NONCE)
/* Squeezes OUTBLOCKS whole rate blocks, i.e. OUTBLOCKS * SHAKE256_RATE bytes. */
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE)
#define stream256_release(STATE) shake256_inc_ctx_release(STATE)


#endif

+ 31
- 0
crypto_sign/dilithium/dilithium3/META.yml 查看文件

@@ -0,0 +1,31 @@
name: Dilithium3
type: signature
claimed-nist-level: 3
length-public-key: 1952
length-secret-key: 4016
length-signature: 3293
nistkat-sha256: d0d4bb6945e14206d17b52f8a395d5a750ec8a73f2ea06b9f1cd226d225a9bfb
testvectors-sha256: 531b85dbecaeaf135ad9004c8e2d5ce163b8e72d9c3a537e15bd383cf5f38aa4
principal-submitters:
- Vadim Lyubashevsky
auxiliary-submitters:
- Léo Ducas
- Eike Kiltz
- Tancrède Lepoint
- Peter Schwabe
- Gregor Seiler
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
- name: avx2
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- aes
- avx2
- popcnt

+ 5
- 0
crypto_sign/dilithium/dilithium3/avx2/LICENSE 查看文件

@@ -0,0 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 19
- 0
crypto_sign/dilithium/dilithium3/avx2/align.h 查看文件

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_ALIGN_H
#define PQCLEAN_DILITHIUM3_AVX2_ALIGN_H
/* Union types that overlay a scalar array with an array of __m256i, so the
 * scalar view (coeffs) and the vector view (vec) share the same storage. */

#include <immintrin.h>
#include <stdint.h>

/* N bytes overlaid with ceil(N / 32) 256-bit vectors. */
#define ALIGNED_UINT8(N) \
union { \
uint8_t coeffs[N]; \
__m256i vec[((N)+31)/32]; \
}

/* N 32-bit integers overlaid with ceil(N / 8) 256-bit vectors. */
#define ALIGNED_INT32(N) \
union { \
int32_t coeffs[N]; \
__m256i vec[((N)+7)/8]; \
}

#endif

+ 32
- 0
crypto_sign/dilithium/dilithium3/avx2/api.h 查看文件

@@ -0,0 +1,32 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_API_H
#define PQCLEAN_DILITHIUM3_AVX2_API_H
/* Public API of the Dilithium3 AVX2 implementation: size constants and the
 * five crypto_sign entry points. */

#include <stddef.h>
#include <stdint.h>

/* Sizes in bytes of the public key, the secret key and a signature. */
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES 1952
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES 4016
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES 3293

#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3"


/* Key pair generation into pk and sk. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature: sig / siglen are outputs, m / mlen / sk are inputs. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Detached verification of sig over m under pk. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

/* Combined form: sm / smlen are outputs holding the signed message. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Combined form: m / mlen are outputs recovered from the signed message sm. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif

+ 24
- 0
crypto_sign/dilithium/dilithium3/avx2/cdecl.h 查看文件

@@ -0,0 +1,24 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL_H
#define PQCLEAN_DILITHIUM3_AVX2_CDECL_H



/* Offsets, counted in 32-bit entries, of the sections of the qdata table;
 * they match the section markers written as comments in consts.c. */
#define _8XQ 0
#define _8XQINV 8
#define _8XDIV_QINV 16
#define _8XDIV 24
#define _ZETAS_QINV 32
#define _ZETAS 328

/* The C ABI on MacOS exports all symbols with a leading
* underscore. This means that any symbols we refer to from
* C files (functions) can't be found, and all symbols we
* refer to from ASM also can't be found (nttconsts.c).
*
* This define helps us get around this
*/

/* _cdecl(s) yields the underscore-prefixed spelling of s, cdecl(s) the plain
 * one; f1600x4.S uses both for its .global directives and entry labels. */
#define _cdecl(s) _##s
#define cdecl(s) s

#endif

+ 101
- 0
crypto_sign/dilithium/dilithium3/avx2/consts.c 查看文件

@@ -0,0 +1,101 @@
#include "consts.h"
#include "params.h"
#include <stdint.h>

/* Scalar constants that are replicated eight times each in the first
 * sections of PQCLEAN_DILITHIUM3_AVX2_qdata below. MONT is defined here but
 * not referenced in the table. */
#define QINV 58728449 // q^(-1) mod 2^32
#define MONT (-4186625) // 2^32 mod q
#define DIV 41978 // mont^2/256
/* NOTE(review): presumably DIV * QINV reduced to a signed 32-bit value,
 * as the name suggests - confirm. */
#define DIV_QINV (-8395782)

const qdata_t PQCLEAN_DILITHIUM3_AVX2_qdata = {{
//#define _8XQ 0
Q, Q, Q, Q, Q, Q, Q, Q,

//#define _8XQINV 8
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

//#define _8XDIV_QINV 16
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV,

//#define _8XDIV 24
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV,

//#define _ZETAS_QINV 32
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244,
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077,
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561,
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417,
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735,
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904,
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771,
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600,
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139,
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433,
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547,
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852,
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995,
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424,
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315,
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951,
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031,
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878,
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606,
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568,
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583,
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093,
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172,
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187,
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462,
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722,
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279,
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342,
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272,
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682,
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363,
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473,
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426,
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762,
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494,
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853,
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238,

//#define _ZETAS 328
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451,
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905,
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855,
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103,
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928,
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549,
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672,
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005,
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928,
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771,
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969,
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969,
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922,
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430,
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856,
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961,
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995,
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100,
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149,
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738,
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098,
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455,
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634,
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424,
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622,
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115,
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233,
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154,
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838,
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642,
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107,
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782,
}
};

+ 10
- 0
crypto_sign/dilithium/dilithium3/avx2/consts.h 查看文件

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_CONSTS_H
#define PQCLEAN_DILITHIUM3_AVX2_CONSTS_H
#include "align.h"
#include "cdecl.h"


/* Container type for the shared constant table: 624 32-bit integers wrapped by
 * the ALIGNED_INT32 macro (presumably provided by align.h -- confirm there
 * which alignment it enforces). */
typedef ALIGNED_INT32(624) qdata_t;
/* The one shared instance of the table. Only declared here; the definition is
 * expected in the matching consts.c. */
extern const qdata_t PQCLEAN_DILITHIUM3_AVX2_qdata;

#endif

+ 909
- 0
crypto_sign/dilithium/dilithium3/avx2/f1600x4.S 查看文件

@@ -0,0 +1,909 @@
/* Taken from Bas Westerbaan's new 4-way SHAKE implementation
* for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/),
* but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */

#include "cdecl.h"

.data
/* 32-byte alignment: rho8 is fetched with an aligned vmovdqa. */
.p2align 5
/* vpshufb index table. Inside every 64-bit lane, output byte i is taken from
 * input byte (i+7) mod 8, which rotates the lane left by 8 bits. */
rho8:
.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14
/* vpshufb index table. Inside every 64-bit lane, output byte i is taken from
 * input byte (i+1) mod 8, which rotates the lane left by 56 bits. */
rho56:
.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8

.text
/*
 * PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, rc)
 *
 * Permutes four Keccak states in place. The state is addressed through rdi as
 * 25 consecutive 32-byte vectors (offsets 0..768), each vector holding the same
 * lane of the four states. rsi points at the 64-bit round constants.
 *
 * The loop body below is unrolled four rounds deep: it consumes the constants
 * at 0, 8, 16 and 24(%rsi) and then advances rsi by 32 bytes, so the six
 * iterations counted in rax apply 24 rounds in total.
 *
 * The state vectors are accessed with vmovdqa, so s must be 32-byte aligned.
 * All of ymm0-ymm15, rax and rsi are overwritten.
 *
 * The symbol is exported under both the cdecl() and _cdecl() spellings from
 * cdecl.h (presumably the plain and underscore-prefixed names -- confirm).
 */
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4)
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4)
cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4):
_cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4):
/* ymm0 keeps the rotate-by-8 shuffle mask for the whole function. */
vmovdqa rho8(%rip), %ymm0
/* Iteration counter: 6 passes over the four-round loop body. */
movq $6, %rax
looptop:
/* Theta, part 1: column parities. ymm8..ymm12 accumulate the XOR of the five
 * state vectors of each column (vector offsets k, k+160, k+320, k+480, k+640
 * for k = 0, 32, 64, 96, 128). */
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
/* Theta, part 2: rotate every parity left by one bit (shift left 1, shift
 * right 63, OR). Results land in ymm5, ymm4, ymm3, ymm2, ymm1 for the parities
 * held in ymm9, ymm10, ymm11, ymm12, ymm8 respectively. */
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
/* Theta, part 3: combine each rotated parity with the parity two columns
 * before it. Afterwards ymm5, ymm4, ymm3, ymm2, ymm1 are the correction words
 * that get XORed into columns 0, 1, 2, 3, 4 by the code that follows. */
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
/* Load five state vectors (offsets 0, 192, 384, 576, 768) and fold in the theta
 * correction words held in ymm5, ymm4, ymm3, ymm2, ymm1. */
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 192(%rdi), %ymm4, %ymm9
vpxor 384(%rdi), %ymm3, %ymm10
vpxor 576(%rdi), %ymm2, %ymm11
vpxor 768(%rdi), %ymm1, %ymm12
/* Rho: rotate ymm9, ymm10, ymm11, ymm12 left by 44, 43, 21 and 14 bits
 * (shift left by n, shift right by 64-n, OR). ymm8 is not rotated. */
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
/* Chi: out[j] = in[j] ^ (~in[j+1] & in[j+2]) over the cyclic sequence
 * ymm8, ymm9, ymm10, ymm11, ymm12; outputs go to ymm13, ymm14, ymm15, ymm7,
 * ymm6. */
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
/* Iota: XOR this round's constant (first of the four used per loop pass),
 * broadcast to all four states, into the first output word. */
vpbroadcastq 0(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
/* Write the results back over the same five slots that were read. */
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 96(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 320(%rdi), %ymm5, %ymm10
vpxor 512(%rdi), %ymm4, %ymm11
vpxor 704(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 32(%rdi), %ymm4, %ymm8
vpxor 224(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 608(%rdi), %ymm1, %ymm11
vpxor 640(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 128(%rdi), %ymm1, %ymm8
vpxor 160(%rdi), %ymm5, %ymm9
vpxor 352(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 736(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 64(%rdi), %ymm3, %ymm8
vpxor 256(%rdi), %ymm2, %ymm9
vpxor 448(%rdi), %ymm1, %ymm10
vpxor 480(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 448(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 512(%rdi), %ymm4, %ymm9
vpxor 224(%rdi), %ymm3, %ymm10
vpxor 736(%rdi), %ymm2, %ymm11
vpxor 448(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 8(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 576(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 640(%rdi), %ymm5, %ymm10
vpxor 352(%rdi), %ymm4, %ymm11
vpxor 64(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 192(%rdi), %ymm4, %ymm8
vpxor 704(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 128(%rdi), %ymm1, %ymm11
vpxor 480(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 768(%rdi), %ymm1, %ymm8
vpxor 320(%rdi), %ymm5, %ymm9
vpxor 32(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 256(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 384(%rdi), %ymm3, %ymm8
vpxor 96(%rdi), %ymm2, %ymm9
vpxor 608(%rdi), %ymm1, %ymm10
vpxor 160(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 608(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 352(%rdi), %ymm4, %ymm9
vpxor 704(%rdi), %ymm3, %ymm10
vpxor 256(%rdi), %ymm2, %ymm11
vpxor 608(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 16(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 736(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 480(%rdi), %ymm5, %ymm10
vpxor 32(%rdi), %ymm4, %ymm11
vpxor 384(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 512(%rdi), %ymm4, %ymm8
vpxor 64(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 768(%rdi), %ymm1, %ymm11
vpxor 160(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 448(%rdi), %ymm1, %ymm8
vpxor 640(%rdi), %ymm5, %ymm9
vpxor 192(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 96(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 224(%rdi), %ymm3, %ymm8
vpxor 576(%rdi), %ymm2, %ymm9
vpxor 128(%rdi), %ymm1, %ymm10
vpxor 320(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 128(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 32(%rdi), %ymm4, %ymm9
vpxor 64(%rdi), %ymm3, %ymm10
vpxor 96(%rdi), %ymm2, %ymm11
vpxor 128(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 24(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 256(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 160(%rdi), %ymm5, %ymm10
vpxor 192(%rdi), %ymm4, %ymm11
vpxor 224(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 352(%rdi), %ymm4, %ymm8
vpxor 384(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 448(%rdi), %ymm1, %ymm11
vpxor 320(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 608(%rdi), %ymm1, %ymm8
vpxor 480(%rdi), %ymm5, %ymm9
vpxor 512(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 576(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 704(%rdi), %ymm3, %ymm8
vpxor 736(%rdi), %ymm2, %ymm9
vpxor 768(%rdi), %ymm1, %ymm10
vpxor 640(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 768(%rdi)
/* Four round constants (4 x 8 bytes) were consumed by this pass; step past them. */
addq $32, %rsi
/* One pass done; keep looping until the counter in rax reaches zero. */
subq $1, %rax
jnz looptop
ret

+ 219
- 0
crypto_sign/dilithium/dilithium3/avx2/fips202x4.c 查看文件

@@ -0,0 +1,219 @@
#include "fips202.h"
#include "fips202x4.h"
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Number of rounds in one Keccak-f[1600] permutation. */
#define NROUNDS 24

/* Keccak-f[1600] round constants, indexed by round number. */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
    /* rounds  0 ..  3 */
    UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
    UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000),
    /* rounds  4 ..  7 */
    UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
    /* rounds  8 .. 11 */
    UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088),
    UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a),
    /* rounds 12 .. 15 */
    UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b),
    UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
    /* rounds 16 .. 19 */
    UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
    UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a),
    /* rounds 20 .. 23 */
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
    UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};

/*
 * Absorb four messages of identical length into four interleaved Keccak
 * states and apply the final padding.
 *
 * s      state to initialise; 64-bit element j of s[k] is lane k of state j.
 *        The state is cleared first, so any previous content is discarded.
 * r      rate in bytes; the code assumes it is a multiple of 8.
 * in0..3 the four input messages, each inlen bytes long.
 * inlen  length in bytes of every message.
 * p      padding byte, XORed in directly after the last message byte.
 *
 * Exactly inlen bytes are read from each input. The trailing partial word is
 * assembled with memcpy instead of a full 8-byte gather followed by a mask,
 * so nothing is read beyond the end of the caller's buffers.
 */
static void keccakx4_absorb_once(__m256i s[25],
                                 unsigned int r,
                                 const uint8_t *in0,
                                 const uint8_t *in1,
                                 const uint8_t *in2,
                                 const uint8_t *in3,
                                 size_t inlen,
                                 uint8_t p) {
    size_t i;
    uint64_t pos = 0;
    __m256i t, idx;

    for (i = 0; i < 25; ++i) {
        s[i] = _mm256_setzero_si256();
    }

    /* The gather index vector holds the four absolute input addresses; the
     * running byte offset pos is supplied as the gather "base" with scale 1. */
    idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);

    /* Full blocks: XOR r bytes from every input into the state, then permute. */
    while (inlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
            s[i] = _mm256_xor_si256(s[i], t);
            pos += 8;
        }
        inlen -= r;

        PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants);
    }

    /* Whole 64-bit words of the final, partial block. */
    for (i = 0; i < inlen / 8; ++i) {
        t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
        s[i] = _mm256_xor_si256(s[i], t);
        pos += 8;
    }
    inlen -= 8 * i;

    /* Remaining 1..7 bytes: copy only what exists into zeroed words. On x86
     * (little-endian) this yields the same value as a masked 8-byte load. */
    if (inlen) {
        uint64_t tail[4] = {0, 0, 0, 0};

        memcpy(&tail[0], in0 + pos, inlen);
        memcpy(&tail[1], in1 + pos, inlen);
        memcpy(&tail[2], in2 + pos, inlen);
        memcpy(&tail[3], in3 + pos, inlen);
        t = _mm256_set_epi64x((long long)tail[3], (long long)tail[2],
                              (long long)tail[1], (long long)tail[0]);
        s[i] = _mm256_xor_si256(s[i], t);
    }

    /* Padding: byte p right after the message, and the top bit of the last
     * word of the rate. */
    t = _mm256_set1_epi64x((long long)((uint64_t)p << 8 * inlen));
    s[i] = _mm256_xor_si256(s[i], t);
    t = _mm256_set1_epi64x((long long)(1ULL << 63));
    s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t);
}

/*
 * Squeeze nblocks blocks of r bytes from each of the four interleaved states.
 *
 * Before every block the state is permuted once. 64-bit element j of each
 * state vector is then written to output j, so out0..out3 each receive
 * nblocks * r bytes. The state s is updated in place.
 */
static void keccakx4_squeezeblocks(uint8_t *out0,
                                   uint8_t *out1,
                                   uint8_t *out2,
                                   uint8_t *out3,
                                   size_t nblocks,
                                   unsigned int r,
                                   __m256i s[25]) {
    const unsigned int words_per_block = r / 8;
    unsigned int w;
    __m128d pair;

    for (; nblocks > 0; --nblocks) {
        PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants);

        for (w = 0; w < words_per_block; ++w) {
            /* Lower 128 bits carry the words of states 0 and 1. */
            pair = _mm_castsi128_pd(_mm256_castsi256_si128(s[w]));
            _mm_storel_pd((double *)(out0 + 8 * w), pair);
            _mm_storeh_pd((double *)(out1 + 8 * w), pair);

            /* Upper 128 bits carry the words of states 2 and 3. */
            pair = _mm_castsi128_pd(_mm256_extracti128_si256(s[w], 1));
            _mm_storel_pd((double *)(out2 + 8 * w), pair);
            _mm_storeh_pd((double *)(out3 + 8 * w), pair);
        }

        out0 += r;
        out1 += r;
        out2 += r;
        out3 += r;
    }
}

/*
 * Initialise *state and absorb four inlen-byte messages in parallel.
 * Forwards to keccakx4_absorb_once with rate SHAKE128_RATE and padding
 * byte 0x1F. Any previous content of *state is discarded.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

/*
 * Squeeze nblocks blocks of SHAKE128_RATE bytes into each of out0..out3.
 * Forwards to keccakx4_squeezeblocks; *state is advanced, so repeated calls
 * continue the output stream.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s);
}

/*
 * Initialise *state and absorb four inlen-byte messages in parallel.
 * Forwards to keccakx4_absorb_once with rate SHAKE256_RATE and padding
 * byte 0x1F. Any previous content of *state is discarded.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

/*
 * Squeeze nblocks blocks of SHAKE256_RATE bytes into each of out0..out3.
 * Forwards to keccakx4_squeezeblocks; *state is advanced, so repeated calls
 * continue the output stream.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s);
}

/*
 * One-shot four-way SHAKE128: hash four inlen-byte inputs in parallel and
 * write outlen bytes of output to each of out0..out3.
 *
 * Whole rate-sized blocks are squeezed straight into the outputs; a trailing
 * partial block is squeezed into a scratch buffer and only the bytes still
 * needed are copied out.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4(uint8_t *out0,
                                        uint8_t *out1,
                                        uint8_t *out2,
                                        uint8_t *out3,
                                        size_t outlen,
                                        const uint8_t *in0,
                                        const uint8_t *in1,
                                        const uint8_t *in2,
                                        const uint8_t *in3,
                                        size_t inlen) {
    keccakx4_state state;
    uint8_t scratch[4][SHAKE128_RATE];
    const size_t whole_blocks = outlen / SHAKE128_RATE;
    const size_t whole_bytes = whole_blocks * SHAKE128_RATE;
    const size_t leftover = outlen - whole_bytes;

    PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen);
    PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, whole_blocks, &state);

    if (leftover == 0) {
        return;
    }

    /* Produce one more block and hand out just its leading bytes. */
    PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(scratch[0], scratch[1], scratch[2], scratch[3], 1, &state);
    memcpy(out0 + whole_bytes, scratch[0], leftover);
    memcpy(out1 + whole_bytes, scratch[1], leftover);
    memcpy(out2 + whole_bytes, scratch[2], leftover);
    memcpy(out3 + whole_bytes, scratch[3], leftover);
}

/*
 * One-shot four-way SHAKE256: hash four inlen-byte inputs in parallel and
 * write outlen bytes of output to each of out0..out3.
 *
 * Whole rate-sized blocks are squeezed straight into the outputs; a trailing
 * partial block is squeezed into a scratch buffer and only the bytes still
 * needed are copied out.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4(uint8_t *out0,
                                        uint8_t *out1,
                                        uint8_t *out2,
                                        uint8_t *out3,
                                        size_t outlen,
                                        const uint8_t *in0,
                                        const uint8_t *in1,
                                        const uint8_t *in2,
                                        const uint8_t *in3,
                                        size_t inlen) {
    keccakx4_state state;
    uint8_t scratch[4][SHAKE256_RATE];
    const size_t whole_blocks = outlen / SHAKE256_RATE;
    const size_t whole_bytes = whole_blocks * SHAKE256_RATE;
    const size_t leftover = outlen - whole_bytes;

    PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen);
    PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, whole_blocks, &state);

    if (leftover == 0) {
        return;
    }

    /* Produce one more block and hand out just its leading bytes. */
    PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(scratch[0], scratch[1], scratch[2], scratch[3], 1, &state);
    memcpy(out0 + whole_bytes, scratch[0], leftover);
    memcpy(out1 + whole_bytes, scratch[1], leftover);
    memcpy(out2 + whole_bytes, scratch[2], leftover);
    memcpy(out3 + whole_bytes, scratch[3], leftover);
}

+ 64
- 0
crypto_sign/dilithium/dilithium3/avx2/fips202x4.h 查看文件

@@ -0,0 +1,64 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H
#define PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

typedef struct {
__m256i s[25];
} keccakx4_state;

void PQCLEAN_DILITHIUM3_AVX2_f1600x4(__m256i *s, const uint64_t *rc);

void PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

void PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state);

void PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

void PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state);

/*
 * One-shot four-lane SHAKE128: reads inlen bytes from each of in0..in3 and
 * writes outlen bytes to each of out0..out3.
 * NOTE(review): presumably handles an outlen that is not a multiple of the
 * rate the same way shake256x4() does (via a scratch block) -- confirm.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

/*
 * One-shot four-lane SHAKE256: reads inlen bytes from each of in0..in3 and
 * writes exactly outlen bytes to each of out0..out3. outlen need not be a
 * multiple of SHAKE256_RATE; the final partial block goes through an
 * internal scratch buffer.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

#endif

+ 240
- 0
crypto_sign/dilithium/dilithium3/avx2/invntt.S 查看文件

@@ -0,0 +1,240 @@
#include "cdecl.h"
.include "shuffle.inc"

/*
 * butterfly l,h[,zl0,zl1,zh0,zh1]: inverse-transform butterfly on the eight
 * 32-bit elements of ymm<l> and ymm<h>.
 *   ymm<l> <- l + h
 *   ymm<h> <- (h - l) multiplied by the twiddle and reduced
 * zl0/zl1 name the registers holding values loaded from the _ZETAS_QINV
 * table for the even/odd elements, zh0/zh1 those loaded from _ZETAS
 * (defaults: ymm1 and ymm2 for both parities). ymm0 must hold the _8XQ
 * constant. Clobbers ymm12, ymm13 and ymm14.
 * NOTE(review): the multiply / multiply-by-q / subtract / keep-high-half
 * sequence looks like a Montgomery multiplication with a precomputed
 * zeta*q^-1; the tables are in consts.c, not visible here -- confirm.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* ymm12 = h - l (value to be multiplied); l = l + h */
vpsubd %ymm\l,%ymm\h,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* vpmuldq only multiplies the even 32-bit elements, so the odd elements are
   first copied down with vmovshdup and handled as a second stream */
vpmuldq %ymm\zl0,%ymm12,%ymm13
vmovshdup %ymm12,%ymm\h
vpmuldq %ymm\zl1,%ymm\h,%ymm14

/* full 64-bit products with the values from _ZETAS */
vpmuldq %ymm\zh0,%ymm12,%ymm12
vpmuldq %ymm\zh1,%ymm\h,%ymm\h

/* low 32 bits of the _ZETAS_QINV products times the ymm0 constant */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* only the high 32 bits of each 64-bit difference are used below */
vpsubd %ymm13,%ymm12,%ymm12
vpsubd %ymm14,%ymm\h,%ymm\h

/* re-interleave: even elements from ymm12 (high halves moved down),
   odd elements from ymm<h> (high halves already in odd positions) */
vmovshdup %ymm12,%ymm12
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h
.endm

/*
 * levels0t5 off: runs the stages labelled level 0 to level 5 on one
 * 256-byte chunk (eight 32-byte vectors) located at 256*off(%rdi), and
 * writes the chunk back in place.
 * Twiddles are read relative to %rsi from the _ZETAS_QINV and _ZETAS tables.
 * Uses ymm1-ymm15 (ymm12-ymm14 through the butterfly macro); ymm0 must
 * already hold _8XQ.
 * shuffle2/shuffle4/shuffle8 come from shuffle.inc (not visible here);
 * NOTE(review): they presumably regroup elements between vectors so the
 * following level can again pair whole registers -- confirm.
 */
.macro levels0t5 off
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

/* level 0 */
/* vpermq $0x1B reverses the four 64-bit words of the loaded table row; with
   the vmovshdup copies feeding the even elements, the eight 32-bit twiddles
   end up applied in reverse element order. Same pattern for levels 1 and 2. */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,5,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 6,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,9,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 10,11,1,3,2,15

/* level 1 */
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,6,1,3,2,15
butterfly 5,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,10,1,3,2,15
butterfly 9,11,1,3,2,15

/* level 2 */
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,8,1,3,2,15
butterfly 5,9,1,3,2,15
butterfly 6,10,1,3,2,15
butterfly 7,11,1,3,2,15

/* level 3 */
shuffle2 4,5,3,5
shuffle2 6,7,4,7
shuffle2 8,9,6,9
shuffle2 10,11,8,11

/* from here on one table row in ymm1/ymm2 serves all four butterflies */
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2
butterfly 3,5
butterfly 4,7
butterfly 6,9
butterfly 8,11

/* level 4 */
shuffle4 3,4,10,4
shuffle4 6,8,3,8
shuffle4 5,7,6,7
shuffle4 9,11,5,11

vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2
butterfly 10,4
butterfly 3,8
butterfly 6,7
butterfly 5,11

/* level 5 */
shuffle8 10,3,9,3
shuffle8 6,5,10,5
shuffle8 4,8,6,8
shuffle8 7,11,4,11

/* a single twiddle, broadcast to all eight elements */
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2
butterfly 9,3
butterfly 10,5
butterfly 6,8
butterfly 4,11

/* write back; the register order reflects the renaming done by the shuffles */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm10,256*\off+ 32(%rdi)
vmovdqa %ymm6,256*\off+ 64(%rdi)
vmovdqa %ymm4,256*\off+ 96(%rdi)
vmovdqa %ymm3,256*\off+128(%rdi)
vmovdqa %ymm5,256*\off+160(%rdi)
vmovdqa %ymm8,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

/*
 * levels6t7 off: runs the stages labelled level 6 and level 7 on eight
 * vectors taken at a 128-byte stride starting at 32*off(%rdi), in place.
 * After level 7 the four upper vectors (ymm8-ymm11) are stored directly,
 * while the four lower vectors (ymm4-ymm7) are additionally multiplied by
 * the _8XDIV / _8XDIV_QINV constants, using the same multiply-and-reduce
 * sequence as the butterfly macro, before being stored.
 * ymm0 must hold _8XQ; uses ymm1, ymm2 and ymm4-ymm15.
 * NOTE(review): presumably the final scaling of the upper half is folded
 * into the level-7 twiddle, which is why only the lower half is multiplied
 * here -- confirm against consts.c.
 */
.macro levels6t7 off
vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* level 6 */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

/* level 7 */
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* upper half is finished; storing it frees ymm8/ymm9 as scratch below */
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)

/* multiply ymm4 and ymm5 by the _8XDIV constant and reduce
   (even elements directly, odd elements via the vmovshdup copies) */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1
vmovdqa (_8XDIV)*4(%rsi),%ymm2
vpmuldq %ymm1,%ymm4,%ymm12
vpmuldq %ymm1,%ymm5,%ymm13
vmovshdup %ymm4,%ymm8
vmovshdup %ymm5,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm4,%ymm4
vpmuldq %ymm2,%ymm5,%ymm5
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm4,%ymm4
vpsubd %ymm13,%ymm5,%ymm5
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm4,%ymm4
vmovshdup %ymm5,%ymm5
vpblendd $0xAA,%ymm8,%ymm4,%ymm4
vpblendd $0xAA,%ymm9,%ymm5,%ymm5

/* same again for ymm6 and ymm7 */
vpmuldq %ymm1,%ymm6,%ymm12
vpmuldq %ymm1,%ymm7,%ymm13
vmovshdup %ymm6,%ymm8
vmovshdup %ymm7,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm6,%ymm6
vpmuldq %ymm2,%ymm7,%ymm7
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm6,%ymm6
vpsubd %ymm13,%ymm7,%ymm7
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm6,%ymm6
vmovshdup %ymm7,%ymm7
vpblendd $0xAA,%ymm8,%ymm6,%ymm6
vpblendd $0xAA,%ymm9,%ymm7,%ymm7

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
.endm

/*
 * PQCLEAN_DILITHIUM3_AVX2_invntt_avx: in-place inverse transform of the
 * 1024 bytes at %rdi, using the constant table at %rsi.
 * ntt.h declares it as (__m256i *a, const __m256i *qdata); %rdi/%rsi are the
 * first two integer argument registers of the System V AMD64 convention.
 * All accesses use vmovdqa, so both pointers must be 32-byte aligned.
 * The symbol is exported under both cdecl() and _cdecl() spellings; those
 * macros come from cdecl.h (not visible here).
 */
.text
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx):
_cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx):
/* ymm0 stays loaded with _8XQ for every butterfly below */
vmovdqa _8XQ*4(%rsi),%ymm0

/* levels 0-5: four independent 256-byte chunks */
levels0t5 0
levels0t5 1
levels0t5 2
levels0t5 3

/* levels 6-7: four 32-byte columns, each spanning the whole array */
levels6t7 0
levels6t7 1
levels6t7 2
levels6t7 3

ret

+ 199
- 0
crypto_sign/dilithium/dilithium3/avx2/ntt.S 查看文件

@@ -0,0 +1,199 @@
#include "cdecl.h"
.include "shuffle.inc"

/*
 * butterfly l,h[,zl0,zl1,zh0,zh1]: forward-transform butterfly on the eight
 * 32-bit elements of ymm<l> and ymm<h>. With t = reduced product of h and
 * the twiddle:
 *   ymm<l> <- l + t
 *   ymm<h> <- l - t
 * zl0/zl1 name the registers holding values loaded from the _ZETAS_QINV
 * table for the even/odd elements, zh0/zh1 those loaded from _ZETAS
 * (defaults: ymm1 and ymm2 for both parities). ymm0 must hold the _8XQ
 * constant. Clobbers ymm12, ymm13 and ymm14.
 * NOTE(review): the sequence looks like a Montgomery multiplication with a
 * precomputed zeta*q^-1; the tables are in consts.c, not visible here --
 * confirm.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* vpmuldq only multiplies even 32-bit elements; ymm12 carries the odd ones */
vpmuldq %ymm\zl0,%ymm\h,%ymm13
vmovshdup %ymm\h,%ymm12
vpmuldq %ymm\zl1,%ymm12,%ymm14

/* full 64-bit products with the values from _ZETAS */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h
vpmuldq %ymm\zh1,%ymm12,%ymm12

/* low 32 bits of the _ZETAS_QINV products times the ymm0 constant */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* gather the high halves of the _ZETAS products into ymm<h> */
vmovshdup %ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h

vpsubd %ymm\h,%ymm\l,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* gather the high halves of the correction products into ymm13 */
vmovshdup %ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13

/* apply the correction with opposite signs to the two outputs */
vpaddd %ymm13,%ymm12,%ymm\h
vpsubd %ymm13,%ymm\l,%ymm\l
.endm

/*
 * levels0t1 off: runs the stages labelled level 0 and level 1 on eight
 * vectors taken at a 128-byte stride starting at 32*off(%rdi), in place.
 * Level 0 uses table index 1 for all four butterflies; level 1 uses index 2
 * for the lower four vectors and index 3 for the upper four.
 * ymm0 must hold _8XQ; uses ymm1, ymm2, ymm4-ymm11 and the butterfly
 * scratch registers ymm12-ymm14.
 */
.macro levels0t1 off
/* level 0 */
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2

vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* level 1 */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)
.endm

/*
 * levels2t7 off: runs the stages labelled level 2 to level 7 on one
 * 256-byte chunk (eight 32-byte vectors) located at 256*off(%rdi), and
 * writes the chunk back in place.
 * Level 2 uses one broadcast twiddle (table index 4+off); levels 3 and 4
 * load one full table row shared by all four butterflies; levels 5 to 7
 * use separate per-element twiddles for even and odd elements.
 * ymm0 must hold _8XQ; uses ymm1-ymm15.
 * shuffle8/shuffle4/shuffle2 come from shuffle.inc (not visible here);
 * NOTE(review): they presumably regroup elements between vectors so the
 * following level can again pair whole registers -- confirm.
 */
.macro levels2t7 off
/* level 2 */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

/* level 3 */
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2

butterfly 3,5
butterfly 8,10
butterfly 4,6
butterfly 9,11

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

/* level 4 */
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2

butterfly 7,8
butterfly 5,6
butterfly 3,4
butterfly 10,11

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

/* level 5 */
/* ymm10/ymm15 receive the odd 32-bit elements of the loaded rows in the even
   positions, so the odd-element products get their own twiddles; the same
   pattern repeats for every load in levels 6 and 7 */
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15

butterfly 9,5,1,10,2,15
butterfly 8,4,1,10,2,15
butterfly 7,3,1,10,2,15
butterfly 6,11,1,10,2,15

/* level 6 */
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,7,1,10,2,15
butterfly 8,6,1,10,2,15

vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,3,1,10,2,15
butterfly 4,11,1,10,2,15

/* level 7 */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,8,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 7,6,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,4,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 3,11,1,10,2,15

/* write back; the register order reflects the renaming done by the shuffles */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm8,256*\off+ 32(%rdi)
vmovdqa %ymm7,256*\off+ 64(%rdi)
vmovdqa %ymm6,256*\off+ 96(%rdi)
vmovdqa %ymm5,256*\off+128(%rdi)
vmovdqa %ymm4,256*\off+160(%rdi)
vmovdqa %ymm3,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

/*
 * PQCLEAN_DILITHIUM3_AVX2_ntt_avx: in-place forward transform of the
 * 1024 bytes at %rdi, using the constant table at %rsi.
 * ntt.h declares it as (__m256i *a, const __m256i *qdata); %rdi/%rsi are the
 * first two integer argument registers of the System V AMD64 convention.
 * All accesses use vmovdqa, so both pointers must be 32-byte aligned.
 * The symbol is exported under both cdecl() and _cdecl() spellings; those
 * macros come from cdecl.h (not visible here).
 */
.text
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx):
_cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx):
/* ymm0 stays loaded with _8XQ for every butterfly below */
vmovdqa _8XQ*4(%rsi),%ymm0

/* levels 0-1: four 32-byte columns, each spanning the whole array */
levels0t1 0
levels0t1 1
levels0t1 2
levels0t1 3

/* levels 2-7: four independent 256-byte chunks */
levels2t7 0
levels2t7 1
levels2t7 2
levels2t7 3

ret


+ 14
- 0
crypto_sign/dilithium/dilithium3/avx2/ntt.h 查看文件

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_NTT_H
#define PQCLEAN_DILITHIUM3_AVX2_NTT_H

#include <immintrin.h>

/*
 * Assembly routines (ntt.S / invntt.S). Each transforms the 1024 bytes at a
 * in place, reading its constants from the table passed as second argument.
 * Both pointers are accessed with aligned 32-byte loads/stores, so they must
 * be 32-byte aligned.
 */
void PQCLEAN_DILITHIUM3_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata);
void PQCLEAN_DILITHIUM3_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata);

/*
 * NOTE(review): implementation not visible here (the file list suggests
 * shuffle.S). Presumably reorders a polynomial in place into the element
 * order the AVX2 transform routines work with -- confirm.
 */
void PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx(__m256i *a);

/*
 * Assembly routines (pointwise.S); all pointers are accessed with aligned
 * 32-byte loads/stores.
 * pointwise_avx:     c[i] = reduced product a[i] * b[i] for the 256 32-bit
 *                    coefficients of one polynomial (1024 bytes each).
 * pointwise_acc_avx: a and b each point to five consecutive 1024-byte
 *                    polynomials; the five coefficient-wise products are
 *                    summed as 64-bit values and reduced once into c.
 */
void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata);
void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata);

#endif

+ 261
- 0
crypto_sign/dilithium/dilithium3/avx2/packing.c 查看文件

@@ -0,0 +1,261 @@
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"


/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_pack_pk
*
* Description: Serialize the public key: rho first, followed by the
*              K packed polynomials of t1.
*
* Arguments:   - uint8_t pk[]: output buffer of
*                PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes
*              - const uint8_t rho[]: SEEDBYTES-byte seed rho
*              - const polyveck *t1: vector t1 to pack
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES],
                                     const uint8_t rho[SEEDBYTES],
                                     const polyveck *t1) {
    uint8_t *dst = pk;
    unsigned int idx;

    /* rho occupies the first SEEDBYTES bytes */
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        *dst++ = rho[idx];
    }

    /* the t1 polynomials follow back to back */
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(dst, &t1->vec[idx]);
        dst += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_unpack_pk
*
* Description: Deserialize a public key pk = (rho, t1).
*
* Arguments:   - uint8_t rho[]: output buffer receiving the SEEDBYTES-byte rho
*              - polyveck *t1: output vector t1
*              - const uint8_t pk[]: packed public key of
*                PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(uint8_t rho[SEEDBYTES],
                                       polyveck *t1,
                                       const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]) {
    const uint8_t *src = pk;
    unsigned int idx;

    /* leading SEEDBYTES bytes are rho */
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        rho[idx] = *src++;
    }

    /* remaining bytes are the K packed t1 polynomials */
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&t1->vec[idx], src);
        src += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_pack_sk
*
* Description: Serialize the secret key. The byte layout written is
*              rho | key | tr | s1 | s2 | t0.
*
* Arguments:   - uint8_t sk[]: output buffer of
*                PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes
*              - const uint8_t rho[]: SEEDBYTES-byte rho
*              - const uint8_t tr[]: CRHBYTES-byte tr
*              - const uint8_t key[]: SEEDBYTES-byte key
*              - const polyveck *t0: vector t0 (K polynomials)
*              - const polyvecl *s1: vector s1 (L polynomials)
*              - const polyveck *s2: vector s2 (K polynomials)
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES],
                                     const uint8_t rho[SEEDBYTES],
                                     const uint8_t tr[CRHBYTES],
                                     const uint8_t key[SEEDBYTES],
                                     const polyveck *t0,
                                     const polyvecl *s1,
                                     const polyveck *s2) {
    uint8_t *dst = sk;
    unsigned int idx;

    /* byte strings: rho, key, tr */
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        *dst++ = rho[idx];
    }
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        *dst++ = key[idx];
    }
    for (idx = 0; idx < CRHBYTES; ++idx) {
        *dst++ = tr[idx];
    }

    /* s1 and s2 share the eta encoding */
    for (idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(dst, &s1->vec[idx]);
        dst += POLYETA_PACKEDBYTES;
    }
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(dst, &s2->vec[idx]);
        dst += POLYETA_PACKEDBYTES;
    }

    /* t0 comes last */
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(dst, &t0->vec[idx]);
        dst += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_unpack_sk
*
* Description: Deserialize a secret key. The byte layout read is
*              rho | key | tr | s1 | s2 | t0.
*
* Arguments:   - uint8_t rho[]: output buffer for rho (SEEDBYTES bytes)
*              - uint8_t tr[]: output buffer for tr (CRHBYTES bytes)
*              - uint8_t key[]: output buffer for key (SEEDBYTES bytes)
*              - polyveck *t0: output vector t0
*              - polyvecl *s1: output vector s1
*              - polyveck *s2: output vector s2
*              - const uint8_t sk[]: packed secret key of
*                PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
                                       uint8_t tr[CRHBYTES],
                                       uint8_t key[SEEDBYTES],
                                       polyveck *t0,
                                       polyvecl *s1,
                                       polyveck *s2,
                                       const uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES]) {
    const uint8_t *src = sk;
    unsigned int idx;

    /* byte strings: rho, key, tr */
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        rho[idx] = *src++;
    }
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        key[idx] = *src++;
    }
    for (idx = 0; idx < CRHBYTES; ++idx) {
        tr[idx] = *src++;
    }

    /* s1 and s2 share the eta encoding */
    for (idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s1->vec[idx], src);
        src += POLYETA_PACKEDBYTES;
    }
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s2->vec[idx], src);
        src += POLYETA_PACKEDBYTES;
    }

    /* t0 comes last */
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(&t0->vec[idx], src);
        src += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_pack_sig
*
* Description: Serialize a signature sig = (c, z, h).
*              The hint h is stored in OMEGA + K bytes: the first OMEGA
*              bytes list the indices of the nonzero coefficients,
*              polynomial after polynomial; byte OMEGA + i holds the
*              running total of indices after polynomial i.
*
* Arguments:   - uint8_t sig[]: output buffer of
*                PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES bytes
*              - const uint8_t *c: challenge hash, SEEDBYTES bytes
*              - const polyvecl *z: vector z
*              - const polyveck *h: hint vector h; no bound on the number
*                of nonzero coefficients is checked here
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES],
                                      const uint8_t c[SEEDBYTES],
                                      const polyvecl *z,
                                      const polyveck *h) {
    uint8_t *zbytes = sig + SEEDBYTES;
    uint8_t *hint = zbytes + L * POLYZ_PACKEDBYTES;
    unsigned int row, pos, used;

    for (pos = 0; pos < SEEDBYTES; ++pos) {
        sig[pos] = c[pos];
    }

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM3_AVX2_polyz_pack(zbytes + row * POLYZ_PACKEDBYTES, &z->vec[row]);
    }

    /* Encode h: clear the whole hint area first, unused slots stay zero */
    for (pos = 0; pos < OMEGA + K; ++pos) {
        hint[pos] = 0;
    }

    used = 0;
    for (row = 0; row < K; ++row) {
        for (pos = 0; pos < N; ++pos) {
            if (h->vec[row].coeffs[pos] != 0) {
                hint[used] = (uint8_t) pos;
                ++used;
            }
        }

        hint[OMEGA + row] = (uint8_t) used;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_unpack_sig
*
* Description: Deserialize a signature sig = (c, z, h) and validate the
*              hint encoding. The hint is rejected when a per-polynomial
*              running total decreases or exceeds OMEGA, when the indices
*              of one polynomial are not strictly increasing, or when an
*              unused index slot is nonzero.
*
* Arguments:   - uint8_t *c: output challenge hash, SEEDBYTES bytes
*              - polyvecl *z: output vector z
*              - polyveck *h: output hint vector h
*              - const uint8_t sig[]: packed signature of
*                PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES bytes
*
* Returns 1 in case of malformed signature; otherwise 0. On failure c and z
* have already been written and h may be partially written.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(uint8_t c[SEEDBYTES],
                                       polyvecl *z,
                                       polyveck *h,
                                       const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]) {
    const uint8_t *zbytes = sig + SEEDBYTES;
    const uint8_t *hint = zbytes + L * POLYZ_PACKEDBYTES;
    unsigned int row, pos, start, stop;

    for (pos = 0; pos < SEEDBYTES; ++pos) {
        c[pos] = sig[pos];
    }

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z->vec[row], zbytes + row * POLYZ_PACKEDBYTES);
    }

    /* Decode h */
    start = 0;
    for (row = 0; row < K; ++row) {
        for (pos = 0; pos < N; ++pos) {
            h->vec[row].coeffs[pos] = 0;
        }

        /* running totals must be non-decreasing and bounded by OMEGA */
        stop = hint[OMEGA + row];
        if (stop < start || stop > OMEGA) {
            return 1;
        }

        for (pos = start; pos < stop; ++pos) {
            /* Coefficients are ordered for strong unforgeability */
            if (pos > start && hint[pos] <= hint[pos - 1]) {
                return 1;
            }
            h->vec[row].coeffs[hint[pos]] = 1;
        }

        start = stop;
    }

    /* Extra indices are zero for strong unforgeability */
    for (pos = start; pos < OMEGA; ++pos) {
        if (hint[pos] != 0) {
            return 1;
        }
    }

    return 0;
}

+ 31
- 0
crypto_sign/dilithium/dilithium3/avx2/packing.h 查看文件

@@ -0,0 +1,31 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM3_AVX2_PACKING_H
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* Serialize the public key into pk as rho followed by the K packed t1 polynomials. */
void PQCLEAN_DILITHIUM3_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);

/*
 * Serialize the secret key into sk. Note that the byte layout written by
 * the implementation is rho | key | tr | s1 | s2 | t0, which differs from
 * the parameter order.
 */
void PQCLEAN_DILITHIUM3_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2);

/*
 * Serialize a signature into sig as c | z | h, where h takes OMEGA + K
 * bytes (nonzero-coefficient indices followed by per-polynomial running
 * totals). The number of nonzero hint coefficients is not checked.
 */
void PQCLEAN_DILITHIUM3_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);

/* Inverse of pack_pk: fills rho and t1 from the packed public key pk. */
void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]);

/*
 * Inverse of pack_sk: fills rho, tr, key, t0, s1 and s2 (all outputs) from
 * the packed secret key sk, whose byte layout is rho | key | tr | s1 | s2 | t0.
 */
void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES]);

/*
 * Inverse of pack_sig with validation of the hint encoding.
 * Returns 0 on success and 1 if the hint part is malformed; on failure the
 * outputs may already be partially written.
 */
int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]);

#endif

+ 41
- 0
crypto_sign/dilithium/dilithium3/avx2/params.h 查看文件

@@ -0,0 +1,41 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM3_AVX2_PARAMS_H



/*
 * Parameter set for Dilithium3 (see the ALGNAME string below). This header
 * is also included from assembly (pointwise.S), so it must contain only
 * preprocessor directives and C-style comments.
 */

/* Byte lengths of the seed-sized fields (rho, key, c) and of tr. */
#define SEEDBYTES 32
#define CRHBYTES 48
/* Number of coefficients per polynomial. */
#define N 256
/* NOTE(review): Q, D and ROOT_OF_UNITY are presumably the modulus, the
   number of dropped low bits of t, and the root of unity used by the
   transform; none of them is used in the files visible here -- confirm. */
#define Q 8380417
#define D 13
#define ROOT_OF_UNITY 1753

/* K: number of polynomials in a polyveck; L: number in a polyvecl. */
#define K 6
#define L 5
/* NOTE(review): ETA, TAU, BETA, GAMMA1 and GAMMA2 are scheme parameters
   whose use lies outside the files visible here. */
#define ETA 4
#define TAU 49
#define BETA 196
#define GAMMA1 (1 << 19)
#define GAMMA2 ((Q-1)/32)
/* Maximum number of hint indices a signature can carry (the signature
   decoder rejects larger counts). */
#define OMEGA 55
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3"


/* Packed size in bytes of one polynomial for each encoding. */
#define POLYT1_PACKEDBYTES 320
#define POLYT0_PACKEDBYTES 416
/* Hint vector: OMEGA index bytes plus one running-total byte per polynomial. */
#define POLYVECH_PACKEDBYTES (OMEGA + K)

#define POLYZ_PACKEDBYTES 640

#define POLYW1_PACKEDBYTES 128

#define POLYETA_PACKEDBYTES 128

/* Total sizes with the values above:
   public key 32 + 6*320 = 1952 bytes,
   secret key 2*32 + 48 + 5*128 + 6*128 + 6*416 = 4016 bytes,
   signature  32 + 5*640 + (55 + 6) = 3293 bytes. */
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+ L*POLYETA_PACKEDBYTES \
+ K*POLYETA_PACKEDBYTES \
+ K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif

+ 201
- 0
crypto_sign/dilithium/dilithium3/avx2/pointwise.S 查看文件

@@ -0,0 +1,201 @@
#include "params.h"
#include "cdecl.h"

/*
 * PQCLEAN_DILITHIUM3_AVX2_pointwise_avx: coefficient-wise product of two
 * polynomials with reduction.
 * Register use matches the ntt.h prototype (c, a, b, qdata) under the
 * System V AMD64 convention: %rdi = c (output), %rsi = a, %rdx = b,
 * %rcx = constant table. Clobbers %eax and ymm0-ymm15, and advances
 * %rdi/%rsi/%rdx.
 * The loop handles 96 bytes per iteration for 10 iterations, and the
 * straight-line tail handles the last 64 bytes: 960 + 64 = 1024 bytes,
 * i.e. 256 32-bit coefficients.
 * Reduction per 64-bit product p: m = low32(p) * QINV, then p - low32(m) * Q,
 * keeping the high 32 bits.
 * NOTE(review): this is presumably a Montgomery reduction, so the result
 * carries an extra factor of 2^-32 -- confirm against consts.c.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
_cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
#consts
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop1:
#load
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa 64(%rsi),%ymm6
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
/* odd 32-bit elements are moved into even positions (vpsrlq $32 and
   vmovshdup both achieve this) because vpmuldq only reads even elements */
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vmovshdup %ymm14,%ymm15

#mul
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5
vpmuldq %ymm6,%ymm14,%ymm6
vpmuldq %ymm7,%ymm15,%ymm7

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm0,%ymm6,%ymm14
vpmuldq %ymm0,%ymm7,%ymm15
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpmuldq %ymm1,%ymm14,%ymm14
vpmuldq %ymm1,%ymm15,%ymm15
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsubq %ymm14,%ymm6,%ymm6
vpsubq %ymm15,%ymm7,%ymm7
/* even-stream results: bring the high halves down to the even positions */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vmovshdup %ymm6,%ymm6

#store
/* odd-stream results already have their high halves in odd positions */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4
vpblendd $0xAA,%ymm7,%ymm6,%ymm6
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm6,64(%rdi)

add $96,%rdi
add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1

/* tail: the remaining 64 bytes, same steps on two vectors */
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
/* mask 0x55 with swapped sources selects the same elements as the 0xAA
   blends in the loop: even from ymm2/ymm4, odd from ymm3/ymm5 */
vpblendd $0x55,%ymm2,%ymm3,%ymm2
vpblendd $0x55,%ymm4,%ymm5,%ymm4
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

ret

/*
 * pointwise off: loads 64 bytes (16 32-bit coefficients) from off(%rsi) and
 * from off(%rdx) and forms the unreduced 64-bit products.
 * Results: ymm6/ymm8 hold the products of the even elements, ymm7/ymm9 those
 * of the odd elements. Clobbers ymm10-ymm13.
 */
.macro pointwise off
#load
vmovdqa \off(%rsi),%ymm6
vmovdqa \off+32(%rsi),%ymm8
vmovdqa \off(%rdx),%ymm10
vmovdqa \off+32(%rdx),%ymm12
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm6,%ymm10,%ymm6
vpmuldq %ymm7,%ymm11,%ymm7
vpmuldq %ymm8,%ymm12,%ymm8
vpmuldq %ymm9,%ymm13,%ymm9
.endm

/*
 * acc: adds the four product vectors ymm6-ymm9 left by the pointwise macro
 * to the running sums in ymm2-ymm5, as 64-bit additions.
 */
.macro acc
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm7,%ymm3,%ymm3
vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5
.endm

/*
 * PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx: coefficient-wise inner product
 * of two vectors of five polynomials.
 * Register use matches the ntt.h prototype (c, a, b, qdata): %rdi = c
 * (one 1024-byte output polynomial), %rsi = a, %rdx = b, %rcx = constant
 * table. a and b are read at byte offsets 0, 1024, 2048, 3072 and 4096,
 * i.e. as five consecutive 1024-byte polynomials (L is 5 in params.h, but
 * the count is hard-coded here).
 * Each of the 16 iterations handles 64 bytes: the five 64-bit products per
 * coefficient are summed first and the sum is reduced once.
 * Clobbers %eax and ymm0-ymm13, and advances %rdi/%rsi/%rdx.
 */
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
_cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
#consts
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop2:
pointwise 0

#mov
/* the first product initializes the accumulators ymm2-ymm5 */
vmovdqa %ymm6,%ymm2
vmovdqa %ymm7,%ymm3
vmovdqa %ymm8,%ymm4
vmovdqa %ymm9,%ymm5

pointwise 1024
acc

pointwise 2048
acc

pointwise 3072
acc

pointwise 4096
acc



#reduce
vpmuldq %ymm0,%ymm2,%ymm6
vpmuldq %ymm0,%ymm3,%ymm7
vpmuldq %ymm0,%ymm4,%ymm8
vpmuldq %ymm0,%ymm5,%ymm9
vpmuldq %ymm1,%ymm6,%ymm6
vpmuldq %ymm1,%ymm7,%ymm7
vpmuldq %ymm1,%ymm8,%ymm8
vpmuldq %ymm1,%ymm9,%ymm9
vpsubq %ymm6,%ymm2,%ymm2
vpsubq %ymm7,%ymm3,%ymm3
vpsubq %ymm8,%ymm4,%ymm4
vpsubq %ymm9,%ymm5,%ymm5
/* even-stream results: bring the high halves down to the even positions */
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4

vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

add $64,%rsi
add $64,%rdx
add $64,%rdi
add $1,%eax
cmp $16,%eax
jb _looptop2

ret

+ 998
- 0
crypto_sign/dilithium/dilithium3/avx2/poly.c
文件差异内容过多而无法显示
查看文件


+ 79
- 0
crypto_sign/dilithium/dilithium3/avx2/poly.h 查看文件

@@ -0,0 +1,79 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_POLY_H
#define PQCLEAN_DILITHIUM3_AVX2_POLY_H
#include "align.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/*
 * Polynomial with N 32-bit coefficients. ALIGNED_INT32 is defined in align.h
 * (not visible here); users of this type reach the coefficients through the
 * .coeffs member.
 */
typedef ALIGNED_INT32(N) poly;

/*
 * In-place coefficient reductions (implemented in poly.c, not visible here).
 * NOTE(review): presumably reduce = partial reduction mod Q, caddq = add Q
 * to negative coefficients, freeze = full reduction to the standard
 * representative -- confirm against poly.c.
 */
void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a);

/*
 * Coefficient-wise arithmetic (implemented in poly.c, not visible here).
 * NOTE(review): presumably c = a + b, c = a - b without reduction, and an
 * in-place left shift of every coefficient (by D bits?) -- confirm.
 */
void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a);

/*
 * Transform-domain operations on single polynomials (implemented in poly.c,
 * not visible here).
 * NOTE(review): presumably thin wrappers around the assembly routines
 * declared in ntt.h (ntt_avx, invntt_avx, nttunpack_avx, pointwise_avx) --
 * confirm.
 */
void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

/*
 * Rounding and hint operations (implemented in poly.c, not visible here).
 * make_hint differs from the others in that it writes a byte array hint[N]
 * rather than a poly and returns an unsigned count.
 * NOTE(review): presumably the return value is the number of hint entries
 * produced -- confirm.
 */
void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h);

/*
 * Norm check and single-polynomial samplers (implemented in poly.c, not
 * visible here). The stream128_state / stream256_state types come from
 * symmetric.h. Note the seed lengths: SEEDBYTES for uniform, uniform_eta
 * and challenge, CRHBYTES for uniform_gamma1.
 * NOTE(review): presumably chknorm returns nonzero when some coefficient
 * has absolute value >= B, and the *_preinit variants sample from an
 * already initialized stream state -- confirm.
 */
int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, int32_t B);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM3_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/*
 * Four-way samplers: each call fills four polynomials a0..a3 from one shared
 * seed and four individual nonces (implemented in poly.c, not visible here).
 * NOTE(review): presumably polynomial k depends only on (seed, nonce k), so
 * the result matches four calls of the corresponding single-polynomial
 * sampler -- confirm.
 */
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t seed[SEEDBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t seed[SEEDBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t seed[CRHBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);

/*
 * Bit-packing of single polynomials (implemented in poly.c, not visible
 * here). Each *_pack writes, and each *_unpack reads, the number of bytes
 * given by the array bound in its prototype.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]);

void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]);

void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]);

void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a);
/* NOTE(review): the input bound is POLYZ_PACKEDBYTES + 14, so the
   implementation presumably reads up to 14 bytes past the packed polynomial;
   callers must make sure those bytes are readable -- confirm in poly.c. */
void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]);

/* NOTE(review): the output bound is POLYW1_PACKEDBYTES + 8, so the
   implementation presumably writes up to 8 bytes past the packed polynomial;
   callers must provide that slack -- confirm in poly.c. */
void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a);

#endif

+ 498
- 0
crypto_sign/dilithium/dilithium3/avx2/polyvec.c 查看文件

@@ -0,0 +1,498 @@
#include "consts.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

/* Marks a parameter or variable as intentionally unused. The argument and the
 * whole expansion are parenthesized so that any expression can be passed. */
#define UNUSED(x) ((void)(x))

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand
*
* Description: Fill all K rows of the matrix from the seed rho by calling
*              the six per-row helpers in order. Because the samplers
*              produce four polynomials at a time, most helpers also
*              write the first entries of the following row; row 5
*              spills its surplus into a local scratch vector that is
*              discarded.
*
* Arguments:   - polyvecl mat[K]: output matrix
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
    polyvecl spill;

    PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(mat + 0, mat + 1, rho);
    PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(mat + 1, mat + 2, rho);
    PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(mat + 2, mat + 3, rho);
    /* row 3 needs no look-ahead row */
    PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(mat + 3, NULL, rho);
    PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(mat + 4, mat + 5, rho);
    /* there is no row 6: the surplus polynomials go to scratch */
    PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(mat + 5, &spill, rho);
}

/*
 * Samples all five entries of rowa (nonces 0x0000..0x0004) plus the first
 * three entries of rowb (nonces 0x0100..0x0102), then applies
 * poly_nttunpack to each of the eight polynomials.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 0x0000, 0x0001, 0x0002, 0x0003);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[4], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2],
                                            rho, 0x0004, 0x0100, 0x0101, 0x0102);

    for (col = 0; col < 5; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
    for (col = 0; col < 3; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[col]);
    }
}

/*
 * Completes rowa with entries 3 and 4 (nonces 0x0103, 0x0104; entries 0..2
 * are written by the row0 helper) and samples entries 0 and 1 of rowb
 * (nonces 0x0200, 0x0201), then applies poly_nttunpack to the four
 * polynomials.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowb->vec[0], &rowb->vec[1],
                                            rho, 0x0103, 0x0104, 0x0200, 0x0201);

    for (col = 3; col < 5; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
    for (col = 0; col < 2; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[col]);
    }
}

/*
 * Completes rowa with entries 2..4 (nonces 0x0202..0x0204; entries 0 and 1
 * are written by the row1 helper) and samples entry 0 of rowb (nonce
 * 0x0300), then applies poly_nttunpack to the four polynomials.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[2], &rowa->vec[3], &rowa->vec[4], &rowb->vec[0],
                                            rho, 0x0202, 0x0203, 0x0204, 0x0300);

    for (col = 2; col < 5; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
    PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[0]);
}

/*
 * Completes rowa with entries 1..4 (nonces 0x0301..0x0304; entry 0 is
 * written by the row2 helper) and applies poly_nttunpack to them.
 * rowb is never dereferenced, so passing NULL is fine.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    (void) rowb;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[1], &rowa->vec[2], &rowa->vec[3], &rowa->vec[4],
                                            rho, 0x0301, 0x0302, 0x0303, 0x0304);

    for (col = 1; col < 5; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
}

/*
 * Samples all five entries of rowa (nonces 0x0400..0x0404) plus the first
 * three entries of rowb (nonces 0x0500..0x0502), then applies
 * poly_nttunpack to each of the eight polynomials.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 0x0400, 0x0401, 0x0402, 0x0403);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[4], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2],
                                            rho, 0x0404, 0x0500, 0x0501, 0x0502);

    for (col = 0; col < 5; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
    for (col = 0; col < 3; ++col) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[col]);
    }
}

/*
 * Sample rowa->vec[3..4] and rowb->vec[0..1] with one 4-way uniform sampling
 * call (nonces 1283, 1284, 1536, 1537) and run nttunpack on the two rowa
 * polynomials only.
 *
 * NOTE(review): rowb->vec[0] and rowb->vec[1] are written by the sampler but
 * are not passed through nttunpack here, unlike in the other row expanders.
 * Presumably callers treat rowb as scratch for this last row -- confirm
 * against the call sites.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowb->vec[0], &rowb->vec[1],
                                            rho, 1283, 1284, 1536, 1537);

    for (col = 3; col <= 4; col++) {
        PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
}


/*
 * Matrix-vector product: for every row index r in [0, K), store in t->vec[r]
 * the result of polyvecl_pointwise_acc_montgomery applied to mat[r] and v.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    unsigned int row = 0;

    while (row < K) {
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[row], &mat[row], v);
        row++;
    }
}

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*
 * Call poly_uniform_eta on each of the L polynomials of v with the same
 * seed; polynomial k receives nonce + k (the nonce is incremented after
 * every call).
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&v->vec[k], seed, nonce);
        nonce++;
        k++;
    }
}

/*
 * Call poly_uniform_gamma1 on each of the L polynomials of v with the same
 * seed; polynomial k is sampled with the per-polynomial nonce L * nonce + k.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(&v->vec[k], seed, L * nonce + k);
        k++;
    }
}

/* Apply poly_reduce in place to each of the L polynomials of v. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze
*
* Description: Bring the coefficients of every polynomial in a vector of
*              length L to their standard representatives by applying
*              poly_freeze to each entry in place.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_add
*
* Description: Entry-wise sum of two vectors of polynomials of length L
*              (w[k] = u[k] + v[k] via poly_add). No modular reduction
*              is performed.
*
* Arguments:   - polyvecl *w: pointer to output vector
*              - const polyvecl *u: pointer to first summand
*              - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[k], &u->vec[k], &v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt
*
* Description: In-place forward NTT (poly_ntt) of every polynomial in a
*              vector of length L. Output coefficients can be up to 16*Q
*              larger than input coefficients.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[k]);
        k++;
    }
}

/* Apply poly_invntt_tomont in place to each of the L polynomials of v. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&v->vec[k]);
        k++;
    }
}

/*
 * Multiply every polynomial of v by the single polynomial a using
 * poly_pointwise_montgomery: r->vec[k] receives the product of a and
 * v->vec[k] for k in [0, L).
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&r->vec[k], a, &v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
*              resulting vector by 2^{-32} and add (accumulate) polynomials
*              in it. Input/output vectors are in NTT domain representation.
*              The work is delegated to pointwise_acc_avx, which receives the
*              coefficient storage of w, of the first polynomial of u and of
*              the first polynomial of v, plus the qdata constant table.
*
* Arguments:   - poly *w: output polynomial
*              - const polyvecl *u: pointer to first input vector
*              - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) {
    PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(w->vec,
            u->vec[0].vec,
            v->vec[0].vec,
            PQCLEAN_DILITHIUM3_AVX2_qdata.vec);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
*              Assumes input polyvecl to be reduced by
*              PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce().
*              Stops at the first polynomial that poly_chknorm rejects.
*
* Arguments:   - const polyvecl *v: pointer to vector
*              - int32_t bound: norm bound, bound <= (Q-1)/8
*
* Returns 0 if the norm of all polynomials is strictly smaller than bound
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
    unsigned int k = 0;

    while (k < L) {
        if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[k], bound) != 0) {
            return 1;
        }
        k++;
    }

    return 0;
}

/**************************************************************/
/************ Vectors of polynomials of length K **************/
/**************************************************************/

/*
 * Call poly_uniform_eta on each of the K polynomials of v with the same
 * seed; polynomial k receives nonce + k (the nonce is incremented after
 * every call).
 */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&v->vec[k], seed, nonce);
        nonce++;
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce
*
* Description: Apply poly_reduce in place to every polynomial in a vector
*              of length K, reducing coefficients to representatives in
*              [-6283009,6283007].
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq
*
* Description: Apply poly_caddq in place to every polynomial in a vector of
*              length K: each negative coefficient has Q added to it.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze
*
* Description: Bring the coefficients of every polynomial in a vector of
*              length K to their standard representatives by applying
*              poly_freeze to each entry in place.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_add
*
* Description: Entry-wise sum of two vectors of polynomials of length K
*              (w[k] = u[k] + v[k] via poly_add). No modular reduction
*              is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first summand
*              - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[k], &u->vec[k], &v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_sub
*
* Description: Entry-wise difference of two vectors of polynomials of
*              length K (w[k] = u[k] - v[k] via poly_sub). No modular
*              reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first input vector
*              - const polyveck *v: pointer to second input vector to be
*                                   subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_sub(&w->vec[k], &u->vec[k], &v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl
*
* Description: Apply poly_shiftl in place to every polynomial in a vector of
*              length K, i.e. multiply by 2^D without modular reduction.
*              Assumes input coefficients to be less than 2^{31-D}.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(&v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt
*
* Description: In-place forward NTT (poly_ntt) of every polynomial in a
*              vector of length K. Output coefficients can be up to 16*Q
*              larger than input coefficients.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont
*
* Description: In-place inverse NTT and multiplication by 2^{32}
*              (poly_invntt_tomont) of every polynomial in a vector of
*              length K. Input coefficients need to be less than 2*Q.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&v->vec[k]);
        k++;
    }
}

/*
 * Multiply every polynomial of v by the single polynomial a using
 * poly_pointwise_montgomery: r->vec[k] receives the product of a and
 * v->vec[k] for k in [0, K).
 */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&r->vec[k], a, &v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
*              Assumes input polyveck to be reduced by
*              PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce().
*              Stops at the first polynomial that poly_chknorm rejects.
*
* Arguments:   - const polyveck *v: pointer to vector
*              - int32_t bound: norm bound, bound <= (Q-1)/8
*
* Returns 0 if the norm of all polynomials is strictly smaller than bound
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) {
    unsigned int k = 0;

    while (k < K) {
        if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[k], bound) != 0) {
            return 1;
        }
        k++;
    }

    return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round
*
* Description: Apply poly_power2round to every polynomial in a vector of
*              length K. For each coefficient a this computes a0, a1 such
*              that a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_power2round(&v1->vec[k], &v0->vec[k], &v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose
*
* Description: Apply poly_decompose to every polynomial in a vector of
*              length K. For each coefficient a this computes high and low
*              bits a1, a0 such that a mod^+ Q = a1*ALPHA + a0 with
*              -ALPHA/2 < a0 <= ALPHA/2, except when a1 = (Q-1)/ALPHA, where
*              a1 is set to 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_decompose(&v1->vec[k], &v0->vec[k], &v->vec[k]);
        k++;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint
*
* Description: Compute hint vector. poly_make_hint is called once per
*              polynomial; each call writes at the position in the hint
*              array where the previous call stopped, as given by the
*              running sum of the return values.
*
* Arguments:   - uint8_t *hint: pointer to output hint array
*              - const polyveck *v0: pointer to low part of input vector
*              - const polyveck *v1: pointer to high part of input vector
*
* Returns number of 1 bits (the sum of the per-polynomial return values).
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) {
    unsigned int total = 0;
    unsigned int k;

    for (k = 0; k < K; k++) {
        unsigned int added = PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(hint + total, &v0->vec[k], &v1->vec[k]);
        total += added;
    }

    return total;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector,
*              by applying poly_use_hint to each of the K polynomials.
*
* Arguments:   - polyveck *w: pointer to output vector of polynomials with
*                             corrected high bits
*              - const polyveck *u: pointer to input vector
*              - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w->vec[k], &u->vec[k], &h->vec[k]);
        k++;
    }
}

/*
 * Serialize the K polynomials of w1 back to back into r using polyw1_pack;
 * polynomial k occupies the POLYW1_PACKEDBYTES bytes starting at offset
 * k * POLYW1_PACKEDBYTES.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) {
    uint8_t *out = r;
    unsigned int k;

    for (k = 0; k < K; k++) {
        PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(out, &w1->vec[k]);
        out += POLYW1_PACKEDBYTES;
    }
}

+ 72
- 0
crypto_sign/dilithium/dilithium3/avx2/polyvec.h 查看文件

@@ -0,0 +1,72 @@
/*
 * Vector-of-polynomial types (lengths L and K) and the operations on them
 * for the Dilithium3 AVX2 implementation.
 */
#ifndef PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H
#define PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H
#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
poly vec[L];
} polyvecl;

/* Samplers: one poly_uniform_eta / poly_uniform_gamma1 call per entry. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Entry-wise coefficient reduction (in place). */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(polyvecl *v);

void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v);

/* w = u + v, entry by entry, without modular reduction. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

/* NTT-domain conversions and multiplications. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v);

/* Returns 0 if every polynomial passes the norm check against B, 1 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B);

/* Vectors of polynomials of length K */
typedef struct {
poly vec[K];
} polyveck;

/* Sampler: one poly_uniform_eta call per entry. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Entry-wise coefficient reduction / normalization (in place). */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v);

/* Entry-wise arithmetic without modular reduction. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v);

/* NTT-domain conversions and multiplications. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

/* Returns 0 if every polynomial passes the norm check against B, 1 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, int32_t B);

/* Rounding, decomposition and hint handling, applied entry by entry. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

/* Packs the K polynomials of w1 back to back, POLYW1_PACKEDBYTES bytes each. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

/* Expansion of the K x L matrix from the seed rho. */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

/*
 * Per-row expanders: each samples the remaining entries of rowa and, where
 * the 4-way sampler has lanes to spare, the leading entries of rowb.
 * NOTE(review): row6 and row7 are declared here, but no definitions appear
 * next to row0..row5 in the visible part of polyvec.c -- confirm whether
 * these two prototypes are still needed for this parameter set.
 */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);

/* t[i] = polyvecl_pointwise_acc_montgomery(mat[i], v) for every row i in [0, K). */
void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif

+ 392
- 0
crypto_sign/dilithium/dilithium3/avx2/rejsample.c 查看文件

@@ -0,0 +1,392 @@
#include "params.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>

/*
 * Compaction lookup table indexed by an 8-bit lane mask.
 * Row m lists, in ascending order, the positions of the bits that are set in
 * m, followed by zero padding up to 8 entries (e.g. row 6 = 0b110 is
 * {1, 2, 0, ...}). The rejection samplers and make_hint use a row as a
 * permutation/shuffle index vector that moves the selected lanes to the
 * front; only the first popcount(m) entries are meaningful.
 */
const uint8_t PQCLEAN_DILITHIUM3_AVX2_idxlut[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 1, 2, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 0, 0, 0, 0, 0},
{ 3, 0, 0, 0, 0, 0, 0, 0},
{ 0, 3, 0, 0, 0, 0, 0, 0},
{ 1, 3, 0, 0, 0, 0, 0, 0},
{ 0, 1, 3, 0, 0, 0, 0, 0},
{ 2, 3, 0, 0, 0, 0, 0, 0},
{ 0, 2, 3, 0, 0, 0, 0, 0},
{ 1, 2, 3, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 1, 4, 0, 0, 0, 0, 0, 0},
{ 0, 1, 4, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 1, 2, 4, 0, 0, 0, 0, 0},
{ 0, 1, 2, 4, 0, 0, 0, 0},
{ 3, 4, 0, 0, 0, 0, 0, 0},
{ 0, 3, 4, 0, 0, 0, 0, 0},
{ 1, 3, 4, 0, 0, 0, 0, 0},
{ 0, 1, 3, 4, 0, 0, 0, 0},
{ 2, 3, 4, 0, 0, 0, 0, 0},
{ 0, 2, 3, 4, 0, 0, 0, 0},
{ 1, 2, 3, 4, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 0, 0, 0},
{ 5, 0, 0, 0, 0, 0, 0, 0},
{ 0, 5, 0, 0, 0, 0, 0, 0},
{ 1, 5, 0, 0, 0, 0, 0, 0},
{ 0, 1, 5, 0, 0, 0, 0, 0},
{ 2, 5, 0, 0, 0, 0, 0, 0},
{ 0, 2, 5, 0, 0, 0, 0, 0},
{ 1, 2, 5, 0, 0, 0, 0, 0},
{ 0, 1, 2, 5, 0, 0, 0, 0},
{ 3, 5, 0, 0, 0, 0, 0, 0},
{ 0, 3, 5, 0, 0, 0, 0, 0},
{ 1, 3, 5, 0, 0, 0, 0, 0},
{ 0, 1, 3, 5, 0, 0, 0, 0},
{ 2, 3, 5, 0, 0, 0, 0, 0},
{ 0, 2, 3, 5, 0, 0, 0, 0},
{ 1, 2, 3, 5, 0, 0, 0, 0},
{ 0, 1, 2, 3, 5, 0, 0, 0},
{ 4, 5, 0, 0, 0, 0, 0, 0},
{ 0, 4, 5, 0, 0, 0, 0, 0},
{ 1, 4, 5, 0, 0, 0, 0, 0},
{ 0, 1, 4, 5, 0, 0, 0, 0},
{ 2, 4, 5, 0, 0, 0, 0, 0},
{ 0, 2, 4, 5, 0, 0, 0, 0},
{ 1, 2, 4, 5, 0, 0, 0, 0},
{ 0, 1, 2, 4, 5, 0, 0, 0},
{ 3, 4, 5, 0, 0, 0, 0, 0},
{ 0, 3, 4, 5, 0, 0, 0, 0},
{ 1, 3, 4, 5, 0, 0, 0, 0},
{ 0, 1, 3, 4, 5, 0, 0, 0},
{ 2, 3, 4, 5, 0, 0, 0, 0},
{ 0, 2, 3, 4, 5, 0, 0, 0},
{ 1, 2, 3, 4, 5, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 1, 6, 0, 0, 0, 0, 0, 0},
{ 0, 1, 6, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 1, 2, 6, 0, 0, 0, 0, 0},
{ 0, 1, 2, 6, 0, 0, 0, 0},
{ 3, 6, 0, 0, 0, 0, 0, 0},
{ 0, 3, 6, 0, 0, 0, 0, 0},
{ 1, 3, 6, 0, 0, 0, 0, 0},
{ 0, 1, 3, 6, 0, 0, 0, 0},
{ 2, 3, 6, 0, 0, 0, 0, 0},
{ 0, 2, 3, 6, 0, 0, 0, 0},
{ 1, 2, 3, 6, 0, 0, 0, 0},
{ 0, 1, 2, 3, 6, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 1, 4, 6, 0, 0, 0, 0, 0},
{ 0, 1, 4, 6, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 1, 2, 4, 6, 0, 0, 0, 0},
{ 0, 1, 2, 4, 6, 0, 0, 0},
{ 3, 4, 6, 0, 0, 0, 0, 0},
{ 0, 3, 4, 6, 0, 0, 0, 0},
{ 1, 3, 4, 6, 0, 0, 0, 0},
{ 0, 1, 3, 4, 6, 0, 0, 0},
{ 2, 3, 4, 6, 0, 0, 0, 0},
{ 0, 2, 3, 4, 6, 0, 0, 0},
{ 1, 2, 3, 4, 6, 0, 0, 0},
{ 0, 1, 2, 3, 4, 6, 0, 0},
{ 5, 6, 0, 0, 0, 0, 0, 0},
{ 0, 5, 6, 0, 0, 0, 0, 0},
{ 1, 5, 6, 0, 0, 0, 0, 0},
{ 0, 1, 5, 6, 0, 0, 0, 0},
{ 2, 5, 6, 0, 0, 0, 0, 0},
{ 0, 2, 5, 6, 0, 0, 0, 0},
{ 1, 2, 5, 6, 0, 0, 0, 0},
{ 0, 1, 2, 5, 6, 0, 0, 0},
{ 3, 5, 6, 0, 0, 0, 0, 0},
{ 0, 3, 5, 6, 0, 0, 0, 0},
{ 1, 3, 5, 6, 0, 0, 0, 0},
{ 0, 1, 3, 5, 6, 0, 0, 0},
{ 2, 3, 5, 6, 0, 0, 0, 0},
{ 0, 2, 3, 5, 6, 0, 0, 0},
{ 1, 2, 3, 5, 6, 0, 0, 0},
{ 0, 1, 2, 3, 5, 6, 0, 0},
{ 4, 5, 6, 0, 0, 0, 0, 0},
{ 0, 4, 5, 6, 0, 0, 0, 0},
{ 1, 4, 5, 6, 0, 0, 0, 0},
{ 0, 1, 4, 5, 6, 0, 0, 0},
{ 2, 4, 5, 6, 0, 0, 0, 0},
{ 0, 2, 4, 5, 6, 0, 0, 0},
{ 1, 2, 4, 5, 6, 0, 0, 0},
{ 0, 1, 2, 4, 5, 6, 0, 0},
{ 3, 4, 5, 6, 0, 0, 0, 0},
{ 0, 3, 4, 5, 6, 0, 0, 0},
{ 1, 3, 4, 5, 6, 0, 0, 0},
{ 0, 1, 3, 4, 5, 6, 0, 0},
{ 2, 3, 4, 5, 6, 0, 0, 0},
{ 0, 2, 3, 4, 5, 6, 0, 0},
{ 1, 2, 3, 4, 5, 6, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 0},
{ 7, 0, 0, 0, 0, 0, 0, 0},
{ 0, 7, 0, 0, 0, 0, 0, 0},
{ 1, 7, 0, 0, 0, 0, 0, 0},
{ 0, 1, 7, 0, 0, 0, 0, 0},
{ 2, 7, 0, 0, 0, 0, 0, 0},
{ 0, 2, 7, 0, 0, 0, 0, 0},
{ 1, 2, 7, 0, 0, 0, 0, 0},
{ 0, 1, 2, 7, 0, 0, 0, 0},
{ 3, 7, 0, 0, 0, 0, 0, 0},
{ 0, 3, 7, 0, 0, 0, 0, 0},
{ 1, 3, 7, 0, 0, 0, 0, 0},
{ 0, 1, 3, 7, 0, 0, 0, 0},
{ 2, 3, 7, 0, 0, 0, 0, 0},
{ 0, 2, 3, 7, 0, 0, 0, 0},
{ 1, 2, 3, 7, 0, 0, 0, 0},
{ 0, 1, 2, 3, 7, 0, 0, 0},
{ 4, 7, 0, 0, 0, 0, 0, 0},
{ 0, 4, 7, 0, 0, 0, 0, 0},
{ 1, 4, 7, 0, 0, 0, 0, 0},
{ 0, 1, 4, 7, 0, 0, 0, 0},
{ 2, 4, 7, 0, 0, 0, 0, 0},
{ 0, 2, 4, 7, 0, 0, 0, 0},
{ 1, 2, 4, 7, 0, 0, 0, 0},
{ 0, 1, 2, 4, 7, 0, 0, 0},
{ 3, 4, 7, 0, 0, 0, 0, 0},
{ 0, 3, 4, 7, 0, 0, 0, 0},
{ 1, 3, 4, 7, 0, 0, 0, 0},
{ 0, 1, 3, 4, 7, 0, 0, 0},
{ 2, 3, 4, 7, 0, 0, 0, 0},
{ 0, 2, 3, 4, 7, 0, 0, 0},
{ 1, 2, 3, 4, 7, 0, 0, 0},
{ 0, 1, 2, 3, 4, 7, 0, 0},
{ 5, 7, 0, 0, 0, 0, 0, 0},
{ 0, 5, 7, 0, 0, 0, 0, 0},
{ 1, 5, 7, 0, 0, 0, 0, 0},
{ 0, 1, 5, 7, 0, 0, 0, 0},
{ 2, 5, 7, 0, 0, 0, 0, 0},
{ 0, 2, 5, 7, 0, 0, 0, 0},
{ 1, 2, 5, 7, 0, 0, 0, 0},
{ 0, 1, 2, 5, 7, 0, 0, 0},
{ 3, 5, 7, 0, 0, 0, 0, 0},
{ 0, 3, 5, 7, 0, 0, 0, 0},
{ 1, 3, 5, 7, 0, 0, 0, 0},
{ 0, 1, 3, 5, 7, 0, 0, 0},
{ 2, 3, 5, 7, 0, 0, 0, 0},
{ 0, 2, 3, 5, 7, 0, 0, 0},
{ 1, 2, 3, 5, 7, 0, 0, 0},
{ 0, 1, 2, 3, 5, 7, 0, 0},
{ 4, 5, 7, 0, 0, 0, 0, 0},
{ 0, 4, 5, 7, 0, 0, 0, 0},
{ 1, 4, 5, 7, 0, 0, 0, 0},
{ 0, 1, 4, 5, 7, 0, 0, 0},
{ 2, 4, 5, 7, 0, 0, 0, 0},
{ 0, 2, 4, 5, 7, 0, 0, 0},
{ 1, 2, 4, 5, 7, 0, 0, 0},
{ 0, 1, 2, 4, 5, 7, 0, 0},
{ 3, 4, 5, 7, 0, 0, 0, 0},
{ 0, 3, 4, 5, 7, 0, 0, 0},
{ 1, 3, 4, 5, 7, 0, 0, 0},
{ 0, 1, 3, 4, 5, 7, 0, 0},
{ 2, 3, 4, 5, 7, 0, 0, 0},
{ 0, 2, 3, 4, 5, 7, 0, 0},
{ 1, 2, 3, 4, 5, 7, 0, 0},
{ 0, 1, 2, 3, 4, 5, 7, 0},
{ 6, 7, 0, 0, 0, 0, 0, 0},
{ 0, 6, 7, 0, 0, 0, 0, 0},
{ 1, 6, 7, 0, 0, 0, 0, 0},
{ 0, 1, 6, 7, 0, 0, 0, 0},
{ 2, 6, 7, 0, 0, 0, 0, 0},
{ 0, 2, 6, 7, 0, 0, 0, 0},
{ 1, 2, 6, 7, 0, 0, 0, 0},
{ 0, 1, 2, 6, 7, 0, 0, 0},
{ 3, 6, 7, 0, 0, 0, 0, 0},
{ 0, 3, 6, 7, 0, 0, 0, 0},
{ 1, 3, 6, 7, 0, 0, 0, 0},
{ 0, 1, 3, 6, 7, 0, 0, 0},
{ 2, 3, 6, 7, 0, 0, 0, 0},
{ 0, 2, 3, 6, 7, 0, 0, 0},
{ 1, 2, 3, 6, 7, 0, 0, 0},
{ 0, 1, 2, 3, 6, 7, 0, 0},
{ 4, 6, 7, 0, 0, 0, 0, 0},
{ 0, 4, 6, 7, 0, 0, 0, 0},
{ 1, 4, 6, 7, 0, 0, 0, 0},
{ 0, 1, 4, 6, 7, 0, 0, 0},
{ 2, 4, 6, 7, 0, 0, 0, 0},
{ 0, 2, 4, 6, 7, 0, 0, 0},
{ 1, 2, 4, 6, 7, 0, 0, 0},
{ 0, 1, 2, 4, 6, 7, 0, 0},
{ 3, 4, 6, 7, 0, 0, 0, 0},
{ 0, 3, 4, 6, 7, 0, 0, 0},
{ 1, 3, 4, 6, 7, 0, 0, 0},
{ 0, 1, 3, 4, 6, 7, 0, 0},
{ 2, 3, 4, 6, 7, 0, 0, 0},
{ 0, 2, 3, 4, 6, 7, 0, 0},
{ 1, 2, 3, 4, 6, 7, 0, 0},
{ 0, 1, 2, 3, 4, 6, 7, 0},
{ 5, 6, 7, 0, 0, 0, 0, 0},
{ 0, 5, 6, 7, 0, 0, 0, 0},
{ 1, 5, 6, 7, 0, 0, 0, 0},
{ 0, 1, 5, 6, 7, 0, 0, 0},
{ 2, 5, 6, 7, 0, 0, 0, 0},
{ 0, 2, 5, 6, 7, 0, 0, 0},
{ 1, 2, 5, 6, 7, 0, 0, 0},
{ 0, 1, 2, 5, 6, 7, 0, 0},
{ 3, 5, 6, 7, 0, 0, 0, 0},
{ 0, 3, 5, 6, 7, 0, 0, 0},
{ 1, 3, 5, 6, 7, 0, 0, 0},
{ 0, 1, 3, 5, 6, 7, 0, 0},
{ 2, 3, 5, 6, 7, 0, 0, 0},
{ 0, 2, 3, 5, 6, 7, 0, 0},
{ 1, 2, 3, 5, 6, 7, 0, 0},
{ 0, 1, 2, 3, 5, 6, 7, 0},
{ 4, 5, 6, 7, 0, 0, 0, 0},
{ 0, 4, 5, 6, 7, 0, 0, 0},
{ 1, 4, 5, 6, 7, 0, 0, 0},
{ 0, 1, 4, 5, 6, 7, 0, 0},
{ 2, 4, 5, 6, 7, 0, 0, 0},
{ 0, 2, 4, 5, 6, 7, 0, 0},
{ 1, 2, 4, 5, 6, 7, 0, 0},
{ 0, 1, 2, 4, 5, 6, 7, 0},
{ 3, 4, 5, 6, 7, 0, 0, 0},
{ 0, 3, 4, 5, 6, 7, 0, 0},
{ 1, 3, 4, 5, 6, 7, 0, 0},
{ 0, 1, 3, 4, 5, 6, 7, 0},
{ 2, 3, 4, 5, 6, 7, 0, 0},
{ 0, 2, 3, 4, 5, 6, 7, 0},
{ 1, 2, 3, 4, 5, 6, 7, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7}
};

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx
*
* Description: Rejection-sample coefficients in [0, Q) from a byte buffer.
*              Every 3 input bytes form a little-endian value that is masked
*              to 23 bits; values < Q are appended to r, others are dropped.
*              A vectorized loop handles 24 bytes (8 candidates) at a time,
*              a scalar loop handles the remainder.
*
* Arguments:   - int32_t *r: output array with room for N coefficients
*              - const uint8_t buf[]: input bytes. The vector loop loads
*                32 bytes starting at offsets up to REJ_UNIFORM_BUFLEN - 24,
*                which is why 8 readable bytes past REJ_UNIFORM_BUFLEN are
*                required.
*
* Returns number of coefficients written to r (at most N).
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) {
    unsigned int ctr, pos;
    uint32_t good;
    __m256i d, tmp;
    const __m256i bound = _mm256_set1_epi32(Q);
    const __m256i mask = _mm256_set1_epi32(0x7FFFFF);
    /* Per 128-bit lane: spread four consecutive 3-byte groups into four
     * 32-bit slots; index -1 zeroes the top byte of each slot. */
    const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10,
                                         -1, 9, 8, 7, -1, 6, 5, 4,
                                         -1, 11, 10, 9, -1, 8, 7, 6,
                                         -1, 5, 4, 3, -1, 2, 1, 0);

    ctr = pos = 0;
    while (pos <= REJ_UNIFORM_BUFLEN - 24) {
        d = _mm256_loadu_si256((const __m256i *)&buf[pos]);
        /* 0x94 selects quadwords (0,1,1,2): the low lane holds bytes 0..15,
         * the high lane bytes 8..23 of the loaded block. */
        d = _mm256_permute4x64_epi64(d, 0x94);
        d = _mm256_shuffle_epi8(d, idx8);
        d = _mm256_and_si256(d, mask);
        pos += 24;

        /* d - Q is negative exactly for the lanes with d < Q; collect those
         * sign bits into an 8-bit acceptance mask. */
        tmp = _mm256_sub_epi32(d, bound);
        good = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(tmp));
        /* Move the accepted lanes to the front, in order. */
        tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good]));
        d = _mm256_permutevar8x32_epi32(d, tmp);

        /* Always stores 8 lanes; only the first popcount(good) are kept,
         * the rest is overwritten by the next store. */
        _mm256_storeu_si256((__m256i *)&r[ctr], d);
        ctr += _mm_popcnt_u32(good);

        /* A further 8-lane store at r[ctr] would run past r[N - 1]. */
        if (ctr > N - 8) {
            break;
        }
    }

    /* Scalar tail: same sampling rule, one candidate at a time. */
    uint32_t t;
    while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) {
        t = buf[pos++];
        t |= (uint32_t)buf[pos++] << 8;
        t |= (uint32_t)buf[pos++] << 16;
        t &= 0x7FFFFF;

        if (t < Q) {
            r[ctr++] = (int32_t)t;
        }
    }

    return ctr;
}

/*
 * Rejection-sample small coefficients from a byte buffer.
 * Each input byte is split into its low and high 4-bit nibble (low first);
 * a nibble t < 9 yields the coefficient 4 - t, any other nibble is dropped.
 * The vector loop consumes 16 bytes (32 nibbles) per iteration in four
 * groups of 8 nibbles; the scalar loop handles what is left.
 *
 * r:   output array with room for N coefficients
 * buf: REJ_UNIFORM_ETA_BUFLEN input bytes
 *
 * Returns the number of coefficients written to r (at most N).
 */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) {
unsigned int ctr, pos;
uint32_t good;
__m256i f0, f1;
__m128i g0, g1;
const __m256i mask = _mm256_set1_epi8(15);
const __m256i eta = _mm256_set1_epi8(4);
const __m256i bound = _mm256_set1_epi8(9);

ctr = pos = 0;
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
/* Widen 16 bytes to 16-bit lanes, then place the low nibble in the low
 * byte and the high nibble in the high byte of every lane: 32 nibbles. */
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos]));
f1 = _mm256_slli_epi16(f0, 4);
f0 = _mm256_or_si256(f0, f1);
f0 = _mm256_and_si256(f0, mask);

/* f1 = nibble - 9 is negative exactly for accepted nibbles (< 9);
 * f0 becomes the candidate coefficient 4 - nibble; good gets one
 * acceptance bit per nibble. */
f1 = _mm256_sub_epi8(f0, bound);
f0 = _mm256_sub_epi8(eta, f0);
good = _mm256_movemask_epi8(f1);

/* Group 1: nibbles 0..7. Compact the accepted bytes to the front via
 * idxlut, sign-extend to 32 bit and store 8 lanes; only the first
 * popcount lanes are kept. */
g0 = _mm256_castsi256_si128(f0);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

/* Stop before an 8-lane store could run past r[N - 1]. */
if (ctr > N - 8) {
break;
}
/* Group 2: nibbles 8..15 (upper half of the low 128-bit lane). */
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

if (ctr > N - 8) {
break;
}
/* Group 3: nibbles 16..23 (lower half of the high 128-bit lane). */
g0 = _mm256_extracti128_si256(f0, 1);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

if (ctr > N - 8) {
break;
}
/* Group 4: nibbles 24..31. After three shifts by 8 only the last
 * 8 acceptance bits remain in good, so no "& 0xFF" is needed. */
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good);
pos += 4;
}

/* Scalar tail: same rule, one byte (two nibbles) at a time. */
uint32_t t0, t1;
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) {
t0 = buf[pos] & 0x0F;
t1 = buf[pos++] >> 4;

if (t0 < 9) {
r[ctr++] = 4 - t0;
}
if (t1 < 9 && ctr < N) {
r[ctr++] = 4 - t1;
}
}

return ctr;
}

+ 19
- 0
crypto_sign/dilithium/dilithium3/avx2/rejsample.h 查看文件

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_REJSAMPLE_H
#define PQCLEAN_DILITHIUM3_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* Input size for rej_uniform_avx: 768 bytes rounded up to a whole number of
 * STREAM128 blocks. */
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)

/* Input size for rej_eta_avx: 228 bytes rounded up to a whole number of
 * STREAM128 blocks. */
#define REJ_UNIFORM_ETA_NBLOCKS ((228+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)

/* Lane-compaction table: row m lists the positions of the set bits of m. */
extern const uint8_t PQCLEAN_DILITHIUM3_AVX2_idxlut[256][8];

/* Samples coefficients in [0, Q) from buf into r; returns how many were
 * written. buf needs 8 readable bytes beyond REJ_UNIFORM_BUFLEN. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);

/* Samples small coefficients from buf into r; returns how many were written.
 * The declared bound matches the definition (REJ_UNIFORM_ETA_BUFLEN). */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]);

#endif

+ 154
- 0
crypto_sign/dilithium/dilithium3/avx2/rounding.c 查看文件

@@ -0,0 +1,154 @@
#include "consts.h"
#include "params.h"
#include "rejsample.h"
#include "rounding.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/*
 * Per-32-bit-lane blend for integer vectors, built on the float blend by
 * reinterpreting the operands: each result lane is taken from b where the
 * sign bit of the matching lane of mask is set, and from a otherwise.
 */
#define _mm256_blendv_epi32(a,b,mask) \
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
_mm256_castsi256_ps(b), \
_mm256_castsi256_ps(mask)))

/*************************************************
* Name:        power2round
*
* Description: For finite field elements a, compute a0, a1 such that
*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
*              Assumes a to be positive standard representative.
*              Works on 8 coefficients per step: a1 = (a + 2^{D-1} - 1) >> D
*              and a0 = a - a1*2^D.
*
* Arguments:   - __m256i *a1: output array of length N/8 with high bits
*              - __m256i *a0: output array of length N/8 with low bits a0
*              - const __m256i *a: input array of length N/8
*
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
    /* All-ones above bit D: clears the low D bits when ANDed. */
    const __m256i high_mask = _mm256_set1_epi32(-(1 << D));
    /* Rounding offset 2^{D-1} - 1. */
    const __m256i round_off = _mm256_set1_epi32((1 << (D - 1)) - 1);
    unsigned int k = 0;

    while (k < N / 8) {
        const __m256i in = _mm256_load_si256(&a[k]);
        const __m256i biased = _mm256_add_epi32(in, round_off);
        const __m256i hi_part = _mm256_and_si256(biased, high_mask);  /* a1 * 2^D */
        const __m256i hi = _mm256_srli_epi32(biased, D);               /* a1 */
        const __m256i lo = _mm256_sub_epi32(in, hi_part);              /* a0 */

        _mm256_store_si256(&a1[k], hi);
        _mm256_store_si256(&a0[k], lo);
        k++;
    }
}

/*************************************************
* Name: decompose
*
* Description: For finite field element a, compute high and low parts a0, a1 such
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard
* representative. Here ALPHA = 2*GAMMA2; 8 coefficients are processed
* per loop iteration.
*
* Arguments: - __m256i *a1: output array of length N/8 with high parts
* - __m256i *a0: output array of length N/8 with low parts a0
* - const __m256i *a: input array of length N/8
*
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
unsigned int i;
__m256i f, f0, f1;
/* presumably 8 copies of Q from the shared constant table -- TODO confirm layout of qdata */
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3_AVX2_qdata.vec[_8XQ / 8]);
const __m256i hq = _mm256_srli_epi32(q, 1);
const __m256i v = _mm256_set1_epi32(1025);
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2);
const __m256i off = _mm256_set1_epi32(127);
const __m256i shift = _mm256_set1_epi32(512);
const __m256i mask = _mm256_set1_epi32(15);

for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a[i]);
/* a1 estimate: ceil(a / 128), then a fixed-point multiply by 1025
 * (keeping the high 16 bits) and a rounding multiply by 512
 * (mulhrs: (x*512 + 2^14) >> 15), finally reduced to 4 bits.
 * NOTE(review): the 16-bit multiplies assume (a + 127) >> 7 fits in
 * 16 bits, which holds for a < 2^23 -- confirm against the value of Q. */
f1 = _mm256_add_epi32(f, off);
f1 = _mm256_srli_epi32(f1, 7);
f1 = _mm256_mulhi_epu16(f1, v);
f1 = _mm256_mulhrs_epi16(f1, shift);
f1 = _mm256_and_si256(f1, mask);
/* a0 = a - a1*ALPHA ... */
f0 = _mm256_mullo_epi32(f1, alpha);
f0 = _mm256_sub_epi32(f, f0);
/* ... and where that exceeds q >> 1, subtract q (the wrap-around case
 * in which a1 was masked to 0). */
f = _mm256_cmpgt_epi32(f0, hq);
f = _mm256_and_si256(f, q);
f0 = _mm256_sub_epi32(f0, f);
_mm256_store_si256(&a1[i], f1);
_mm256_store_si256(&a0[i], f0);
}
}


/*************************************************
* Name:        make_hint
*
* Description: Compute indices of polynomial coefficients whose low bits
*              overflow into the high bits. A coefficient is flagged when
*              |a0| > GAMMA2, or when a0 == -GAMMA2 and a1 > 0. The indices
*              of flagged coefficients are written to hint in ascending
*              order, one byte per index.
*
* Arguments:   - uint8_t *hint: hint array of N bytes; entries beyond the
*                               returned count hold unspecified values
*              - const __m256i *a0: low bits of input elements
*              - const __m256i *a1: high bits of input elements
*
* Returns number of overflowing low bits
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) {
    unsigned int i, n = 0;
    __m256i f0, f1, g0, g1;
    uint32_t bad;
    uint64_t idx;
    const __m256i low = _mm256_set1_epi32(-GAMMA2);
    const __m256i high = _mm256_set1_epi32(GAMMA2);

    for (i = 0; i < N / 8; ++i) {
        f0 = _mm256_load_si256(&a0[i]);
        f1 = _mm256_load_si256(&a1[i]);
        /* g0: lanes with |a0| > GAMMA2. */
        g0 = _mm256_abs_epi32(f0);
        g0 = _mm256_cmpgt_epi32(g0, high);
        /* g1: lanes with a0 == -GAMMA2; the sign operation zeroes the lane
         * when a1 == 0 and keeps it (sign bit set) when a1 > 0. */
        g1 = _mm256_cmpeq_epi32(f0, low);
        g1 = _mm256_sign_epi32(g1, f1);
        g0 = _mm256_or_si256(g0, g1);

        /* One flag bit per lane, taken from the lane sign bits. */
        bad = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(g0));
        /* idxlut gives the flagged lane numbers 0..7; adding 8*i to every
         * byte turns them into coefficient indices. */
        memcpy(&idx, PQCLEAN_DILITHIUM3_AVX2_idxlut[bad], 8);
        idx += (uint64_t)0x0808080808080808 * i;
        /* Always writes 8 bytes; since n <= 8*i this stays inside hint[N].
         * Only the first popcount(bad) bytes are kept. */
        memcpy(&hint[n], &idx, 8);
        n += _mm_popcnt_u32(bad);
    }

    return n;
}

/*************************************************
* Name: use_hint
*
* Description: Correct high parts according to hint. The input is first
* decomposed into high parts (written to b) and low parts (kept in
* a local buffer); each high part is then moved by +hint or -hint
* depending on the low part, and reduced to its low 4 bits.
*
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts
* - const __m256i *a: input array of length N/8
* - const __m256i *hint: input array of length N/8 with hint bits
*
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) {
unsigned int i;
/* low parts from the decomposition; only needed to pick the direction */
__m256i a0[N / 8];
__m256i f, g, h, t;
const __m256i zero = _mm256_setzero_si256();
const __m256i mask = _mm256_set1_epi32(15);

PQCLEAN_DILITHIUM3_AVX2_decompose_avx(b, a0, a);
for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a0[i]);
g = _mm256_load_si256(&b[i]);
h = _mm256_load_si256(&hint[i]);
/* NOTE(review): _mm256_blendv_epi32 is a project helper not shown here;
 * presumably it selects h in lanes where the sign bit of f is set and
 * zero elsewhere -- confirm. With that reading, h - 2*t is -h for
 * negative low parts and +h otherwise. */
t = _mm256_blendv_epi32(zero, h, f);
t = _mm256_slli_epi32(t, 1);
h = _mm256_sub_epi32(h, t);
g = _mm256_add_epi32(g, h);
/* wrap the adjusted high part into the range 0..15 */
g = _mm256_and_si256(g, mask);
_mm256_store_si256(&b[i], g);
}
}

+ 12
- 0
crypto_sign/dilithium/dilithium3/avx2/rounding.h 查看文件

@@ -0,0 +1,12 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_ROUNDING_H
#define PQCLEAN_DILITHIUM3_AVX2_ROUNDING_H
#include "params.h"
#include <immintrin.h>
#include <stdint.h>

/* Vectorised rounding helpers. Every __m256i array holds N/8 vectors,
 * i.e. N 32-bit coefficients, and is accessed with aligned loads/stores. */

/* Splits a into parts a1 and a0 (definition not shown in this header). */
void PQCLEAN_DILITHIUM3_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a);
/* Splits a into high parts a1 and low parts a0 with respect to 2*GAMMA2. */
void PQCLEAN_DILITHIUM3_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a);
/* Writes the indices of coefficients needing a hint to hint[] and returns their count. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1);
/* Writes to b the high parts of a, corrected according to hint. */
void PQCLEAN_DILITHIUM3_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint);

#endif

+ 54
- 0
crypto_sign/dilithium/dilithium3/avx2/shuffle.S 查看文件

@@ -0,0 +1,54 @@
#include "cdecl.h"
.include "shuffle.inc"

.text
/*
 * nttunpack128_avx: file-local helper that permutes, in place, the 256-byte
 * block at (%rdi) (eight 32-byte vectors, accessed with aligned moves).
 * Three rounds of the shuffle8 / shuffle4 / shuffle2 macros from shuffle.inc
 * interleave the data at 128-bit, 64-bit and 32-bit granularity.
 * In:  %rdi = address of the block (left unchanged).
 * Writes ymm3-ymm11.
 */
nttunpack128_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

/* pair vector k with vector k+4; results land in ymm3,8,4,9,5,10,6,11 */
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

/* 64-bit interleave; results land in ymm7,5,3,10,8,6,4,11 */
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

/* 32-bit interleave (the first operand of each shuffle2 is overwritten);
   results land in ymm9,8,7,6,5,4,3,11 */
shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

#store
vmovdqa %ymm9,(%rdi)
vmovdqa %ymm8,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm3,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

/*
 * PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx: exported entry point.
 * Applies nttunpack128_avx to four consecutive 256-byte blocks starting at
 * %rdi, i.e. permutes 1024 bytes in place. On return %rdi has been advanced
 * by 768 bytes. The symbol is emitted under both the cdecl() and _cdecl()
 * spellings (macros from cdecl.h; presumably without and with a leading
 * underscore -- confirm).
 */
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx)
.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx):
_cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx):
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
ret

+ 25
- 0
crypto_sign/dilithium/dilithium3/avx2/shuffle.inc 查看文件

@@ -0,0 +1,25 @@
# shuffle8 r0,r1,r2,r3: 128-bit lane shuffle of ymm\r0 and ymm\r1.
#   ymm\r2 = (low lane of r0, low lane of r1)
#   ymm\r3 = (high lane of r0, high lane of r1)
# r2 is written before r3 is computed, so r2 must differ from r0 and r1.
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle4 r0,r1,r2,r3: 64-bit interleave within each 128-bit lane.
#   ymm\r2 = low quadwords of r0 and r1 interleaved
#   ymm\r3 = high quadwords of r0 and r1 interleaved
# r2 is written before r3 is computed, so r2 must differ from r0 and r1.
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle2 r0,r1,r2,r3: 32-bit interleave of ymm\r0 and ymm\r1.
#   ymm\r2 = even dwords of r0 in even slots, even dwords of r1 in odd slots
#   ymm\r3 = odd dwords of r0 in even slots, odd dwords of r1 in odd slots
# ymm\r0 is overwritten (shifted right by 32 bits per quadword).
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle1 r0,r1,r2,r3: 16-bit interleave of ymm\r0 and ymm\r1.
#   ymm\r2 = even words of r0 in even slots, even words of r1 in odd slots
#   ymm\r3 = odd words of r0 in even slots, odd words of r1 in odd slots
# ymm\r0 is overwritten (shifted right by 16 bits per dword).
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

+ 425
- 0
crypto_sign/dilithium/dilithium3/avx2/sign.c 查看文件

@@ -0,0 +1,425 @@
#include "align.h"
#include "fips202.h"
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"
#include <stdint.h>
#include <string.h>

/*
 * Expand row i of the matrix from seed rho into the two-slot scratch buffer
 * and point *row at the slot that holds it. Even rows are placed in buf[0],
 * odd rows in buf[1]; the other slot is handed to the row expander as its
 * second argument. For i outside 0..5 nothing is called and *row is left
 * unchanged.
 */
static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) {
    polyvecl *const slot0 = buf;
    polyvecl *const slot1 = buf + 1;

    if (i == 0) {
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(slot0, slot1, rho);
        *row = slot0;
    } else if (i == 1) {
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(slot1, slot0, rho);
        *row = slot1;
    } else if (i == 2) {
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(slot0, slot1, rho);
        *row = slot0;
    } else if (i == 3) {
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(slot1, slot0, rho);
        *row = slot1;
    } else if (i == 4) {
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(slot0, slot1, rho);
        *row = slot0;
    } else if (i == 5) {
        PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(slot1, slot0, rho);
        *row = slot1;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair
*
* Description: Generates public and private key.
*              Public key layout:  rho | t1 (K packed polynomials).
*              Secret key layout:  rho | key | CRH(pk) | s1 | s2 | t0.
*
* Arguments:   - uint8_t *pk: pointer to output public key (allocated
*                             array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
*              - uint8_t *sk: pointer to output private key (allocated
*                             array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
    uint8_t seedbuf[3 * SEEDBYTES];
    polyvecl rowbuf[2];
    polyvecl *row = rowbuf;
    polyvecl s1;
    polyveck s2;
    poly t1, t0;
    unsigned int idx;

    /* Views into the expanded seed buffer */
    const uint8_t *const rho = seedbuf;
    const uint8_t *const rhoprime = seedbuf + SEEDBYTES;
    const uint8_t *const key = seedbuf + 2 * SEEDBYTES;

    /* Field offsets inside the output keys */
    uint8_t *const pk_t1 = pk + SEEDBYTES;
    uint8_t *const sk_key = sk + SEEDBYTES;
    uint8_t *const sk_tr = sk + 2 * SEEDBYTES;
    uint8_t *const sk_s1 = sk_tr + CRHBYTES;
    uint8_t *const sk_s2 = sk_s1 + L * POLYETA_PACKEDBYTES;
    uint8_t *const sk_t0 = sk_s2 + K * POLYETA_PACKEDBYTES;

    /* Draw SEEDBYTES of randomness and stretch it into rho, rhoprime, key */
    randombytes(seedbuf, SEEDBYTES);
    shake256(seedbuf, sizeof seedbuf, seedbuf, SEEDBYTES);

    /* rho goes into both keys, key into the secret key only */
    memcpy(pk, rho, SEEDBYTES);
    memcpy(sk, rho, SEEDBYTES);
    memcpy(sk_key, key, SEEDBYTES);

    /* Sample the short vectors s1 and s2, four polynomials per call.
     * The final call has one spare output, which is dumped into t0
     * (t0 is overwritten later before it is used). */
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[4], &s2.vec[0], &s2.vec[1], &s2.vec[2], rhoprime, 4, 5, 6, 7);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s2.vec[3], &s2.vec[4], &s2.vec[5], &t0, rhoprime, 8, 9, 10, 11);

    /* Serialise s1 and s2 before s1 is moved to the NTT domain */
    for (idx = 0; idx < L; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk_s1 + idx * POLYETA_PACKEDBYTES, &s1.vec[idx]);
    }
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk_s2 + idx * POLYETA_PACKEDBYTES, &s2.vec[idx]);
    }

    PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1);

    /* One row of t = A*s1 + s2 at a time */
    for (idx = 0; idx < K; idx++) {
        polyvec_matrix_expand_row(&row, rowbuf, rho, idx);

        /* <row, s1>, brought back from the NTT domain */
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1);
        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&t1);

        /* plus the matching error polynomial */
        PQCLEAN_DILITHIUM3_AVX2_poly_add(&t1, &t1, &s2.vec[idx]);

        /* split into t1 (public) and t0 (secret) and serialise both */
        PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&t1);
        PQCLEAN_DILITHIUM3_AVX2_poly_power2round(&t1, &t0, &t1);
        PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(pk_t1 + idx * POLYT1_PACKEDBYTES, &t1);
        PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(sk_t0 + idx * POLYT0_PACKEDBYTES, &t0);
    }

    /* tr = CRH(pk), stored in the secret key */
    crh(sk_tr, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES);

    return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature
*
* Description: Computes signature. The signature layout is
* challenge seed (SEEDBYTES) | z (L packed polynomials) |
* hint (OMEGA index bytes followed by K cumulative counts).
* Signing restarts from label rej, with fresh nonces, whenever
* one of the norm or hint-count checks fails.
*
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES);
* also used as scratch space while signing
* - size_t *siglen: pointer to output length of signature
* - const uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
unsigned int i, n, pos;
/* rho | tr | key | mu | rhoprime, laid out back to back */
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint8_t hintbuf[N];
/* hint section of the signature, after the challenge seed and z */
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
uint64_t nonce = 0;
polyvecl mat[K], s1, z;
polyveck t0, s2, w1;
poly c, tmp;
/* y is only needed until the matrix product is done; w0 then reuses its storage */
union {
polyvecl y;
polyveck w0;
} tmpv;
shake256incctx state;

rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM3_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);

/* Compute CRH(tr, msg) */
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* rhoprime = CRH(key | mu); key and mu are adjacent in seedbuf */
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(mat, rho);
PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t0);


rej:
/* Sample intermediate vector y */
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3],
rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3);
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(&z.vec[4], rhoprime, nonce + 4);
/* five nonces consumed per attempt, one per polynomial of y */
nonce += 5;

/* Matrix-vector product */
tmpv.y = z;
PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&tmpv.y);
PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y);
PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(&w1);

/* Decompose w and call the random oracle */
PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1);
/* packed w1 is staged in the signature buffer, hashed, then overwritten
 * by the challenge seed squeezed below */
PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(sig, &w1);

shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(sig, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
PQCLEAN_DILITHIUM3_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&c);

/* Compute z, reject if it reveals secret */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]);
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM3_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp);
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&z.vec[i]);
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) {
goto rej;
}
}

/* Zero hint vector in signature */
pos = 0;
memset(hint, 0, OMEGA);

for (i = 0; i < K; i++) {
/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]);
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM3_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&tmpv.w0.vec[i]);
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) {
goto rej;
}

/* Compute hints */
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]);
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&tmp);
if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&tmp, GAMMA2)) {
goto rej;
}

PQCLEAN_DILITHIUM3_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
n = PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]);
/* at most OMEGA hint indices fit in the signature across all K rows */
if (pos + n > OMEGA) {
goto rej;
}

/* Store hints in signature */
memcpy(&hint[pos], hintbuf, n);
/* byte OMEGA + i records the running total of hints after row i */
hint[OMEGA + i] = pos = pos + n;
}

/* Pack z into signature */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM3_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]);
}

*siglen = PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES;
return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign
*
* Description: Compute signed message: the signature followed by a copy of
*              the message.
*
* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
*                             array with PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + mlen bytes),
*                             can be equal to m
*              - size_t *smlen: pointer to output length of signed
*                               message
*              - const uint8_t *m: pointer to message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
    uint8_t *const payload = sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES;
    size_t left = mlen;

    /* Copy the message behind the signature slot, last byte first, so the
     * copy is still correct when sm and m are the same buffer. */
    while (left > 0) {
        left--;
        payload[left] = m[left];
    }

    /* Sign the relocated copy; the signature lands at the start of sm. */
    PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(sm, smlen, payload, mlen, sk);
    *smlen += mlen;
    return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify
*
* Description: Verifies signature. The packed w1 vector is rebuilt one row
* at a time from the public key, z and the hints, hashed together
* with mu, and the result is compared with the challenge seed
* stored in the first SEEDBYTES of the signature.
*
* Arguments: - const uint8_t *sig: pointer to input signature
* - size_t siglen: length of signature
* - const uint8_t *m: pointer to message
* - size_t mlen: length of message
* - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signature could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) {
unsigned int i, j, pos = 0;
/* PQCLEAN_DILITHIUM3_AVX2_polyw1_pack writes additional 14 bytes */
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf;
uint8_t mu[CRHBYTES];
/* hint section of the signature, after the challenge seed and z */
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
polyvecl rowbuf[2];
polyvecl *row = rowbuf;
polyvecl z;
poly c, w1, h;
shake256incctx state;

if (siglen != PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) {
return -1;
}

/* Compute CRH(CRH(rho, t1), msg) */
crh(mu, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES);
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* Expand the challenge polynomial c from the seed at the start of sig */
PQCLEAN_DILITHIUM3_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&c);

/* Unpack z; shortness follows from unpacking */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES);
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&z.vec[i]);
}


for (i = 0; i < K; i++) {
/* Expand matrix row (rho is the first SEEDBYTES of pk) */
polyvec_matrix_expand_row(&row, rowbuf, pk, i);

/* Compute i-th row of Az - c2^Dt1 */
PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z);

/* h is used as scratch for c*2^D*t1 here and reused for the hint below */
PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES);
PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(&h);
PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&h);
PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&h, &c, &h);

PQCLEAN_DILITHIUM3_AVX2_poly_sub(&w1, &w1, &h);
PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&w1);
PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&w1);

/* Get hint polynomial and reconstruct w1 */
memset(h.vec, 0, sizeof(poly));
/* cumulative hint counts must be non-decreasing and at most OMEGA */
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) {
return -1;
}

for (j = pos; j < hint[OMEGA + i]; ++j) {
/* Coefficients are ordered for strong unforgeability */
if (j > pos && hint[j] <= hint[j - 1]) {
return -1;
}
h.coeffs[hint[j]] = 1;
}
pos = hint[OMEGA + i];

PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&w1);
PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w1, &w1, &h);
PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1);
}

/* Extra indices are zero for strong unforgeability */
for (j = pos; j < OMEGA; ++j) {
if (hint[j]) {
return -1;
}
}

/* Call random oracle and verify the challenge seed */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
/* the recomputed seed overwrites the start of buf */
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
for (i = 0; i < SEEDBYTES; ++i) {
if (buf.coeffs[i] != sig[i]) {
return -1;
}
}

return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open
*
* Description: Verify signed message. On success the message that follows
*              the signature is copied to m. On failure *mlen is set to
*              (size_t)-1 and the first smlen bytes of m are zeroed.
*
* Arguments:   - uint8_t *m: pointer to output message (allocated
*                            array with smlen bytes), can be equal to sm
*              - size_t *mlen: pointer to output length of message
*              - const uint8_t *sm: pointer to signed message
*              - size_t smlen: length of signed message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) {
    size_t idx;
    int accepted = 0;

    /* A signed message can only be valid if it is at least one signature long. */
    if (smlen >= PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) {
        *mlen = smlen - PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES;
        accepted = (PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, *mlen, pk) == 0);
    }

    if (accepted) {
        /* Forward copy: correct even when m and sm are the same buffer. */
        for (idx = 0; idx < *mlen; ++idx) {
            m[idx] = sm[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + idx];
        }
        return 0;
    }

    /* Verification failed: report an invalid length and wipe the output. */
    *mlen = (size_t) -1;
    for (idx = 0; idx < smlen; ++idx) {
        m[idx] = 0;
    }

    return -1;
}

+ 29
- 0
crypto_sign/dilithium/dilithium3/avx2/sign.h 查看文件

@@ -0,0 +1,29 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_SIGN_H
#define PQCLEAN_DILITHIUM3_AVX2_SIGN_H
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* NOTE(review): sign.c neither defines nor calls this function; it uses
 * PQCLEAN_DILITHIUM3_AVX2_poly_challenge instead. Confirm this declaration
 * is still needed. */
void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Generates a key pair into pk and sk. Returns 0. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Writes a detached signature of m[0..mlen) to sig and its length to *siglen. Returns 0. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Writes signature followed by message to sm and the total length to *smlen. Returns 0. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Checks a detached signature. Returns 0 if valid, -1 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

/* Checks a signed message and, if valid, copies the message to m. Returns 0 if valid, -1 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

#endif

+ 26
- 0
crypto_sign/dilithium/dilithium3/avx2/symmetric-shake.c 查看文件

@@ -0,0 +1,26 @@
#include "fips202.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/*
 * Initialise an incremental SHAKE128 state so that it is ready for
 * squeezing: absorbs the SEEDBYTES-byte seed followed by the 16-bit nonce
 * (low byte first) and finalises the absorb phase.
 */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake128_inc_finalize(state);
}

/*
 * Initialise an incremental SHAKE256 state so that it is ready for
 * squeezing: absorbs the CRHBYTES-byte seed followed by the 16-bit nonce
 * (low byte first) and finalises the absorb phase.
 */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = { (uint8_t) nonce, (uint8_t) (nonce >> 8) };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake256_inc_finalize(state);
}

+ 36
- 0
crypto_sign/dilithium/dilithium3/avx2/symmetric.h 查看文件

@@ -0,0 +1,36 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H
#include "fips202.h"
#include "params.h"
#include <stdint.h>



/* Stream states are the incremental SHAKE contexts from fips202.h. */
typedef shake128incctx stream128_state;
typedef shake256incctx stream256_state;

/* Absorbs seed (SEEDBYTES bytes) and the nonce into a fresh SHAKE128 state
 * and finalises it for squeezing. */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(shake128incctx *state,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);

/* Absorbs seed (CRHBYTES bytes) and the nonce into a fresh SHAKE256 state
 * and finalises it for squeezing. */
void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(shake256incctx *state,
const uint8_t seed[CRHBYTES],
uint16_t nonce);

/* Block sizes of the two streams, in bytes. */
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

/* crh: SHAKE256 of IN (INBYTES bytes), producing CRHBYTES bytes at OUT. */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
/* stream128_* / stream256_*: thin aliases for init, squeezing whole
 * rate-sized blocks, and releasing the state. */
#define stream128_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE)
#define stream128_release(STATE) shake128_inc_ctx_release(STATE)
#define stream256_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE)
#define stream256_release(STATE) shake256_inc_ctx_release(STATE)


#endif

+ 5
- 0
crypto_sign/dilithium/dilithium3/clean/LICENSE 查看文件

@@ -0,0 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 23
- 0
crypto_sign/dilithium/dilithium3/clean/Makefile.Microsoft_nmake 查看文件

@@ -0,0 +1,23 @@
# This Makefile can be used with Microsoft Visual Studio's nmake using the command:
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libdilithium3_clean.lib
OBJECTS=ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as
# long as there has been a C standard, and we need it for constant-time
# computations. Thus, we disable that spurious warning.
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146

all: $(LIBRARY)

# Make sure objects are recompiled if headers change.
$(OBJECTS): *.h

$(LIBRARY): $(OBJECTS)
LIB.EXE /NOLOGO /WX /OUT:$@ $**

clean:
-DEL $(OBJECTS)
-DEL $(LIBRARY)

+ 32
- 0
crypto_sign/dilithium/dilithium3/clean/api.h 查看文件

@@ -0,0 +1,32 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_API_H
#define PQCLEAN_DILITHIUM3_CLEAN_API_H

#include <stddef.h>
#include <stdint.h>

/* Sizes, in bytes, of the packed public key, secret key and signature. */
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES 1952
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES 4016
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES 3293

#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3"


/* Public signing API. The definitions are not part of this header;
 * see the implementation's sign.c for the exact contracts. */

/* Key generation: fills pk and sk. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature of m[0..mlen) under sk. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Detached signature verification under pk. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

/* Combined form: produces a signed message sm from m. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Combined form: verifies sm and recovers the message into m. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif

+ 98
- 0
crypto_sign/dilithium/dilithium3/clean/ntt.c 查看文件

@@ -0,0 +1,98 @@
#include "ntt.h"
#include "params.h"
#include "reduce.h"
#include <stdint.h>

/* Twiddle factors for the NTT butterflies. The forward transform reads
 * entries 1..255 in increasing order and the inverse transform reads the
 * same entries in decreasing order, negated; entry 0 is never read by
 * either. Each factor is multiplied in through montgomery_reduce, so the
 * values are presumably stored in Montgomery form -- confirm. */
static const int32_t zetas[N] = {
0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103,
2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549,
-2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005,
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
-3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221,
-1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922,
3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047,
-671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430,
-3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618,
-3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856,
189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330,
1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961,
2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462,
266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378,
900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500,
-655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838,
342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044,
2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974,
-3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970,
-1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642,
-1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031,
-542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993,
-2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385,
-3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107,
-3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078,
-426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893,
-2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687,
-554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782
};

/*************************************************
* Name: PQCLEAN_DILITHIUM3_CLEAN_ntt
*
* Description: Forward NTT, in-place. No modular reduction is performed after
* additions or subtractions. Output vector is in bitreversed order.
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_ntt(int32_t a[N]) {
unsigned int len, start, j, k;
int32_t zeta, t;

k = 0;
for (len = 128; len > 0; len >>= 1) {
for (start = 0; start < N; start = j + len) {
zeta = zetas[++k];
for (j = start; j < start + len; ++j) {
t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]);
a[j + len] = a[j] - t;
a[j] = a[j] + t;
}
}
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont
*
* Description: Inverse NTT and multiplication by Montgomery factor 2^32.
* In-place. No modular reductions after additions or
* subtractions; input coefficients need to be smaller than
* Q in absolute value. Output coefficient are smaller than Q in
* absolute value.
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(int32_t a[N]) {
unsigned int start, len, j, k;
int32_t t, zeta;
const int32_t f = 41978; // mont^2/256

k = 256;
for (len = 1; len < N; len <<= 1) {
for (start = 0; start < N; start = j + len) {
zeta = -zetas[--k];
for (j = start; j < start + len; ++j) {
t = a[j];
a[j] = t + a[j + len];
a[j + len] = t - a[j + len];
a[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]);
}
}
}

for (j = 0; j < N; ++j) {
a[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)f * a[j]);
}
}

+ 10
- 0
crypto_sign/dilithium/dilithium3/clean/ntt.h 查看文件

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_NTT_H
#define PQCLEAN_DILITHIUM3_CLEAN_NTT_H
#include "params.h"
#include <stdint.h>

void PQCLEAN_DILITHIUM3_CLEAN_ntt(int32_t a[N]);

void PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(int32_t a[N]);

#endif

+ 261
- 0
crypto_sign/dilithium/dilithium3/clean/packing.c 查看文件

@@ -0,0 +1,261 @@
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"


/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_pack_pk
*
* Description: Bit-pack public key pk = (rho, t1).
*
* Arguments:   - uint8_t pk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES],
                                      const uint8_t rho[SEEDBYTES],
                                      const polyveck *t1) {
    uint8_t *out = pk;
    unsigned int idx;

    /* rho occupies the first SEEDBYTES bytes */
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        *out++ = rho[idx];
    }

    /* followed by the K packed polynomials of t1 */
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(out, &t1->vec[idx]);
        out += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_unpack_pk
*
* Description: Unpack public key pk = (rho, t1).
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - polyveck *t1: pointer to output vector t1
*              - const uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES],
                                        polyveck *t1,
                                        const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]) {
    const uint8_t *in = pk;
    unsigned int idx;

    /* rho is the first SEEDBYTES bytes */
    for (idx = 0; idx < SEEDBYTES; ++idx) {
        rho[idx] = *in++;
    }

    /* followed by the K packed polynomials of t1 */
    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(&t1->vec[idx], in);
        in += POLYT1_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_pack_sk
*
* Description: Bit-pack secret key. The fields are written in the order
*              rho, key, tr, s1, s2, t0.
*
* Arguments:   - uint8_t sk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const uint8_t tr[]: byte array containing tr
*              - const uint8_t key[]: byte array containing key
*              - const polyveck *t0: pointer to vector t0
*              - const polyvecl *s1: pointer to vector s1
*              - const polyveck *s2: pointer to vector s2
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES],
                                      const uint8_t rho[SEEDBYTES],
                                      const uint8_t tr[CRHBYTES],
                                      const uint8_t key[SEEDBYTES],
                                      const polyveck *t0,
                                      const polyvecl *s1,
                                      const polyveck *s2) {
    uint8_t *out = sk;
    unsigned int idx;

    for (idx = 0; idx < SEEDBYTES; ++idx) {
        *out++ = rho[idx];
    }

    for (idx = 0; idx < SEEDBYTES; ++idx) {
        *out++ = key[idx];
    }

    for (idx = 0; idx < CRHBYTES; ++idx) {
        *out++ = tr[idx];
    }

    for (idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(out, &s1->vec[idx]);
        out += POLYETA_PACKEDBYTES;
    }

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(out, &s2->vec[idx]);
        out += POLYETA_PACKEDBYTES;
    }

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(out, &t0->vec[idx]);
        out += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_unpack_sk
*
* Description: Unpack secret key. The fields are read in the order
*              rho, key, tr, s1, s2, t0.
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - uint8_t tr[]: output byte array for tr
*              - uint8_t key[]: output byte array for key
*              - polyveck *t0: pointer to output vector t0
*              - polyvecl *s1: pointer to output vector s1
*              - polyveck *s2: pointer to output vector s2
*              - const uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
                                        uint8_t tr[CRHBYTES],
                                        uint8_t key[SEEDBYTES],
                                        polyveck *t0,
                                        polyvecl *s1,
                                        polyveck *s2,
                                        const uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES]) {
    const uint8_t *in = sk;
    unsigned int idx;

    for (idx = 0; idx < SEEDBYTES; ++idx) {
        rho[idx] = *in++;
    }

    for (idx = 0; idx < SEEDBYTES; ++idx) {
        key[idx] = *in++;
    }

    for (idx = 0; idx < CRHBYTES; ++idx) {
        tr[idx] = *in++;
    }

    for (idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s1->vec[idx], in);
        in += POLYETA_PACKEDBYTES;
    }

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s2->vec[idx], in);
        in += POLYETA_PACKEDBYTES;
    }

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(&t0->vec[idx], in);
        in += POLYT0_PACKEDBYTES;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_pack_sig
*
* Description: Bit-pack signature sig = (c, z, h). The hint h is encoded as
*              OMEGA index bytes (the positions of the non-zero coefficients,
*              row after row) followed by K bytes holding the running count
*              of indices after each row.
*
* Arguments:   - uint8_t sig[]: output byte array
*              - const uint8_t *c: pointer to challenge hash of length SEEDBYTES
*              - const polyvecl *z: pointer to vector z
*              - const polyveck *h: pointer to hint vector h
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES],
                                       const uint8_t c[SEEDBYTES],
                                       const polyvecl *z,
                                       const polyveck *h) {
    uint8_t *out = sig;
    unsigned int row, pos, used;

    for (pos = 0; pos < SEEDBYTES; ++pos) {
        *out++ = c[pos];
    }

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(out, &z->vec[row]);
        out += POLYZ_PACKEDBYTES;
    }

    /* out now points at the hint section; clear it before filling */
    for (pos = 0; pos < OMEGA + K; ++pos) {
        out[pos] = 0;
    }

    used = 0;
    for (row = 0; row < K; ++row) {
        for (pos = 0; pos < N; ++pos) {
            if (h->vec[row].coeffs[pos] != 0) {
                out[used++] = (uint8_t) pos;
            }
        }

        out[OMEGA + row] = (uint8_t) used;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_unpack_sig
*
* Description: Unpack signature sig = (c, z, h) and validate the hint
*              encoding. A signature is rejected when a row count
*              decreases or exceeds OMEGA, when the indices inside one
*              row are not strictly increasing, or when an unused
*              position byte is nonzero.
*              c and z (and part of h) have already been written when a
*              malformed hint is detected.
*
* Arguments:   - uint8_t c[]: output challenge hash (SEEDBYTES bytes)
*              - polyvecl *z: pointer to output vector z
*              - polyveck *h: pointer to output hint vector h
*              - const uint8_t sig[]: byte array containing
*                bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(uint8_t c[SEEDBYTES],
                                        polyvecl *z,
                                        polyveck *h,
                                        const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]) {
    const uint8_t *const zin = sig + SEEDBYTES;
    const uint8_t *const hin = zin + L * POLYZ_PACKEDBYTES;
    unsigned int row, pos, first, last;

    for (pos = 0; pos < SEEDBYTES; ++pos) {
        c[pos] = sig[pos];
    }

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(&z->vec[row], zin + row * POLYZ_PACKEDBYTES);
    }

    /* Decode h: positions [first, last) belong to the current row */
    first = 0;
    for (row = 0; row < K; ++row) {
        for (pos = 0; pos < N; ++pos) {
            h->vec[row].coeffs[pos] = 0;
        }

        last = hin[OMEGA + row];
        if (last < first || last > OMEGA) {
            return 1;
        }

        for (pos = first; pos < last; ++pos) {
            /* Coefficients are ordered for strong unforgeability */
            if (pos > first && hin[pos] <= hin[pos - 1]) {
                return 1;
            }
            h->vec[row].coeffs[hin[pos]] = 1;
        }

        first = last;
    }

    /* Extra indices are zero for strong unforgeability */
    for (pos = first; pos < OMEGA; ++pos) {
        if (hin[pos] != 0) {
            return 1;
        }
    }

    return 0;
}

+ 31
- 0
crypto_sign/dilithium/dilithium3/clean/packing.h 查看文件

@@ -0,0 +1,31 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_PACKING_H
#define PQCLEAN_DILITHIUM3_CLEAN_PACKING_H
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* Serialise a public key from rho and t1 (presumably pk = (rho, t1); confirm against the definition). */
void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);

/* Serialise a secret key from rho, tr, key, t0, s1 and s2. */
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2);

/* Serialise a signature sig = (c, z, h). */
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);

/* Deserialise a public key into rho and t1 (presumably the inverse of pack_pk). */
void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]);

/* Deserialise a secret key; rho, tr, key, t0, s1 and s2 are outputs. */
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES]);

/* Deserialise a signature; returns 1 if the signature is malformed and 0 otherwise. */
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]);

#endif

+ 41
- 0
crypto_sign/dilithium/dilithium3/clean/params.h 查看文件

@@ -0,0 +1,41 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H
#define PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H



/* Byte length of the seeds rho and key and of the challenge hash c. */
#define SEEDBYTES 32
/* Byte length of tr and of the seed used for gamma1 sampling. */
#define CRHBYTES 48
/* Number of coefficients in one polynomial. */
#define N 256
/* Modulus: uniform sampling accepts values below Q. */
#define Q 8380417
/* Bit width used by power2round and by the 13-bit t0 packing. */
#define D 13
/* NOTE(review): presumably used by the NTT; confirm where it is referenced. */
#define ROOT_OF_UNITY 1753

/* Number of polynomials in a polyveck. */
#define K 6
/* Number of polynomials in a polyvecl. */
#define L 5
/* s1/s2 coefficients are packed as values in [-ETA, ETA]. */
#define ETA 4
/* Number of nonzero coefficients in the challenge polynomial. */
#define TAU 49
/* Numerically equal to TAU * ETA. */
#define BETA 196
/* z coefficients are packed as values in [-(GAMMA1 - 1), GAMMA1]. */
#define GAMMA1 (1 << 19)
/* NOTE(review): presumably the rounding range for decompose; confirm in the rounding code. */
#define GAMMA2 ((Q-1)/32)
/* Number of hint position bytes in a signature; larger hint counts are rejected on unpack. */
#define OMEGA 55
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3"


/* Packed polynomial sizes in bytes. */
/* t1: 10 bits per coefficient (N * 10 / 8). */
#define POLYT1_PACKEDBYTES 320
/* t0: 13 bits per coefficient (N * 13 / 8). */
#define POLYT0_PACKEDBYTES 416
/* Hint vector: OMEGA position bytes plus one count byte per row. */
#define POLYVECH_PACKEDBYTES (OMEGA + K)

/* z: 20 bits per coefficient (N * 20 / 8). */
#define POLYZ_PACKEDBYTES 640

/* w1: 4 bits per coefficient (N * 4 / 8). */
#define POLYW1_PACKEDBYTES 128

/* eta-bounded polynomials: 4 bits per coefficient (N * 4 / 8). */
#define POLYETA_PACKEDBYTES 128

/* Sizes of the serialised public key, secret key and signature in bytes. */
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+ L*POLYETA_PACKEDBYTES \
+ K*POLYETA_PACKEDBYTES \
+ K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif

+ 818
- 0
crypto_sign/dilithium/dilithium3/clean/poly.c 查看文件

@@ -0,0 +1,818 @@
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "rounding.h"
#include "symmetric.h"
#include <stdint.h>

/* Presumably benchmarking hooks; as defined here both macros expand to
 * nothing, so their arguments are never evaluated. */
#define DBENCH_START()
#define DBENCH_STOP(t)

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_reduce
*
* Description: Replace every coefficient of the polynomial by
*              PQCLEAN_DILITHIUM3_CLEAN_reduce32 of itself. Per the
*              contract of reduce32 (not visible here) the results are
*              representatives in [-6283009,6283007].
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a) {
    int32_t *coef = a->coeffs;
    int32_t *const stop = coef + N;
    DBENCH_START();

    while (coef != stop) {
        *coef = PQCLEAN_DILITHIUM3_CLEAN_reduce32(*coef);
        ++coef;
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_caddq
*
* Description: Apply PQCLEAN_DILITHIUM3_CLEAN_caddq to every coefficient
*              in place, i.e. add Q to each coefficient that is
*              negative.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(poly *a) {
    int32_t *coef = a->coeffs;
    int32_t *const stop = coef + N;
    DBENCH_START();

    while (coef != stop) {
        *coef = PQCLEAN_DILITHIUM3_CLEAN_caddq(*coef);
        ++coef;
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_freeze
*
* Description: Apply PQCLEAN_DILITHIUM3_CLEAN_freeze to every coefficient
*              in place, reducing each one to its standard
*              representative.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) {
    int32_t *coef = a->coeffs;
    int32_t *const stop = coef + N;
    DBENCH_START();

    while (coef != stop) {
        *coef = PQCLEAN_DILITHIUM3_CLEAN_freeze(*coef);
        ++coef;
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_add
*
* Description: Coefficient-wise sum c = a + b. The sums are stored as
*              they are; no modular reduction takes place.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first summand
*              - const poly *b: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        const int32_t lhs = a->coeffs[idx];
        const int32_t rhs = b->coeffs[idx];

        c->coeffs[idx] = lhs + rhs;
        ++idx;
    }

    DBENCH_STOP(*tadd);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_sub
*
* Description: Coefficient-wise difference c = a - b. The differences
*              are stored as they are; no modular reduction takes place.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to the minuend polynomial
*              - const poly *b: pointer to the polynomial that is
*                subtracted from a
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        const int32_t lhs = a->coeffs[idx];
        const int32_t rhs = b->coeffs[idx];

        c->coeffs[idx] = lhs - rhs;
        ++idx;
    }

    DBENCH_STOP(*tadd);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl
*
* Description: Multiply polynomial by 2^D without modular reduction. Assumes
*              input coefficients to be less than 2^{31-D} in absolute value,
*              so that every product fits in an int32_t.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N; ++i) {
        /* Multiply rather than shift: the contract allows negative
         * coefficients, and `<<` on a negative signed value is undefined
         * behaviour in C. For in-range inputs the result is identical. */
        a->coeffs[i] *= (1 << D);
    }

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_ntt
*
* Description: In-place forward NTT of a polynomial; forwards the
*              coefficient array to PQCLEAN_DILITHIUM3_CLEAN_ntt.
*              Coefficients can grow by 8*Q in absolute value.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(poly *a) {
    int32_t *const coeffs = a->coeffs;
    DBENCH_START();

    PQCLEAN_DILITHIUM3_CLEAN_ntt(coeffs);

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont
*
* Description: In-place inverse NTT combined with a multiplication by
*              2^{32}; forwards the coefficient array to
*              PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont.
*              Input coefficients need to be less than Q in absolute
*              value and output coefficients are again bounded by Q.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(poly *a) {
    int32_t *const coeffs = a->coeffs;
    DBENCH_START();

    PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(coeffs);

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery
*
* Description: Coefficient-wise product of two polynomials in NTT domain
*              representation. Each 64-bit product is passed through
*              PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce, which
*              multiplies the result by 2^{-32}.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        /* Widen before multiplying so the product cannot overflow 32 bits */
        const int64_t product = (int64_t)a->coeffs[idx] * b->coeffs[idx];

        c->coeffs[idx] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(product);
        ++idx;
    }

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_power2round
*
* Description: Apply PQCLEAN_DILITHIUM3_CLEAN_power2round to every
*              coefficient c of the input: c mod Q = c1*2^D + c0 with
*              -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
*              standard representatives.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) {
    int32_t *const high = a1->coeffs;
    int32_t *const low = a0->coeffs;
    const int32_t *const in = a->coeffs;
    unsigned int idx;
    DBENCH_START();

    for (idx = 0; idx < N; ++idx) {
        high[idx] = PQCLEAN_DILITHIUM3_CLEAN_power2round(low + idx, in[idx]);
    }

    DBENCH_STOP(*tround);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_decompose
*
* Description: Apply PQCLEAN_DILITHIUM3_CLEAN_decompose to every
*              coefficient c of the input, splitting it into high and
*              low bits c1, c0 such that c mod Q = c1*ALPHA + c0 with
*              -ALPHA/2 < c0 <= ALPHA/2, except when c1 = (Q-1)/ALPHA,
*              where c1 is set to 0 and
*              -ALPHA/2 <= c0 = c mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) {
    int32_t *const high = a1->coeffs;
    int32_t *const low = a0->coeffs;
    const int32_t *const in = a->coeffs;
    unsigned int idx;
    DBENCH_START();

    for (idx = 0; idx < N; ++idx) {
        high[idx] = PQCLEAN_DILITHIUM3_CLEAN_decompose(low + idx, in[idx]);
    }

    DBENCH_STOP(*tround);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint
*
* Description: Compute the hint polynomial by applying
*              PQCLEAN_DILITHIUM3_CLEAN_make_hint to each pair of low
*              and high coefficients. A hint coefficient indicates
*              whether the low bits of the corresponding input
*              coefficient overflow into the high bits.
*
* Arguments:   - poly *h: pointer to output hint polynomial
*              - const poly *a0: pointer to low part of input polynomial
*              - const poly *a1: pointer to high part of input polynomial
*
* Returns the sum of all hint coefficients (the number of 1 bits).
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) {
    unsigned int idx;
    unsigned int ones = 0;
    DBENCH_START();

    for (idx = 0; idx < N; ++idx) {
        const int32_t bit = PQCLEAN_DILITHIUM3_CLEAN_make_hint(a0->coeffs[idx], a1->coeffs[idx]);

        h->coeffs[idx] = bit;
        ones += bit;
    }

    DBENCH_STOP(*tround);
    return ones;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint
*
* Description: Correct the high bits of a polynomial with a hint
*              polynomial by applying PQCLEAN_DILITHIUM3_CLEAN_use_hint
*              to every coefficient.
*
* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
*              - const poly *a: pointer to input polynomial
*              - const poly *h: pointer to input hint polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) {
    unsigned int idx = 0;
    DBENCH_START();

    while (idx < N) {
        b->coeffs[idx] = PQCLEAN_DILITHIUM3_CLEAN_use_hint(a->coeffs[idx], h->coeffs[idx]);
        ++idx;
    }

    DBENCH_STOP(*tround);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm
*
* Description: Check infinity norm of polynomial against given bound.
*              Assumes input coefficients were reduced by
*              PQCLEAN_DILITHIUM3_CLEAN_reduce32().
*
* Arguments:   - const poly *a: pointer to polynomial
*              - int32_t B: norm bound
*
* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise
* (including the case B > (Q-1)/8).
**************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, int32_t B) {
    unsigned int idx;
    int exceeded = 0;
    DBENCH_START();

    if (B > (Q - 1) / 8) {
        return 1;
    }

    /* Revealing which coefficient breaks the bound is acceptable because
       that probability is independent of secret data; the sign of the
       centralized representative, however, must not leak. The absolute
       value is therefore computed without branching. */
    for (idx = 0; idx < N; ++idx) {
        const int32_t value = a->coeffs[idx];
        const int32_t sign = value >> 31;
        const int32_t magnitude = value - (sign & 2 * value);

        if (magnitude >= B) {
            exceeded = 1;
            break;
        }
    }

    DBENCH_STOP(*tsample);
    return exceeded;
}

/*************************************************
* Name:        rej_uniform
*
* Description: Sample uniformly random coefficients in [0, Q-1] by
*              rejection sampling. Each candidate is built from three
*              consecutive bytes (little endian) with the top bit
*              cleared, giving a 23-bit value that is kept only if it is
*              smaller than Q.
*
* Arguments:   - int32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const uint8_t *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_uniform(int32_t *a,
                                unsigned int len,
                                const uint8_t *buf,
                                unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int offset = 0;
    DBENCH_START();

    while (accepted < len && offset + 3 <= buflen) {
        uint32_t candidate = (uint32_t)buf[offset]
                             | ((uint32_t)buf[offset + 1] << 8)
                             | ((uint32_t)buf[offset + 2] << 16);

        offset += 3;
        candidate &= 0x7FFFFF;

        if (candidate < Q) {
            a[accepted] = candidate;
            ++accepted;
        }
    }

    DBENCH_STOP(*tsample);
    return accepted;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform
*
* Description: Sample polynomial with uniformly random coefficients
*              in [0,Q-1] by performing rejection sampling on the
*              stream128 output initialised with (seed, nonce).
*              POLY_UNIFORM_NBLOCKS blocks are squeezed up front; if
*              they do not yield N coefficients, further blocks are
*              squeezed one at a time. Bytes left over from an
*              incomplete 3-byte group are carried to the front of the
*              buffer before each refill.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce) {
    /* Two spare bytes leave room for the carried-over remainder */
    uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2];
    unsigned int avail = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES;
    unsigned int filled, carry, k;
    stream128_state state;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);

    filled = rej_uniform(a->coeffs, N, buf, avail);

    while (filled < N) {
        carry = avail % 3;
        for (k = 0; k < carry; ++k) {
            buf[k] = buf[avail - carry + k];
        }

        stream128_squeezeblocks(buf + carry, 1, &state);
        avail = STREAM128_BLOCKBYTES + carry;
        filled += rej_uniform(a->coeffs + filled, N - filled, buf, avail);
    }
    stream128_release(&state);
}

/*************************************************
* Name:        rej_eta
*
* Description: Sample uniformly random coefficients in [-ETA, ETA] by
*              performing rejection sampling on array of random bytes.
*              Every byte yields two 4-bit candidates; a candidate t is
*              accepted when t < 9 and is mapped to 4 - t. The literals
*              4 and 9 correspond to ETA and 2*ETA + 1 for this
*              parameter set (ETA = 4).
*
* Arguments:   - int32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const uint8_t *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_eta(int32_t *a,
                            unsigned int len,
                            const uint8_t *buf,
                            unsigned int buflen) {
    unsigned int ctr, pos;
    int32_t t0, t1;
    DBENCH_START();

    ctr = pos = 0;
    while (ctr < len && pos < buflen) {
        /* Signed nibbles: 4 - t is then computed directly in int32_t
         * instead of wrapping around in unsigned arithmetic and relying
         * on an implementation-defined conversion back to signed. */
        t0 = buf[pos] & 0x0F;
        t1 = buf[pos++] >> 4;

        if (t0 < 9) {
            a[ctr++] = 4 - t0;
        }
        /* The low nibble may have filled the last free slot */
        if (t1 < 9 && ctr < len) {
            a[ctr++] = 4 - t1;
        }
    }

    DBENCH_STOP(*tsample);
    return ctr;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta
*
* Description: Sample polynomial with uniformly random coefficients
*              in [-ETA,ETA] by performing rejection sampling on the
*              stream128 output initialised with (seed, nonce).
*              POLY_UNIFORM_ETA_NBLOCKS blocks are squeezed up front;
*              further blocks are squeezed one at a time until N
*              coefficients have been accepted.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a,
        const uint8_t seed[SEEDBYTES],
        uint16_t nonce) {
    uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES];
    const unsigned int initial_len = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES;
    unsigned int filled;
    stream128_state state;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);

    filled = rej_eta(a->coeffs, N, buf, initial_len);

    /* Top up one block at a time while coefficients are still missing */
    while (filled < N) {
        stream128_squeezeblocks(buf, 1, &state);
        filled += rej_eta(a->coeffs + filled, N - filled, buf, STREAM128_BLOCKBYTES);
    }
    stream128_release(&state);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1
*
* Description: Sample polynomial with uniformly random coefficients
*              in [-(GAMMA1 - 1), GAMMA1] by squeezing enough stream256
*              blocks (initialised with seed and nonce) to cover
*              POLYZ_PACKEDBYTES bytes and unpacking them with
*              PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
*              - uint16_t nonce: 16-bit nonce
**************************************************/
#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(poly *a,
        const uint8_t seed[CRHBYTES],
        uint16_t nonce) {
    stream256_state xof;
    uint8_t packed[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES];

    stream256_init(&xof, seed, nonce);
    stream256_squeezeblocks(packed, POLY_UNIFORM_GAMMA1_NBLOCKS, &xof);
    stream256_release(&xof);

    /* Only the first POLYZ_PACKEDBYTES bytes of the buffer are consumed */
    PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(a, packed);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_poly_challenge
*
* Description: Implementation of H. Samples polynomial with TAU nonzero
*              coefficients in {-1,1} using the output stream of
*              SHAKE256(seed). The first 8 stream bytes supply the sign
*              bits; the following bytes are rejection-sampled to pick
*              the positions.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) {
    unsigned int i, b, pos;
    uint64_t signs;
    uint8_t buf[SHAKE256_RATE];
    shake256incctx state;

    shake256_inc_init(&state);
    shake256_inc_absorb(&state, seed, SEEDBYTES);
    shake256_inc_finalize(&state);
    shake256_inc_squeeze(buf, sizeof buf, &state);

    /* 64 sign bits, little endian; one bit is consumed per nonzero coefficient */
    signs = 0;
    for (i = 0; i < 8; ++i) {
        signs |= (uint64_t)buf[i] << 8 * i;
    }
    pos = 8;

    for (i = 0; i < N; ++i) {
        c->coeffs[i] = 0;
    }
    for (i = N - TAU; i < N; ++i) {
        /* Draw a position b <= i, squeezing a new block when the buffer is exhausted */
        do {
            if (pos >= SHAKE256_RATE) {
                shake256_inc_squeeze(buf, sizeof buf, &state);
                pos = 0;
            }

            b = buf[pos++];
        } while (b > i);

        /* Move the old entry of slot b to slot i, then place +-1 in slot b */
        c->coeffs[i] = c->coeffs[b];
        /* Cast the sign bit to a signed type first so that 1 - 2*bit is
         * evaluated in int32_t rather than wrapping around in uint64_t. */
        c->coeffs[b] = 1 - 2 * (int32_t)(signs & 1);
        signs >>= 1;
    }
    shake256_inc_ctx_release(&state);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack
*
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
*              Each coefficient x is stored as ETA - x in 4 bits, two
*              coefficients per byte with the even-indexed one in the
*              low nibble.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYETA_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a) {
    unsigned int idx;
    DBENCH_START();

    for (idx = 0; idx < N / 2; ++idx) {
        const int32_t *pair = &a->coeffs[2 * idx];
        const uint8_t low = (uint8_t) (ETA - pair[0]);
        const uint8_t high = (uint8_t) (ETA - pair[1]);

        r[idx] = (uint8_t) (low | (high << 4));
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack
*
* Description: Unpack polynomial with coefficients in [-ETA,ETA].
*              Each byte holds two 4-bit values t (even-indexed
*              coefficient in the low nibble); the coefficient is
*              ETA - t. Nibble values are not range-checked.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) {
    unsigned int idx;
    DBENCH_START();

    for (idx = 0; idx < N / 2; ++idx) {
        const uint8_t byte = a[idx];
        int32_t *pair = &r->coeffs[2 * idx];

        pair[0] = ETA - (byte & 0x0F);
        pair[1] = ETA - (byte >> 4);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack
*
* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
*              Four coefficients are packed into five bytes, least
*              significant bits first.
*              Input coefficients are assumed to be standard representatives.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYT1_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a) {
    unsigned int group;
    DBENCH_START();

    for (group = 0; group < N / 4; ++group) {
        const int32_t *src = &a->coeffs[4 * group];
        uint8_t *dst = &r[5 * group];

        dst[0] = (uint8_t) (src[0] >> 0);
        dst[1] = (uint8_t) ((src[0] >> 8) | (src[1] << 2));
        dst[2] = (uint8_t) ((src[1] >> 6) | (src[2] << 4));
        dst[3] = (uint8_t) ((src[2] >> 4) | (src[3] << 6));
        dst[4] = (uint8_t) (src[3] >> 2);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack
*
* Description: Unpack polynomial t1 with 10-bit coefficients: every five
*              input bytes yield four coefficients, each masked to
*              10 bits.
*              Output coefficients are standard representatives.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) {
    unsigned int group;
    DBENCH_START();

    for (group = 0; group < N / 4; ++group) {
        const uint8_t *src = &a[5 * group];
        int32_t *dst = &r->coeffs[4 * group];

        dst[0] = ((src[0] >> 0) | ((uint32_t)src[1] << 8)) & 0x3FF;
        dst[1] = ((src[1] >> 2) | ((uint32_t)src[2] << 6)) & 0x3FF;
        dst[2] = ((src[2] >> 4) | ((uint32_t)src[3] << 4)) & 0x3FF;
        dst[3] = ((src[3] >> 6) | ((uint32_t)src[4] << 2)) & 0x3FF;
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack
*
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
*              Each coefficient x is stored as 2^{D-1} - x in 13 bits;
*              eight coefficients fill thirteen bytes, least significant
*              bits first.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYT0_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a) {
    unsigned int group, k;
    uint32_t t[8];
    DBENCH_START();

    for (group = 0; group < N / 8; ++group) {
        const int32_t *src = &a->coeffs[8 * group];
        uint8_t *dst = &r[13 * group];

        /* Map every coefficient to its non-negative 13-bit encoding */
        for (k = 0; k < 8; ++k) {
            t[k] = (1 << (D - 1)) - src[k];
        }

        dst[0]  = (uint8_t) t[0];
        dst[1]  = (uint8_t) ((t[0] >> 8) | (t[1] << 5));
        dst[2]  = (uint8_t) (t[1] >> 3);
        dst[3]  = (uint8_t) ((t[1] >> 11) | (t[2] << 2));
        dst[4]  = (uint8_t) ((t[2] >> 6) | (t[3] << 7));
        dst[5]  = (uint8_t) (t[3] >> 1);
        dst[6]  = (uint8_t) ((t[3] >> 9) | (t[4] << 4));
        dst[7]  = (uint8_t) (t[4] >> 4);
        dst[8]  = (uint8_t) ((t[4] >> 12) | (t[5] << 1));
        dst[9]  = (uint8_t) ((t[5] >> 7) | (t[6] << 6));
        dst[10] = (uint8_t) (t[6] >> 2);
        dst[11] = (uint8_t) ((t[6] >> 10) | (t[7] << 3));
        dst[12] = (uint8_t) (t[7] >> 5);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack
*
* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
*              Every thirteen input bytes hold eight 13-bit values t;
*              the coefficient is 2^{D-1} - t.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) {
    unsigned int group, k;
    uint32_t t[8];
    DBENCH_START();

    for (group = 0; group < N / 8; ++group) {
        const uint8_t *src = &a[13 * group];
        int32_t *dst = &r->coeffs[8 * group];

        /* Reassemble the eight 13-bit fields */
        t[0] = ((uint32_t)src[0]
                | ((uint32_t)src[1] << 8)) & 0x1FFF;
        t[1] = ((uint32_t)(src[1] >> 5)
                | ((uint32_t)src[2] << 3)
                | ((uint32_t)src[3] << 11)) & 0x1FFF;
        t[2] = ((uint32_t)(src[3] >> 2)
                | ((uint32_t)src[4] << 6)) & 0x1FFF;
        t[3] = ((uint32_t)(src[4] >> 7)
                | ((uint32_t)src[5] << 1)
                | ((uint32_t)src[6] << 9)) & 0x1FFF;
        t[4] = ((uint32_t)(src[6] >> 4)
                | ((uint32_t)src[7] << 4)
                | ((uint32_t)src[8] << 12)) & 0x1FFF;
        t[5] = ((uint32_t)(src[8] >> 1)
                | ((uint32_t)src[9] << 7)) & 0x1FFF;
        t[6] = ((uint32_t)(src[9] >> 6)
                | ((uint32_t)src[10] << 2)
                | ((uint32_t)src[11] << 10)) & 0x1FFF;
        t[7] = ((uint32_t)(src[11] >> 3)
                | ((uint32_t)src[12] << 5)) & 0x1FFF;

        /* Undo the 2^{D-1} - x mapping applied when packing */
        for (k = 0; k < 8; ++k) {
            dst[k] = (1 << (D - 1)) - (int32_t)t[k];
        }
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyz_pack
*
* Description: Bit-pack polynomial with coefficients
*              in [-(GAMMA1 - 1), GAMMA1]. Each coefficient x is stored
*              as GAMMA1 - x in 20 bits; two coefficients fill five
*              bytes, least significant bits first.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYZ_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a) {
    unsigned int pair;
    DBENCH_START();

    for (pair = 0; pair < N / 2; ++pair) {
        const int32_t *src = &a->coeffs[2 * pair];
        uint8_t *dst = &r[5 * pair];
        const uint32_t even = GAMMA1 - src[0];
        const uint32_t odd = GAMMA1 - src[1];

        dst[0] = (uint8_t) even;
        dst[1] = (uint8_t) (even >> 8);
        dst[2] = (uint8_t) ((even >> 16) | (odd << 4));
        dst[3] = (uint8_t) (odd >> 4);
        dst[4] = (uint8_t) (odd >> 12);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack
*
* Description: Unpack polynomial z with coefficients
*              in [-(GAMMA1 - 1), GAMMA1]. Every five input bytes hold
*              two 20-bit values t; the coefficient is GAMMA1 - t.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N / 2; ++i) {
        /* Even coefficient: bytes 0-1 plus the low nibble of byte 2 */
        r->coeffs[2 * i + 0] = a[5 * i + 0];
        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16;
        r->coeffs[2 * i + 0] &= 0xFFFFF;

        /* Odd coefficient: the high nibble of byte 2 plus bytes 3-4.
         * These pieces already span exactly 20 bits, so the mask below
         * does not change the value; it is applied to the odd
         * coefficient (not the even one a second time) so that both
         * halves are handled alike. */
        r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4;
        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;
        r->coeffs[2 * i + 1] &= 0xFFFFF;

        /* Undo the GAMMA1 - x mapping applied when packing */
        r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0];
        r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1];
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack
*
* Description: Bit-pack polynomial w1 using 4 bits per coefficient, two
*              coefficients per byte with the even-indexed one in the
*              low nibble. This layout only round-trips for coefficients
*              in [0,15].
*              Input coefficients are assumed to be standard representatives.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYW1_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a) {
    unsigned int idx;
    DBENCH_START();

    for (idx = 0; idx < N / 2; ++idx) {
        const int32_t low = a->coeffs[2 * idx + 0];
        const int32_t high = a->coeffs[2 * idx + 1];

        r[idx] = (uint8_t) (low | (high << 4));
    }

    DBENCH_STOP(*tpack);
}

+ 53
- 0
crypto_sign/dilithium/dilithium3/clean/poly.h 查看文件

@@ -0,0 +1,53 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLY_H
#define PQCLEAN_DILITHIUM3_CLEAN_POLY_H
#include "params.h"
#include <stdint.h>

/* A polynomial stored as N signed 32-bit coefficients. */
typedef struct {
int32_t coeffs[N];
} poly;

/* In-place coefficient reductions (reduce32, caddq and freeze applied per coefficient). */
void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a);

/* Coefficient-wise arithmetic without modular reduction; shiftl multiplies by 2^D. */
void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a);

/* NTT-domain operations: forward NTT, inverse NTT with Montgomery factor, pointwise product. */
void PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

/* Rounding and hints; make_hint returns the number of 1 bits in h. */
void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h);

/* Infinity-norm check: returns 0 if every |coefficient| < B, and 1 otherwise. */
int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, int32_t B);
/* Samplers that derive a polynomial from a seed (and a nonce where given). */
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(poly *a,
const uint8_t seed[CRHBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Bit-packing of polynomials with coefficients in [-ETA, ETA] (POLYETA_PACKEDBYTES bytes). */
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);

/* Bit-packing of t1 with 10-bit coefficients (POLYT1_PACKEDBYTES bytes). */
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a);

/* Bit-packing of t0 with 13-bit coefficients (POLYT0_PACKEDBYTES bytes). */
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a);

/* Bit-packing of z with 20-bit coefficients (POLYZ_PACKEDBYTES bytes). */
void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a);

/* Bit-packing of w1 with 4-bit coefficients (POLYW1_PACKEDBYTES bytes); pack only. */
void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a);

#endif

+ 448
- 0
crypto_sign/dilithium/dilithium3/clean/polyvec.c 查看文件

@@ -0,0 +1,448 @@
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand
 *
 * Description: Implementation of ExpandA. Fills the K x L matrix A by
 *              sampling every entry a_{i,j} with
 *              PQCLEAN_DILITHIUM3_CLEAN_poly_uniform from the seed rho.
 *              The per-entry nonce is (i << 8) + j, i.e. the row index
 *              shifted left by eight bits plus the column index.
 *
 * Arguments:   - polyvecl mat[K]: output matrix, one polyvecl per row
 *              - const uint8_t rho[]: byte array containing seed rho
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
    for (unsigned int row = 0; row < K; ++row) {
        polyvecl *dst = &mat[row];
        const unsigned int nonce_hi = row << 8;

        for (unsigned int col = 0; col < L; ++col) {
            PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(&dst->vec[col], rho, (uint16_t) (nonce_hi + col));
        }
    }
}

/*
 * Matrix-vector product in the NTT domain: for every row i of mat, store
 * the accumulated pointwise product of mat[i] and v in t->vec[i].
 */
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    for (unsigned int row = 0; row < K; ++row) {
        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(t->vec + row, mat + row, v);
    }
}

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*
 * Sample all L polynomials of v with poly_uniform_eta from the same seed.
 * Polynomial k uses nonce + k (as a uint16_t), so the nonces are consecutive.
 */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(v->vec + idx, seed, nonce);
        ++nonce;
        ++idx;
    }
}

/*
 * Sample all L polynomials of v with poly_uniform_gamma1 from the same seed.
 * Polynomial k uses the nonce L * nonce + k, truncated to 16 bits.
 */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        const uint16_t poly_nonce = (uint16_t) (L * nonce + idx);

        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(v->vec + idx, seed, poly_nonce);
    }
}

/* Apply poly_reduce in place to each of the L polynomials of v. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(v->vec + idx);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze
 *
 * Description: Bring every coefficient of every polynomial in a vector
 *              of length L to its standard representative, in place.
 *
 * Arguments:   - polyvecl *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(v->vec + idx);
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add
 *
 * Description: Component-wise sum w = u + v of two vectors of
 *              polynomials of length L. No modular reduction is
 *              performed.
 *
 * Arguments:   - polyvecl *w: pointer to output vector
 *              - const polyvecl *u: pointer to first summand
 *              - const polyvecl *v: pointer to second summand
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_add(w->vec + idx, u->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt
 *
 * Description: In-place forward NTT of every polynomial in a vector of
 *              length L. Output coefficients can be up to 16*Q larger
 *              than input coefficients.
 *
 * Arguments:   - polyvecl *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(v->vec + idx);
    }
}

/* Apply poly_invntt_tomont in place to each of the L polynomials of v. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(v->vec + idx);
        ++idx;
    }
}

/*
 * Multiply every polynomial of v pointwise by the single polynomial a,
 * writing the k-th product to r->vec[k].
 */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    for (unsigned int idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(r->vec + idx, a, v->vec + idx);
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery
 *
 * Description: Pointwise multiply vectors of polynomials of length L,
 *              multiply resulting vector by 2^{-32} and add (accumulate)
 *              polynomials in it. Input/output vectors are in NTT domain
 *              representation.
 *
 * Arguments:   - poly *w: output polynomial
 *              - const polyvecl *u: pointer to first input vector
 *              - const polyvecl *v: pointer to second input vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
        const polyvecl *u,
        const polyvecl *v) {
    poly product;
    unsigned int idx = 1;

    /* The first product initialises the accumulator directly. */
    PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(w, u->vec, v->vec);

    /* Every further product goes through a temporary and is added on. */
    while (idx < L) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(&product, u->vec + idx, v->vec + idx);
        PQCLEAN_DILITHIUM3_CLEAN_poly_add(w, w, &product);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm
 *
 * Description: Check infinity norm of polynomials in vector of length L.
 *              Assumes input polyvecl to be reduced by
 *              PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce().
 *
 * Arguments:   - const polyvecl *v: pointer to vector
 *              - int32_t bound: norm bound
 *
 * Returns 0 if norm of all polynomials is strictly smaller than
 * bound <= (Q-1)/8 and 1 otherwise. Checking stops at the first
 * polynomial that fails.
 **************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
    int exceeded = 0;

    for (unsigned int idx = 0; idx < L && !exceeded; ++idx) {
        exceeded = (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(v->vec + idx, bound) != 0);
    }

    return exceeded;
}

/**************************************************************/
/************ Vectors of polynomials of length K **************/
/**************************************************************/

/*
 * Sample all K polynomials of v with poly_uniform_eta from the same seed.
 * Polynomial k uses nonce + k (as a uint16_t), so the nonces are consecutive.
 */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(v->vec + idx, seed, nonce);
        ++nonce;
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce
 *
 * Description: Reduce, in place, the coefficients of all polynomials in
 *              a vector of length K to representatives in
 *              [-6283009,6283007].
 *
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(v->vec + idx);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq
 *
 * Description: For every coefficient of every polynomial in a vector of
 *              length K, add Q if the coefficient is negative.
 *
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(v->vec + idx);
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze
 *
 * Description: Bring every coefficient of every polynomial in a vector
 *              of length K to its standard representative, in place.
 *
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(v->vec + idx);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_add
 *
 * Description: Component-wise sum w = u + v of two vectors of
 *              polynomials of length K. No modular reduction is
 *              performed.
 *
 * Arguments:   - polyveck *w: pointer to output vector
 *              - const polyveck *u: pointer to first summand
 *              - const polyveck *v: pointer to second summand
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_add(w->vec + idx, u->vec + idx, v->vec + idx);
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub
 *
 * Description: Component-wise difference w = u - v of two vectors of
 *              polynomials of length K. No modular reduction is
 *              performed.
 *
 * Arguments:   - polyveck *w: pointer to output vector
 *              - const polyveck *u: pointer to first input vector
 *              - const polyveck *v: pointer to second input vector to be
 *                                   subtracted from first input vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_sub(w->vec + idx, u->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl
 *
 * Description: Multiply vector of polynomials of length K by 2^D without
 *              modular reduction. Assumes input coefficients to be less
 *              than 2^{31-D}.
 *
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(v->vec + idx);
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt
 *
 * Description: In-place forward NTT of every polynomial in a vector of
 *              length K. Output coefficients can be up to 16*Q larger
 *              than input coefficients.
 *
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(v->vec + idx);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont
 *
 * Description: Inverse NTT and multiplication by 2^{32} of polynomials
 *              in vector of length K, in place. Input coefficients need
 *              to be less than 2*Q.
 *
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(v->vec + idx);
    }
}

/*
 * Multiply every polynomial of v pointwise by the single polynomial a,
 * writing the k-th product to r->vec[k].
 */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(r->vec + idx, a, v->vec + idx);
        ++idx;
    }
}


/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm
 *
 * Description: Check infinity norm of polynomials in vector of length K.
 *              Assumes input polyveck to be reduced by
 *              PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce().
 *
 * Arguments:   - const polyveck *v: pointer to vector
 *              - int32_t bound: norm bound
 *
 * Returns 0 if norm of all polynomials is strictly smaller than
 * bound <= (Q-1)/8 and 1 otherwise. Checking stops at the first
 * polynomial that fails.
 **************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) {
    int exceeded = 0;

    for (unsigned int idx = 0; idx < K && !exceeded; ++idx) {
        exceeded = (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(v->vec + idx, bound) != 0);
    }

    return exceeded;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round
 *
 * Description: For all coefficients a of polynomials in vector of
 *              length K, compute a0, a1 such that
 *              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
 *              Assumes coefficients to be standard representatives.
 *
 * Arguments:   - polyveck *v1: pointer to output vector of polynomials
 *                              with coefficients a1
 *              - polyveck *v0: pointer to output vector of polynomials
 *                              with coefficients a0
 *              - const polyveck *v: pointer to input vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(v1->vec + idx, v0->vec + idx, v->vec + idx);
        ++idx;
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose
 *
 * Description: For all coefficients a of polynomials in vector of
 *              length K, compute high and low bits a0, a1 such that
 *              a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2,
 *              except when a1 = (Q-1)/ALPHA, where a1 is set to 0 and
 *              -ALPHA/2 <= a0 = a mod Q - Q < 0.
 *              Assumes coefficients to be standard representatives.
 *
 * Arguments:   - polyveck *v1: pointer to output vector of polynomials
 *                              with coefficients a1
 *              - polyveck *v0: pointer to output vector of polynomials
 *                              with coefficients a0
 *              - const polyveck *v: pointer to input vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(v1->vec + idx, v0->vec + idx, v->vec + idx);
    }
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint
 *
 * Description: Compute hint vector, one hint polynomial per entry, and
 *              sum the per-polynomial counts returned by poly_make_hint.
 *
 * Arguments:   - polyveck *h: pointer to output vector
 *              - const polyveck *v0: pointer to low part of input vector
 *              - const polyveck *v1: pointer to high part of input vector
 *
 * Returns number of 1 bits.
 **************************************************/
unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h,
        const polyveck *v0,
        const polyveck *v1) {
    unsigned int ones = 0;
    unsigned int idx = 0;

    while (idx < K) {
        ones += PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(h->vec + idx, v0->vec + idx, v1->vec + idx);
        ++idx;
    }

    return ones;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint
 *
 * Description: Use hint vector to correct the high bits of input vector.
 *
 * Arguments:   - polyveck *w: pointer to output vector of polynomials
 *                             with corrected high bits
 *              - const polyveck *u: pointer to input vector
 *              - const polyveck *h: pointer to input hint vector
 **************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(w->vec + idx, u->vec + idx, h->vec + idx);
    }
}

/*
 * Pack the K polynomials of w1 back to back into r; polynomial k occupies
 * the POLYW1_PACKEDBYTES bytes starting at r[k * POLYW1_PACKEDBYTES].
 */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) {
    uint8_t *out = r;

    for (unsigned int idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(out, w1->vec + idx);
        out += POLYW1_PACKEDBYTES;
    }
}

+ 68
- 0
crypto_sign/dilithium/dilithium3/clean/polyvec.h 查看文件

@@ -0,0 +1,68 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H
#define PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H
#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
poly vec[L];
} polyvecl;

/* Sample every entry with poly_uniform_eta; entry k uses nonce + k. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Sample every entry with poly_uniform_gamma1; entry k uses L * nonce + k. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);

/* Apply poly_reduce to every entry, in place. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(polyvecl *v);

/* Reduce every coefficient to its standard representative, in place. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v);

/* w = u + v, entry by entry, without modular reduction. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

/* NTT-domain helpers: forward/inverse transform of every entry, pointwise
 * product of every entry with one polynomial, and the accumulated pointwise
 * product of two vectors (result is a single polynomial). */
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v);


/* Returns 1 if any entry fails poly_chknorm against bound B, 0 otherwise. */
int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B);



/* Vectors of polynomials of length K */
typedef struct {
poly vec[K];
} polyveck;

/* Sample every entry with poly_uniform_eta; entry k uses nonce + k. */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* In-place coefficient reduction: reduce, conditional add of Q, and
 * reduction to standard representatives, applied to every entry. */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v);

/* Entry-wise add, subtract and multiply-by-2^D, all without modular reduction. */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v);

/* NTT-domain helpers: forward/inverse transform of every entry and
 * pointwise product of every entry with one polynomial. */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

/* Returns 1 if any entry fails poly_chknorm against bound B, 0 otherwise. */
int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B);

/* Entry-wise rounding and hint helpers; make_hint returns the summed
 * per-polynomial hint counts. */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

/* Pack the K polynomials of w1 consecutively, POLYW1_PACKEDBYTES bytes each. */
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

/* Fill the K x L matrix from seed rho (entry (i, j) uses nonce (i << 8) + j). */
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

/* t[i] = accumulated pointwise product of matrix row i with v, for all K rows. */
void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif

+ 69
- 0
crypto_sign/dilithium/dilithium3/clean/reduce.c 查看文件

@@ -0,0 +1,69 @@
#include "params.h"
#include "reduce.h"
#include <stdint.h>

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce
 *
 * Description: For finite field element a with -2^{31}Q <= a <= Q*2^31,
 *              compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
 *
 * Arguments:   - int64_t: finite field element a
 *
 * Returns r.
 **************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(int64_t a) {
    /* Low 32 bits of a * QINV; the product is formed in unsigned
     * arithmetic so that it wraps modulo 2^64. */
    const int32_t m = (int32_t)((uint64_t)a * (uint64_t)QINV);
    /* Subtract m * Q from a, then keep the upper 32 bits of the difference. */
    const int64_t diff = a - (int64_t)m * Q;

    return (int32_t)(diff >> 32);
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_reduce32
 *
 * Description: For finite field element a with a <= 2^{31} - 2^{22} - 1,
 *              compute r \equiv a (mod Q) such that
 *              -6283009 <= r <= 6283007.
 *
 * Arguments:   - int32_t: finite field element a
 *
 * Returns r.
 **************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(int32_t a) {
    /* Rounded quotient a / 2^23, used as an estimate of a / Q. */
    const int32_t quotient = (a + (1 << 22)) >> 23;

    return a - quotient * Q;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_caddq
 *
 * Description: Add Q if input coefficient is negative; done without a
 *              branch by masking Q with the sign of a.
 *
 * Arguments:   - int32_t: finite field element a
 *
 * Returns r.
 **************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_caddq(int32_t a) {
    /* Arithmetic shift of the sign bit: all ones when a < 0, else zero. */
    const int32_t sign_mask = a >> 31;

    return a + (sign_mask & Q);
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_freeze
 *
 * Description: For finite field element a, compute standard
 *              representative r = a mod^+ Q by reducing with reduce32
 *              and then conditionally adding Q.
 *
 * Arguments:   - int32_t: finite field element a
 *
 * Returns r.
 **************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(int32_t a) {
    return PQCLEAN_DILITHIUM3_CLEAN_caddq(PQCLEAN_DILITHIUM3_CLEAN_reduce32(a));
}

+ 17
- 0
crypto_sign/dilithium/dilithium3/clean/reduce.h 查看文件

@@ -0,0 +1,17 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H
#define PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H
#include "params.h"
#include <stdint.h>

/* Montgomery constants for the modulus Q (defined in params.h). */
#define MONT (-4186625) // 2^32 % Q
/* Multiplied with the input inside montgomery_reduce. */
#define QINV 58728449 // q^(-1) mod 2^32

/* Returns a * 2^{-32} mod Q, in (-Q, Q). */
int32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(int64_t a);

/* Returns a representative of a mod Q in [-6283009, 6283007]. */
int32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(int32_t a);

/* Returns a + Q if a is negative, a otherwise. */
int32_t PQCLEAN_DILITHIUM3_CLEAN_caddq(int32_t a);

/* Returns the standard representative: reduce32 followed by caddq. */
int32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(int32_t a);

#endif

+ 92
- 0
crypto_sign/dilithium/dilithium3/clean/rounding.c 查看文件

@@ -0,0 +1,92 @@
#include "params.h"
#include "rounding.h"
#include <stdint.h>

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_power2round
 *
 * Description: For finite field element a, compute a0, a1 such that
 *              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
 *              Assumes a to be standard representative.
 *
 * Arguments:   - int32_t a: input element
 *              - int32_t *a0: pointer to output element a0
 *
 * Returns a1.
 **************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(int32_t *a0, int32_t a) {
    const int32_t half = 1 << (D - 1);
    /* Adding 2^{D-1} - 1 before the shift rounds the quotient by 2^D. */
    const int32_t high = (a + half - 1) >> D;

    *a0 = a - (high << D);
    return high;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_decompose
 *
 * Description: For finite field element a, compute high and low bits
 *              a0, a1 such that a mod^+ Q = a1*ALPHA + a0 with
 *              -ALPHA/2 < a0 <= ALPHA/2, except if a1 = (Q-1)/ALPHA,
 *              where we set a1 = 0 and
 *              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0.
 *              Assumes a to be standard representative.
 *
 * Arguments:   - int32_t a: input element
 *              - int32_t *a0: pointer to output element a0
 *
 * Returns a1.
 **************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(int32_t *a0, int32_t a) {
    int32_t high;
    int32_t low;

    /* Division by ALPHA = 2*GAMMA2 via shift-and-multiply; the constants
     * 127/7/1025/22 are hard-coded for this parameter set
     * (NOTE(review): presumably tuned to GAMMA2 from params.h - confirm). */
    high = (a + 127) >> 7;
    high = (high * 1025 + (1 << 21)) >> 22;
    /* Keep the high part in 0..15; this maps 16 to 0. */
    high &= 15;

    low = a - high * 2 * GAMMA2;
    /* If low exceeds (Q-1)/2, subtract Q, without branching. */
    low -= (((Q - 1) / 2 - low) >> 31) & Q;

    *a0 = low;
    return high;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_make_hint
 *
 * Description: Compute hint bit indicating whether the low bits of the
 *              input element overflow into the high bits.
 *
 * Arguments:   - int32_t a0: low bits of input element
 *              - int32_t a1: high bits of input element
 *
 * Returns 1 if overflow.
 **************************************************/
unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(int32_t a0, int32_t a1) {
    /* Strictly outside [-GAMMA2, GAMMA2]: always an overflow. */
    if (a0 > GAMMA2) {
        return 1;
    }
    if (a0 < -GAMMA2) {
        return 1;
    }

    /* Exactly on the lower boundary: overflow only when a1 is non-zero. */
    if (a0 == -GAMMA2) {
        return a1 != 0;
    }

    return 0;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_use_hint
 *
 * Description: Correct high bits according to hint. With no hint the
 *              high bits from decompose are returned unchanged;
 *              otherwise they are moved up by one when the low bits are
 *              positive and down by one otherwise, wrapping within 0..15.
 *
 * Arguments:   - int32_t a: input element
 *              - unsigned int hint: hint bit
 *
 * Returns corrected high bits.
 **************************************************/
int32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(int32_t a, unsigned int hint) {
    int32_t low;
    int32_t high = PQCLEAN_DILITHIUM3_CLEAN_decompose(&low, a);

    if (hint != 0) {
        high = (low > 0) ? high + 1 : high - 1;
        high &= 15;
    }

    return high;
}

+ 14
- 0
crypto_sign/dilithium/dilithium3/clean/rounding.h 查看文件

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H
#define PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H
#include "params.h"
#include <stdint.h>

/* Splits a into a1*2^D + a0; returns a1 and writes a0 through the pointer. */
int32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(int32_t *a0, int32_t a);

/* Splits a into high bits (returned) and low bits (written to *a0). */
int32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(int32_t *a0, int32_t a);

/* Returns 1 if the low bits a0 overflow into the high bits, 0 otherwise. */
unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(int32_t a0, int32_t a1);

/* Returns the high bits of a, corrected by one step when hint is non-zero. */
int32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(int32_t a, unsigned int hint);

#endif

+ 343
- 0
crypto_sign/dilithium/dilithium3/clean/sign.c 查看文件

@@ -0,0 +1,343 @@
#include "fips202.h"
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"
#include <stdint.h>

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair
 *
 * Description: Generates public and private key.
 *
 * Arguments:   - uint8_t *pk: pointer to output public key (allocated
 *                             array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES bytes)
 *              - uint8_t *sk: pointer to output private key (allocated
 *                             array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES bytes)
 *
 * Returns 0 (success)
 **************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
uint8_t seedbuf[3 * SEEDBYTES];
uint8_t tr[CRHBYTES];
const uint8_t *rho, *rhoprime, *key;
polyvecl mat[K];
polyvecl s1, s1hat;
polyveck s2, t1, t0;

/* Get randomness for rho, rhoprime and key */
/* Only SEEDBYTES of fresh randomness are drawn; shake256 then expands them
 * to 3 * SEEDBYTES, with seedbuf serving as both input and output. */
randombytes(seedbuf, SEEDBYTES);
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
/* seedbuf layout after expansion: rho | rhoprime | key, SEEDBYTES each. */
rho = seedbuf;
rhoprime = seedbuf + SEEDBYTES;
key = seedbuf + 2 * SEEDBYTES;

/* Expand matrix */
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho);

/* Sample short vectors s1 and s2 */
/* s1 uses nonces 0..L-1 and s2 starts at L, so the two never share a nonce. */
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L);

/* Matrix-vector multiplication */
/* Work on a copy: s1 itself is packed into the secret key below. */
s1hat = s1;
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&s1hat);
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&t1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&t1);

/* Add error vector s2 */
PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&t1, &t1, &s2);

/* Extract t1 and write public key */
/* caddq makes the coefficients non-negative before power2round, which
 * then overwrites t1 with the high part and stores the low part in t0. */
PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&t1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(&t1, &t0, &t1);
PQCLEAN_DILITHIUM3_CLEAN_pack_pk(pk, rho, &t1);

/* Compute CRH(rho, t1) and write secret key */
/* tr is the hash of the packed public key written just above. */
crh(tr, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES);
PQCLEAN_DILITHIUM3_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2);

return 0;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature
 *
 * Description: Computes signature. Candidate signatures are generated
 *              in a loop (label rej) until all rejection checks pass;
 *              each attempt uses a fresh nonce.
 *
 * Arguments:   - uint8_t *sig: pointer to output signature (of length
 *                              PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES); also
 *                              used as scratch space during signing
 *              - size_t *siglen: pointer to output length of signature
 *              - const uint8_t *m: pointer to message to be signed
 *              - size_t mlen: length of message
 *              - const uint8_t *sk: pointer to bit-packed secret key
 *
 * Returns 0 (success)
 **************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(uint8_t *sig,
size_t *siglen,
const uint8_t *m,
size_t mlen,
const uint8_t *sk) {
unsigned int n;
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint16_t nonce = 0;
polyvecl mat[K], s1, y, z;
polyveck t0, s2, w1, w0, h;
poly cp;
shake256incctx state;

/* seedbuf layout: rho (SEEDBYTES) | tr (CRHBYTES) | key (SEEDBYTES) |
 * mu (CRHBYTES) | rhoprime (CRHBYTES). key and mu are adjacent on purpose,
 * see the crh() call below. */
rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);

/* Compute CRH(tr, msg) */
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* Hashes key || mu in one call (they are contiguous in seedbuf), so
 * rhoprime depends only on the secret key and the message. */
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho);
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&t0);

/* Rejection loop entry: every failed check below jumps back here. */
rej:
/* Sample intermediate vector y */
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++);

/* Matrix-vector multiplication */
/* y is kept untransformed for the later z = c*s1 + y; z holds NTT(y) here. */
z = y;
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&z);
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&w1);

/* Decompose w and call the random oracle */
PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(&w1, &w0, &w1);
/* sig is used as scratch for the packed w1 (K * POLYW1_PACKEDBYTES bytes).
 * NOTE(review): assumes the sig buffer is at least that large - confirm
 * against PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES. */
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(sig, &w1);

shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
/* The challenge seed overwrites the first SEEDBYTES of sig. */
shake256_inc_squeeze(sig, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(&cp, sig);
PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&cp);

/* Compute z, reject if it reveals secret */
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1);
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(&z);
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(&z, &z, &y);
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(&z);
if (PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
goto rej;
}

/* Check that subtracting cs2 does not change high bits of w and low bits
 * do not reveal secret information */
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&h);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&w0, &w0, &h);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w0);
if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
goto rej;
}

/* Compute hints for w1 */
/* h temporarily holds c*t0 and is replaced by the hint vector below. */
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&h);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&h);
if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&h, GAMMA2)) {
goto rej;
}

PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&w0, &w0, &h);
n = PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(&h, &w0, &w1);
/* Reject when the hint has more than OMEGA set positions. */
if (n > OMEGA) {
goto rej;
}

/* Write signature */
/* The challenge seed is passed from sig itself (input and output alias). */
PQCLEAN_DILITHIUM3_CLEAN_pack_sig(sig, sig, &z, &h);
*siglen = PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES;
return 0;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_crypto_sign
 *
 * Description: Compute signed message: the signature followed by a copy
 *              of the message.
 *
 * Arguments:   - uint8_t *sm: pointer to output signed message (allocated
 *                             array with PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + mlen bytes),
 *                             can be equal to m
 *              - size_t *smlen: pointer to output length of signed
 *                               message
 *              - const uint8_t *m: pointer to message to be signed
 *              - size_t mlen: length of message
 *              - const uint8_t *sk: pointer to bit-packed secret key
 *
 * Returns 0 (success)
 **************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm,
        size_t *smlen,
        const uint8_t *m,
        size_t mlen,
        const uint8_t *sk) {
    uint8_t *msg_out = sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES;
    size_t remaining = mlen;

    /* Copy from the last byte to the first so that sm may alias m. */
    while (remaining > 0) {
        --remaining;
        msg_out[remaining] = m[remaining];
    }

    /* Sign the relocated copy; the signature fills the front of sm. */
    PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(sm, smlen, msg_out, mlen, sk);
    *smlen += mlen;
    return 0;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify
 *
 * Description: Verifies signature.
 *
 * Arguments:   - const uint8_t *sig: pointer to input signature
 *              - size_t siglen: length of signature
 *              - const uint8_t *m: pointer to message
 *              - size_t mlen: length of message
 *              - const uint8_t *pk: pointer to bit-packed public key
 *
 * Returns 0 if signature could be verified correctly and -1 otherwise
 **************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(const uint8_t *sig,
size_t siglen,
const uint8_t *m,
size_t mlen,
const uint8_t *pk) {
unsigned int i;
uint8_t buf[K * POLYW1_PACKEDBYTES];
uint8_t rho[SEEDBYTES];
uint8_t mu[CRHBYTES];
uint8_t c[SEEDBYTES];
uint8_t c2[SEEDBYTES];
poly cp;
polyvecl mat[K], z;
polyveck t1, w1, h;
shake256incctx state;

/* Only signatures of exactly the fixed length are accepted. */
if (siglen != PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) {
return -1;
}

PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(rho, &t1, pk);
/* A non-zero return from unpack_sig is treated as a malformed signature. */
if (PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(c, &z, &h, sig)) {
return -1;
}
/* Same norm bound on z that the signer enforces before emitting it. */
if (PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
return -1;
}

/* Compute CRH(CRH(rho, t1), msg) */
/* mu first holds the hash of the public key, then is overwritten with
 * the hash of (that value || message). */
crh(mu, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES);
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* Matrix-vector multiplication; compute Az - c2^dt1 */
PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(&cp, c);
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho);

PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&z);
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);

/* shiftl multiplies t1 by 2^D before the product with the challenge. */
PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&cp);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(&t1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&t1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);

PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&w1, &w1, &t1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&w1);

/* Reconstruct w1 */
PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(&w1, &w1, &h);
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(buf, &w1);

/* Call random oracle and verify challenge */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(c2, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
/* Accept only if the recomputed challenge seed matches the one in sig.
 * The comparison exits at the first mismatching byte. */
for (i = 0; i < SEEDBYTES; ++i) {
if (c[i] != c2[i]) {
return -1;
}
}

return 0;
}

/*************************************************
 * Name:        PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open
 *
 * Description: Verify signed message. On success the message part is
 *              copied to m; on failure *mlen is set to (size_t) -1 and
 *              the first smlen bytes of m are zeroed.
 *
 * Arguments:   - uint8_t *m: pointer to output message (allocated
 *                            array with smlen bytes), can be equal to sm
 *              - size_t *mlen: pointer to output length of message
 *              - const uint8_t *sm: pointer to signed message
 *              - size_t smlen: length of signed message
 *              - const uint8_t *pk: pointer to bit-packed public key
 *
 * Returns 0 if signed message could be verified correctly and -1 otherwise
 **************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m,
        size_t *mlen,
        const uint8_t *sm,
        size_t smlen,
        const uint8_t *pk) {
    size_t i;

    /* A signed message must at least contain a full signature. */
    if (smlen >= PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) {
        const uint8_t *msg = sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES;

        *mlen = smlen - PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES;
        if (PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, msg, *mlen, pk) == 0) {
            /* All good, copy msg, return 0 */
            for (i = 0; i < *mlen; ++i) {
                m[i] = msg[i];
            }
            return 0;
        }
    }

    /* Signature verification failed */
    *mlen = (size_t) -1;
    for (i = 0; i < smlen; ++i) {
        m[i] = 0;
    }

    return -1;
}

+ 29
- 0
crypto_sign/dilithium/dilithium3/clean/sign.h 查看文件

@@ -0,0 +1,29 @@
#ifndef PQCLEAN_DILITHIUM3_CLEAN_SIGN_H
#define PQCLEAN_DILITHIUM3_CLEAN_SIGN_H
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* NOTE(review): sign.c contains no definition of this function; it calls
 * PQCLEAN_DILITHIUM3_CLEAN_poly_challenge (declared in poly.h with the same
 * parameters) instead. This declaration may be stale - confirm before use. */
void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Generates a key pair into pk and sk. Returns 0. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Writes a detached signature of m to sig and its length to *siglen. Returns 0. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Writes signature followed by the message to sm; sm may equal m. Returns 0. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Returns 0 if sig is a valid signature of m under pk, -1 otherwise. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

/* Verifies a signed message and copies the message to m on success.
 * Returns 0 on success; on failure returns -1, sets *mlen to (size_t) -1
 * and zeroes m. */
int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

#endif

部分文件因为文件数量过多而无法显示

正在加载...
取消
保存