Переглянути джерело

Round 3 update for Dilithium (from github source) (#369)

* Update Dilithium

* Alternative montgomery reduce to avoid i386 functest errors

* Explicit casts for msvc

* More casts; bump upstream version; fix metadata

* another cast
master
John Schanck 3 роки тому
committed by GitHub
джерело
коміт
4f86c39515
Не вдалося знайти GPG-ключ, що відповідає даному підпису. Ідентифікатор GPG-ключа: 4AEE18F83AFDEB23
100 змінених файлів з 9519 додано та 3742 видалено
  1. +71
    -68
      .github/workflows/BADGES.md
  2. +204
    -0
      .github/workflows/sign_dilithium2aes.yml
  3. +204
    -0
      .github/workflows/sign_dilithium3aes.yml
  4. +11
    -11
      .github/workflows/sign_dilithium5.yml
  5. +204
    -0
      .github/workflows/sign_dilithium5aes.yml
  6. +0
    -0
     
  7. +16
    -16
      crypto_sign/dilithium2/META.yml
  8. +3
    -4
      crypto_sign/dilithium2/avx2/LICENSE
  9. +11
    -20
      crypto_sign/dilithium2/avx2/Makefile
  10. +19
    -0
      crypto_sign/dilithium2/avx2/align.h
  11. +0
    -22
      crypto_sign/dilithium2/avx2/alignment.h
  12. +10
    -17
      crypto_sign/dilithium2/avx2/api.h
  13. +12
    -6
      crypto_sign/dilithium2/avx2/cdecl.h
  14. +101
    -0
      crypto_sign/dilithium2/avx2/consts.c
  15. +10
    -0
      crypto_sign/dilithium2/avx2/consts.h
  16. +909
    -0
      crypto_sign/dilithium2/avx2/f1600x4.S
  17. +173
    -187
      crypto_sign/dilithium2/avx2/fips202x4.c
  18. +49
    -51
      crypto_sign/dilithium2/avx2/fips202x4.h
  19. +233
    -275
      crypto_sign/dilithium2/avx2/invntt.S
  20. +177
    -157
      crypto_sign/dilithium2/avx2/ntt.S
  21. +8
    -30
      crypto_sign/dilithium2/avx2/ntt.h
  22. +0
    -80
      crypto_sign/dilithium2/avx2/nttconsts.c
  23. +0
    -27
      crypto_sign/dilithium2/avx2/nttconsts.h
  24. +108
    -144
      crypto_sign/dilithium2/avx2/packing.c
  25. +23
    -34
      crypto_sign/dilithium2/avx2/packing.h
  26. +26
    -14
      crypto_sign/dilithium2/avx2/params.h
  27. +80
    -71
      crypto_sign/dilithium2/avx2/pointwise.S
  28. +665
    -539
      crypto_sign/dilithium2/avx2/poly.c
  29. +35
    -39
      crypto_sign/dilithium2/avx2/poly.h
  30. +218
    -67
      crypto_sign/dilithium2/avx2/polyvec.c
  31. +42
    -28
      crypto_sign/dilithium2/avx2/polyvec.h
  32. +0
    -93
      crypto_sign/dilithium2/avx2/reduce.S
  33. +0
    -9
      crypto_sign/dilithium2/avx2/reduce.h
  34. +111
    -144
      crypto_sign/dilithium2/avx2/rejsample.c
  35. +13
    -19
      crypto_sign/dilithium2/avx2/rejsample.h
  36. +119
    -77
      crypto_sign/dilithium2/avx2/rounding.c
  37. +7
    -7
      crypto_sign/dilithium2/avx2/rounding.h
  38. +54
    -0
      crypto_sign/dilithium2/avx2/shuffle.S
  39. +10
    -8
      crypto_sign/dilithium2/avx2/shuffle.inc
  40. +248
    -266
      crypto_sign/dilithium2/avx2/sign.c
  41. +22
    -8
      crypto_sign/dilithium2/avx2/sign.h
  42. +0
    -26
      crypto_sign/dilithium2/avx2/stream.c
  43. +0
    -15
      crypto_sign/dilithium2/avx2/stream.h
  44. +26
    -0
      crypto_sign/dilithium2/avx2/symmetric-shake.c
  45. +23
    -12
      crypto_sign/dilithium2/avx2/symmetric.h
  46. +3
    -4
      crypto_sign/dilithium2/clean/LICENSE
  47. +3
    -6
      crypto_sign/dilithium2/clean/Makefile
  48. +8
    -3
      crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake
  49. +10
    -17
      crypto_sign/dilithium2/clean/api.h
  50. +61
    -101
      crypto_sign/dilithium2/clean/ntt.c
  51. +3
    -4
      crypto_sign/dilithium2/clean/ntt.h
  52. +108
    -144
      crypto_sign/dilithium2/clean/packing.c
  53. +23
    -34
      crypto_sign/dilithium2/clean/packing.h
  54. +26
    -14
      crypto_sign/dilithium2/clean/params.h
  55. +456
    -293
      crypto_sign/dilithium2/clean/poly.c
  56. +24
    -37
      crypto_sign/dilithium2/clean/poly.h
  57. +191
    -79
      crypto_sign/dilithium2/clean/polyvec.c
  58. +36
    -26
      crypto_sign/dilithium2/clean/polyvec.h
  59. +25
    -31
      crypto_sign/dilithium2/clean/reduce.c
  60. +7
    -11
      crypto_sign/dilithium2/clean/reduce.h
  61. +43
    -62
      crypto_sign/dilithium2/clean/rounding.c
  62. +8
    -5
      crypto_sign/dilithium2/clean/rounding.h
  63. +138
    -222
      crypto_sign/dilithium2/clean/sign.c
  64. +22
    -5
      crypto_sign/dilithium2/clean/sign.h
  65. +0
    -26
      crypto_sign/dilithium2/clean/stream.c
  66. +0
    -15
      crypto_sign/dilithium2/clean/stream.h
  67. +26
    -0
      crypto_sign/dilithium2/clean/symmetric-shake.c
  68. +23
    -12
      crypto_sign/dilithium2/clean/symmetric.h
  69. +31
    -0
      crypto_sign/dilithium2aes/META.yml
  70. +5
    -0
      crypto_sign/dilithium2aes/avx2/LICENSE
  71. +23
    -0
      crypto_sign/dilithium2aes/avx2/Makefile
  72. +142
    -0
      crypto_sign/dilithium2aes/avx2/aes256ctr.c
  73. +29
    -0
      crypto_sign/dilithium2aes/avx2/aes256ctr.h
  74. +19
    -0
      crypto_sign/dilithium2aes/avx2/align.h
  75. +31
    -0
      crypto_sign/dilithium2aes/avx2/api.h
  76. +24
    -0
      crypto_sign/dilithium2aes/avx2/cdecl.h
  77. +101
    -0
      crypto_sign/dilithium2aes/avx2/consts.c
  78. +10
    -0
      crypto_sign/dilithium2aes/avx2/consts.h
  79. +240
    -0
      crypto_sign/dilithium2aes/avx2/invntt.S
  80. +199
    -0
      crypto_sign/dilithium2aes/avx2/ntt.S
  81. +14
    -0
      crypto_sign/dilithium2aes/avx2/ntt.h
  82. +261
    -0
      crypto_sign/dilithium2aes/avx2/packing.c
  83. +31
    -0
      crypto_sign/dilithium2aes/avx2/packing.h
  84. +41
    -0
      crypto_sign/dilithium2aes/avx2/params.h
  85. +199
    -0
      crypto_sign/dilithium2aes/avx2/pointwise.S
  86. +891
    -0
      crypto_sign/dilithium2aes/avx2/poly.c
  87. +52
    -0
      crypto_sign/dilithium2aes/avx2/poly.h
  88. +449
    -0
      crypto_sign/dilithium2aes/avx2/polyvec.c
  89. +64
    -0
      crypto_sign/dilithium2aes/avx2/polyvec.h
  90. +394
    -0
      crypto_sign/dilithium2aes/avx2/rejsample.c
  91. +19
    -0
      crypto_sign/dilithium2aes/avx2/rejsample.h
  92. +157
    -0
      crypto_sign/dilithium2aes/avx2/rounding.c
  93. +12
    -0
      crypto_sign/dilithium2aes/avx2/rounding.h
  94. +54
    -0
      crypto_sign/dilithium2aes/avx2/shuffle.S
  95. +25
    -0
      crypto_sign/dilithium2aes/avx2/shuffle.inc
  96. +425
    -0
      crypto_sign/dilithium2aes/avx2/sign.c
  97. +29
    -0
      crypto_sign/dilithium2aes/avx2/sign.h
  98. +25
    -0
      crypto_sign/dilithium2aes/avx2/symmetric.h
  99. +5
    -0
      crypto_sign/dilithium2aes/clean/LICENSE
  100. +19
    -0
      crypto_sign/dilithium2aes/clean/Makefile

+ 71
- 68
.github/workflows/BADGES.md Переглянути файл

@@ -1,88 +1,91 @@
![Test sphincs-haraka-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-robust/badge.svg?branch=master)
![Test sphincs-haraka-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-simple/badge.svg?branch=master)
![Test sphincs-sha256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-simple/badge.svg?branch=master)
![Test sphincs-haraka-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-robust/badge.svg?branch=master)
![Test sphincs-sha256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-simple/badge.svg?branch=master)
![Test dilithium2](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2/badge.svg?branch=master)
![Test sphincs-shake256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-robust/badge.svg?branch=master)
![Test rainbowIII-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-compressed/badge.svg?branch=master)
![Test sphincs-haraka-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-simple/badge.svg?branch=master)
![Test sphincs-sha256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-robust/badge.svg?branch=master)
![Test sphincs-haraka-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-simple/badge.svg?branch=master)
![Test sphincs-haraka-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-robust/badge.svg?branch=master)
![Test rainbowV-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-circumzenithal/badge.svg?branch=master)
![Test sphincs-shake256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-simple/badge.svg?branch=master)
![Test sphincs-shake256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-robust/badge.svg?branch=master)
![Test rainbowIII-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-circumzenithal/badge.svg?branch=master)
![Test sphincs-sha256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-robust/badge.svg?branch=master)
![Test sphincs-haraka-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192f-simple/badge.svg?branch=master)
![Test sphincs-shake256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-simple/badge.svg?branch=master)
![Test sphincs-sha256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-robust/badge.svg?branch=master)
![Test sphincs-shake256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-robust/badge.svg?branch=master)
![Test sphincs-haraka-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-simple/badge.svg?branch=master)
![Test sphincs-sha256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-simple/badge.svg?branch=master)
![Test sphincs-sha256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-simple/badge.svg?branch=master)
![Test rainbowIII-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-classic/badge.svg?branch=master)
![Test sphincs-shake256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-simple/badge.svg?branch=master)
![Test rainbowI-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-circumzenithal/badge.svg?branch=master)
![Test sphincs-sha256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-robust/badge.svg?branch=master)
![Test rainbowV-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-compressed/badge.svg?branch=master)
![Test rainbowV-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-classic/badge.svg?branch=master)
![Test sphincs-sha256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-simple/badge.svg?branch=master)
![Test falcon-512](https://github.com/PQClean/PQClean/workflows/Test%20falcon-512/badge.svg?branch=master)
![Test falcon-1024](https://github.com/PQClean/PQClean/workflows/Test%20falcon-1024/badge.svg?branch=master)
![Test sphincs-haraka-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-simple/badge.svg?branch=master)
![Test sphincs-shake256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-robust/badge.svg?branch=master)
![Test dilithium4](https://github.com/PQClean/PQClean/workflows/Test%20dilithium4/badge.svg?branch=master)
![Test sphincs-sha256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-robust/badge.svg?branch=master)
![Test sphincs-haraka-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-robust/badge.svg?branch=master)
![Test sphincs-shake256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-simple/badge.svg?branch=master)
![Test rainbowI-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-classic/badge.svg?branch=master)
![Test sphincs-haraka-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-robust/badge.svg?branch=master)
![Test sphincs-sha256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-robust/badge.svg?branch=master)
![Test sphincs-shake256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-robust/badge.svg?branch=master)
![Test rainbowI-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-compressed/badge.svg?branch=master)
![Test rainbowIII-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-classic/badge.svg?branch=master)
![Test sphincs-sha256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-simple/badge.svg?branch=master)
![Test sphincs-haraka-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192f-robust/badge.svg?branch=master)
![Test sphincs-haraka-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-robust/badge.svg?branch=master)
![Test sphincs-shake256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-robust/badge.svg?branch=master)
![Test sphincs-haraka-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-simple/badge.svg?branch=master)
![Test sphincs-haraka-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-simple/badge.svg?branch=master)
![Test sphincs-sha256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-robust/badge.svg?branch=master)
![Test sphincs-shake256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-simple/badge.svg?branch=master)
![Test sphincs-sha256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-robust/badge.svg?branch=master)
![Test dilithium3](https://github.com/PQClean/PQClean/workflows/Test%20dilithium3/badge.svg?branch=master)
![Test rainbowI-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-compressed/badge.svg?branch=master)
![Test dilithium2](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2/badge.svg?branch=master)
![Test sphincs-sha256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-robust/badge.svg?branch=master)
![Test sphincs-shake256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-robust/badge.svg?branch=master)
![Test dilithium5](https://github.com/PQClean/PQClean/workflows/Test%20dilithium5/badge.svg?branch=master)
![Test sphincs-haraka-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-simple/badge.svg?branch=master)
![Test sphincs-haraka-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-robust/badge.svg?branch=master)
![Test sphincs-shake256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-simple/badge.svg?branch=master)
![Test sphincs-sha256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-simple/badge.svg?branch=master)
![Test sphincs-shake256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-robust/badge.svg?branch=master)
![Test sphincs-shake256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-simple/badge.svg?branch=master)
![Test rainbowV-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-classic/badge.svg?branch=master)
![Test sphincs-sha256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-robust/badge.svg?branch=master)
![Test sphincs-shake256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-simple/badge.svg?branch=master)
![Test sphincs-shake256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-simple/badge.svg?branch=master)
![Test sphincs-haraka-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-robust/badge.svg?branch=master)
![Test sphincs-shake256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-robust/badge.svg?branch=master)
![Test sphincs-shake256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-simple/badge.svg?branch=master)
![Test rainbowV-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-compressed/badge.svg?branch=master)
![Test sphincs-sha256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-simple/badge.svg?branch=master)
![Test dilithium5aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium5aes/badge.svg?branch=master)
![Test dilithium2aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2aes/badge.svg?branch=master)
![Test sphincs-sha256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-simple/badge.svg?branch=master)
![Test mceliece460896](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896/badge.svg?branch=master)
![Test saber](https://github.com/PQClean/PQClean/workflows/Test%20saber/badge.svg?branch=master)
![Test kyber1024-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024-90s/badge.svg?branch=master)
![Test kyber1024](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024/badge.svg?branch=master)
![Test mceliece8192128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128/badge.svg?branch=master)
![Test sphincs-sha256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-simple/badge.svg?branch=master)
![Test rainbowIII-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-compressed/badge.svg?branch=master)
![Test sphincs-sha256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-robust/badge.svg?branch=master)
![Test falcon-512](https://github.com/PQClean/PQClean/workflows/Test%20falcon-512/badge.svg?branch=master)
![Test sphincs-shake256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-robust/badge.svg?branch=master)
![Test falcon-1024](https://github.com/PQClean/PQClean/workflows/Test%20falcon-1024/badge.svg?branch=master)
![Test sphincs-haraka-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-simple/badge.svg?branch=master)
![Test sphincs-shake256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-robust/badge.svg?branch=master)
![Test sphincs-shake256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-simple/badge.svg?branch=master)
![Test sphincs-shake256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-robust/badge.svg?branch=master)
![Test sphincs-haraka-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-robust/badge.svg?branch=master)
![Test sphincs-sha256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-robust/badge.svg?branch=master)
![Test sphincs-haraka-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-simple/badge.svg?branch=master)
![Test dilithium3aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium3aes/badge.svg?branch=master)
![Test kyber512-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber512-90s/badge.svg?branch=master)
![Test firesaber](https://github.com/PQClean/PQClean/workflows/Test%20firesaber/badge.svg?branch=master)
![Test frodokem1344aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344aes/badge.svg?branch=master)
![Test sntrup653](https://github.com/PQClean/PQClean/workflows/Test%20sntrup653/badge.svg?branch=master)
![Test mceliece6688128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128/badge.svg?branch=master)
![Test ntrulpr761](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr761/badge.svg?branch=master)
![Test frodokem976aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976aes/badge.svg?branch=master)
![Test hqc-rmrs-192](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-192/badge.svg?branch=master)
![Test sntrup857](https://github.com/PQClean/PQClean/workflows/Test%20sntrup857/badge.svg?branch=master)
![Test frodokem640aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640aes/badge.svg?branch=master)
![Test hqc-rmrs-128](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-128/badge.svg?branch=master)
![Test mceliece6960119](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119/badge.svg?branch=master)
![Test ntruhrss701](https://github.com/PQClean/PQClean/workflows/Test%20ntruhrss701/badge.svg?branch=master)
![Test ntrulpr857](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr857/badge.svg?branch=master)
![Test frodokem1344shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344shake/badge.svg?branch=master)
![Test mceliece6688128f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128f/badge.svg?branch=master)
![Test ntruhps2048677](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048677/badge.svg?branch=master)
![Test frodokem640aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640aes/badge.svg?branch=master)
![Test sntrup761](https://github.com/PQClean/PQClean/workflows/Test%20sntrup761/badge.svg?branch=master)
![Test hqc-rmrs-256](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-256/badge.svg?branch=master)
![Test frodokem976shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976shake/badge.svg?branch=master)
![Test mceliece348864](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864/badge.svg?branch=master)
![Test frodokem1344aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344aes/badge.svg?branch=master)
![Test hqc-rmrs-192](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-192/badge.svg?branch=master)
![Test frodokem976aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976aes/badge.svg?branch=master)
![Test mceliece8192128f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128f/badge.svg?branch=master)
![Test mceliece460896f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896f/badge.svg?branch=master)
![Test kyber512-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber512-90s/badge.svg?branch=master)
![Test kyber1024](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024/badge.svg?branch=master)
![Test mceliece348864f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864f/badge.svg?branch=master)
![Test mceliece6960119f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119f/badge.svg?branch=master)
![Test firesaber](https://github.com/PQClean/PQClean/workflows/Test%20firesaber/badge.svg?branch=master)
![Test sntrup857](https://github.com/PQClean/PQClean/workflows/Test%20sntrup857/badge.svg?branch=master)
![Test frodokem640shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640shake/badge.svg?branch=master)
![Test sntrup761](https://github.com/PQClean/PQClean/workflows/Test%20sntrup761/badge.svg?branch=master)
![Test ntruhps4096821](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps4096821/badge.svg?branch=master)
![Test ntruhrss701](https://github.com/PQClean/PQClean/workflows/Test%20ntruhrss701/badge.svg?branch=master)
![Test mceliece348864](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864/badge.svg?branch=master)
![Test ntrulpr653](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr653/badge.svg?branch=master)
![Test sntrup653](https://github.com/PQClean/PQClean/workflows/Test%20sntrup653/badge.svg?branch=master)
![Test lightsaber](https://github.com/PQClean/PQClean/workflows/Test%20lightsaber/badge.svg?branch=master)
![Test kyber1024-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024-90s/badge.svg?branch=master)
![Test ntruhps2048509](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048509/badge.svg?branch=master)
![Test kyber768-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber768-90s/badge.svg?branch=master)
![Test mceliece6960119f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119f/badge.svg?branch=master)
![Test saber](https://github.com/PQClean/PQClean/workflows/Test%20saber/badge.svg?branch=master)
![Test kyber768](https://github.com/PQClean/PQClean/workflows/Test%20kyber768/badge.svg?branch=master)
![Test ntruhps2048509](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048509/badge.svg?branch=master)
![Test ntruhps4096821](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps4096821/badge.svg?branch=master)
![Test ntrulpr761](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr761/badge.svg?branch=master)
![Test kyber512](https://github.com/PQClean/PQClean/workflows/Test%20kyber512/badge.svg?branch=master)
![Test hqc-rmrs-128](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-128/badge.svg?branch=master)
![Test mceliece6688128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128/badge.svg?branch=master)
![Test lightsaber](https://github.com/PQClean/PQClean/workflows/Test%20lightsaber/badge.svg?branch=master)
![Test mceliece460896f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896f/badge.svg?branch=master)
![Test mceliece8192128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128/badge.svg?branch=master)
![Test mceliece460896](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896/badge.svg?branch=master)
![Test ntruhps2048677](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048677/badge.svg?branch=master)
![Test ntrulpr857](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr857/badge.svg?branch=master)
![Test hqc-rmrs-256](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-256/badge.svg?branch=master)
![Test frodokem976shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976shake/badge.svg?branch=master)
![Test frodokem640shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640shake/badge.svg?branch=master)

+ 204
- 0
.github/workflows/sign_dilithium2aes.yml Переглянути файл

@@ -0,0 +1,204 @@
# CI workflow for the dilithium2aes signature scheme.
#
# Jobs:
#   test-native   - gcc/clang on amd64 and i386 inside pqclean/ci-container
#   test-emulated - gcc/clang on armhf and unstable-ppc via qemu-user-static
#   test-windows  - MSVC, 64- and 32-bit
#   test-macos    - Xcode clang and Homebrew gcc-9
#
# The emulated, Windows and macOS jobs only start once test-native passes.
on:
  push:
    paths:
      # build if tests change
      - 'test/**'
      # do not build if other schemes duplicate_consistency files change
      - '!test/duplicate_consistency/*.yml'
      - 'test/duplicate_consistency/dilithium2aes*.yml'
      # build if common files change
      - 'common/**'
      # build if scheme changed
      - 'crypto_sign/dilithium2aes/**'
      # build if workflow file changed
      - '.github/workflows/sign_dilithium2aes.yml'
      # Build if any files in the root change, except .md files
      - '*'
      - '!*.md'
  pull_request:
    paths:
      # build if tests change
      - 'test/**'
      # do not build if other schemes duplicate_consistency files change
      - '!test/duplicate_consistency/*.yml'
      - 'test/duplicate_consistency/dilithium2aes*.yml'
      # build if common files change
      - 'common/**'
      # build if scheme changed
      - 'crypto_sign/dilithium2aes/**'
      # build if workflow file changed
      - '.github/workflows/sign_dilithium2aes.yml'
      # Build if any files in the root change, except .md files
      - '*'
      - '!*.md'
  schedule:
    # Nightly run at 04:05
    - cron: '5 4 * * *'

name: Test dilithium2aes

jobs:
  test-native:
    runs-on: ubuntu-latest
    container:
      image: pqclean/ci-container:${{ matrix.arch }}
      env:
        # Restrict the test suite to this scheme only
        PQCLEAN_ONLY_SCHEMES: dilithium2aes
        CC: ccache ${{ matrix.cc }}
        CCACHE_NOSTATS: 1
        CCACHE_DIR: /ccache
        CCACHE_SLOPPINESS: include_file_mtime
    strategy:
      matrix:
        arch:
          - amd64
          - i386
        cc:
          - gcc
          - clang
    steps:
      # Only one matrix entry needs to cancel superseded runs
      - name: Cancel Previous Runs
        uses: thomwiggers/cancel-workflow-action@all_but_latest
        with:
          all_but_latest: true
          access_token: ${{ github.token }}
        continue-on-error: true
        if: matrix.arch == 'amd64' && matrix.cc == 'gcc'
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Cache ccache
        uses: actions/cache@v2
        env:
          cache-name: cache-ccache
        with:
          path: /ccache
          # github.workflow (not env.GITHUB_WORKFLOW): default GITHUB_* variables
          # are not part of the `env` context, so the old expression expanded to
          # an empty string and all scheme workflows shared one cache key.
          key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
      - name: Cache pip
        uses: actions/cache@v2
        env:
          cache-name: cache-python-pip
        with:
          path: ~/.cache/pip
          key: v1-python-pip
      - name: Install python dependencies
        run: |
          python3 -m pip install -U -r requirements.txt
      - name: Run tests
        run: |
          cd test
          python3 -m pytest --verbose --numprocesses=auto
  test-emulated:
    needs:
      - test-native
    runs-on: ubuntu-latest
    strategy:
      matrix:
        arch:
          - armhf
          - unstable-ppc
        cc:
          - gcc
          - clang
    env:
      CC: ${{ matrix.cc }}
    steps:
      - name: Register qemu-user-static
        run: |
          docker run --rm --privileged multiarch/qemu-user-static:register --reset
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Cache ccache
        uses: actions/cache@v2
        env:
          cache-name: cache-ccache
        with:
          # Host directory; mounted into the container as /ccache below
          path: ~/ccache
          # See test-native: github.workflow replaces the always-empty env.GITHUB_WORKFLOW
          key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
      - name: Cache pip
        uses: actions/cache@v2
        env:
          cache-name: cache-python-pip
        with:
          path: ~/.cache/pip
          key: v1-python-pip
      # $CC inside the double-quoted script is expanded by the host shell,
      # i.e. it is the matrix compiler from the job-level env above.
      - name: Run tests in container
        run: |
          docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium2aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
          export CCACHE_NOSTATS=1 && \
          export CCACHE_DIR=/ccache && \
          export CCACHE_SLOPPINESS=include_file_mtime && \
          export CC=\"ccache $CC\" && \
          pip3 install -U -r requirements.txt && \
          cd test && \
          python3 -m pytest --verbose --numprocesses=auto"
  test-windows:
    needs:
      - test-native
    strategy:
      matrix:
        bits:
          - 64
          - 32
    env:
      PQCLEAN_ONLY_SCHEMES: dilithium2aes
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Setup astyle
        run: |
          # Setup strong crypto
          Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord
          Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord
          Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe"
        shell: powershell
      - name: Setup Python
        uses: actions/setup-python@main
        with:
          python-version: "3.x"
      - name: Install python requirements
        run: python -m pip install -U -r requirements.txt
      - name: Run tests
        run: |
          call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat"
          cd test
          python -m pytest --verbose --numprocesses=auto
        shell: cmd
  test-macos:
    needs:
      - test-native
    env:
      PQCLEAN_ONLY_SCHEMES: dilithium2aes
      CCACHE_NOSTATS: 1
      CCACHE_SLOPPINESS: include_file_mtime
      # XCode version
      DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer
    strategy:
      matrix:
        compiler:
          - clang  # XCode (Apple LLVM/Clang)
          - gcc9   # GNU (Homebrew)
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Install astyle
        run: |
          brew install astyle
      # Each `run` step gets a fresh shell, so a plain `export` here never
      # reached the "Run tests" step and the gcc9 entry silently used the
      # default compiler. Persist the settings through the runner's
      # GITHUB_PATH / GITHUB_ENV files instead.
      - name: Set up GCC9 compiler
        run: |
          echo "/usr/local/bin" >> "$GITHUB_PATH"
          echo "CC=gcc-9" >> "$GITHUB_ENV"
        if: matrix.compiler == 'gcc9'
      - name: Setup Python
        uses: actions/setup-python@main
        with:
          python-version: "3.x"
      - name: Install Python dependencies
        run: python -m pip install -U -r requirements.txt
      - name: Run tests
        run: |
          cd test
          python -m pytest --verbose --numprocesses=auto


+ 204
- 0
.github/workflows/sign_dilithium3aes.yml Переглянути файл

@@ -0,0 +1,204 @@
# Triggers: pushes and pull requests that touch the paths listed below, plus a
# daily scheduled run. Within a `paths` list a later pattern overrides an
# earlier one, which is why the scheme's own duplicate_consistency files are
# re-included right after the blanket exclusion.
on:
push:
paths:
# build if tests change
- 'test/**'
# do not build if other schemes duplicate_consistency files change
- '!test/duplicate_consistency/*.yml'
- 'test/duplicate_consistency/dilithium3aes*.yml'
# build if common files change
- 'common/**'
# build if scheme changed
- 'crypto_sign/dilithium3aes/**'
# build if workflow file changed
- '.github/workflows/sign_dilithium3aes.yml'
# Build if any files in the root change, except .md files
- '*'
- '!*.md'
pull_request:
paths:
# build if tests change
- 'test/**'
# do not build if other schemes duplicate_consistency files change
- '!test/duplicate_consistency/*.yml'
- 'test/duplicate_consistency/dilithium3aes*.yml'
# build if common files change
- 'common/**'
# build if scheme changed
- 'crypto_sign/dilithium3aes/**'
# build if workflow file changed
- '.github/workflows/sign_dilithium3aes.yml'
# Build if any files in the root change, except .md files
- '*'
- '!*.md'
schedule:
# Every day at 04:05 (GitHub evaluates cron schedules in UTC)
- cron: '5 4 * * *'

name: Test dilithium3aes

jobs:
# Native Linux job: runs the test suite inside the pqclean CI container for
# every combination of architecture (amd64, i386) and compiler (gcc, clang),
# with ccache wrapped around the compiler.
test-native:
runs-on: ubuntu-latest
container:
image: pqclean/ci-container:${{ matrix.arch }}
env:
PQCLEAN_ONLY_SCHEMES: dilithium3aes
CC: ccache ${{ matrix.cc }}
CCACHE_NOSTATS: 1
CCACHE_DIR: /ccache
CCACHE_SLOPPINESS: include_file_mtime
strategy:
matrix:
arch:
- amd64
- i386
cc:
- gcc
- clang
steps:
# Issued by a single matrix entry only; a failure here does not fail the job.
- name: Cancel Previous Runs
uses: thomwiggers/cancel-workflow-action@all_but_latest
with:
all_but_latest: true
access_token: ${{ github.token }}
continue-on-error: true
if: matrix.arch == 'amd64' && matrix.cc == 'gcc'
- uses: actions/checkout@v2
with:
submodules: true
- name: Cache ccache
uses: actions/cache@v2
env:
cache-name: cache-ccache
with:
path: /ccache
# The workflow name is taken from the `github` context. The `env` context
# only holds variables declared in the workflow, so `env.GITHUB_WORKFLOW`
# expands to an empty string and all scheme workflows would share one key.
key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
- name: Cache pip
uses: actions/cache@v2
env:
cache-name: cache-python-pip
with:
path: ~/.cache/pip
key: v1-python-pip
- name: Install python dependencies
run: |
python3 -m pip install -U -r requirements.txt
- name: Run tests
run: |
cd test
python3 -m pytest --verbose --numprocesses=auto
# Emulated job: registers qemu-user-static on the host and then runs the test
# suite inside the armhf and unstable-ppc CI containers, with gcc and clang.
# Starts only after test-native succeeds.
test-emulated:
needs:
- test-native
runs-on: ubuntu-latest
strategy:
matrix:
arch:
- armhf
- unstable-ppc
cc:
- gcc
- clang
env:
CC: ${{ matrix.cc }}
steps:
- name: Register qemu-user-static
run: |
docker run --rm --privileged multiarch/qemu-user-static:register --reset
- uses: actions/checkout@v2
with:
submodules: true
- name: Cache ccache
uses: actions/cache@v2
env:
cache-name: cache-ccache
with:
# Host-side directory; it is bind-mounted to /ccache in the container below.
path: ~/ccache
# The workflow name is taken from the `github` context. The `env` context
# only holds variables declared in the workflow, so `env.GITHUB_WORKFLOW`
# expands to an empty string and all scheme workflows would share one key.
key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
- name: Cache pip
uses: actions/cache@v2
env:
cache-name: cache-python-pip
with:
path: ~/.cache/pip
key: v1-python-pip
# $CC is expanded by the host shell before the command string reaches the
# container, so the container sees CC="ccache <matrix.cc>".
- name: Run tests in container
run: |
docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium3aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
export CCACHE_NOSTATS=1 && \
export CCACHE_DIR=/ccache && \
export CCACHE_SLOPPINESS=include_file_mtime && \
export CC=\"ccache $CC\" && \
pip3 install -U -r requirements.txt && \
cd test && \
python3 -m pytest --verbose --numprocesses=auto"
# Windows job: runs the test suite from a Visual Studio 2019 developer
# environment, once per target bitness. Starts only after test-native succeeds.
test-windows:
needs:
- test-native
strategy:
matrix:
# Selects which vcvars<bits>.bat is called in the "Run tests" step.
bits:
- 64
- 32
env:
PQCLEAN_ONLY_SCHEMES: dilithium3aes
runs-on: windows-latest
steps:
- uses: actions/checkout@v2
with:
submodules: true
# Sets the .NET SchUseStrongCrypto registry flags, then downloads AStyle.exe
# into the test directory.
- name: Setup astyle
run: |
# Setup strong crypto
Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord
Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord
Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe"
shell: powershell
- name: Setup Python
uses: actions/setup-python@main
with:
python-version: "3.x"
- name: Install python requirements
run: python -m pip install -U -r requirements.txt
# vcvars must be called in the same cmd session as pytest so the compiler
# environment it sets up is still in effect when the tests run.
- name: Run tests
run: |
call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat"
cd test
python -m pytest --verbose --numprocesses=auto
shell: cmd
# macOS job: runs the test suite once with the Xcode toolchain and once with
# Homebrew GCC 9. Starts only after test-native succeeds.
test-macos:
needs:
- test-native
env:
PQCLEAN_ONLY_SCHEMES: dilithium3aes
CCACHE_NOSTATS: 1
CCACHE_SLOPPINESS: include_file_mtime
# XCode version
DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer
strategy:
matrix:
compiler:
- clang # XCode (Apple LLVM/Clang)
- gcc9 # GNU (Homebrew)
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
with:
submodules: true
- name: Install astyle
run: |
brew install astyle
# Every `run` step gets a fresh shell, so a plain `export` would be lost
# before "Run tests". Persist PATH and CC for the following steps through
# the runner's GITHUB_PATH / GITHUB_ENV files instead.
- name: Set up GCC9 compiler
run: 'echo "/usr/local/bin" >> "$GITHUB_PATH" && echo "CC=gcc-9" >> "$GITHUB_ENV"'
if: matrix.compiler == 'gcc9'
- name: Setup Python
uses: actions/setup-python@main
with:
python-version: "3.x"
- name: Install Python dependencies
run: python -m pip install -U -r requirements.txt
- name: Run tests
run: |
cd test
python -m pytest --verbose --numprocesses=auto


.github/workflows/sign_dilithium4.yml → .github/workflows/sign_dilithium5.yml Переглянути файл

@@ -5,13 +5,13 @@ on:
- 'test/**'
# do not build if other schemes duplicate_consistency files change
- '!test/duplicate_consistency/*.yml'
- 'test/duplicate_consistency/dilithium4*.yml'
- 'test/duplicate_consistency/dilithium5*.yml'
# build if common files change
- 'common/**'
# build if scheme changed
- 'crypto_sign/dilithium4/**'
- 'crypto_sign/dilithium5/**'
# build if workflow file changed
- '.github/workflows/sign_dilithium4.yml'
- '.github/workflows/sign_dilithium5.yml'
# Build if any files in the root change, except .md files
- '*'
- '!*.md'
@@ -21,20 +21,20 @@ on:
- 'test/**'
# do not build if other schemes duplicate_consistency files change
- '!test/duplicate_consistency/*.yml'
- 'test/duplicate_consistency/dilithium4*.yml'
- 'test/duplicate_consistency/dilithium5*.yml'
# build if common files change
- 'common/**'
# build if scheme changed
- 'crypto_sign/dilithium4/**'
- 'crypto_sign/dilithium5/**'
# build if workflow file changed
- '.github/workflows/sign_dilithium4.yml'
- '.github/workflows/sign_dilithium5.yml'
# Build if any files in the root change, except .md files
- '*'
- '!*.md'
schedule:
- cron: '5 4 * * *'

name: Test dilithium4
name: Test dilithium5

jobs:
test-native:
@@ -42,7 +42,7 @@ jobs:
container:
image: pqclean/ci-container:${{ matrix.arch }}
env:
PQCLEAN_ONLY_SCHEMES: dilithium4
PQCLEAN_ONLY_SCHEMES: dilithium5
CC: ccache ${{ matrix.cc }}
CCACHE_NOSTATS: 1
CCACHE_DIR: /ccache
@@ -124,7 +124,7 @@ jobs:
key: v1-python-pip
- name: Run tests in container
run: |
docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium4 -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium5 -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
export CCACHE_NOSTATS=1 && \
export CCACHE_DIR=/ccache && \
export CCACHE_SLOPPINESS=include_file_mtime && \
@@ -141,7 +141,7 @@ jobs:
- 64
- 32
env:
PQCLEAN_ONLY_SCHEMES: dilithium4
PQCLEAN_ONLY_SCHEMES: dilithium5
runs-on: windows-latest
steps:
- uses: actions/checkout@v2
@@ -170,7 +170,7 @@ jobs:
needs:
- test-native
env:
PQCLEAN_ONLY_SCHEMES: dilithium4
PQCLEAN_ONLY_SCHEMES: dilithium5
CCACHE_NOSTATS: 1
CCACHE_SLOPPINESS: include_file_mtime
# XCode version

+ 204
- 0
.github/workflows/sign_dilithium5aes.yml Переглянути файл

@@ -0,0 +1,204 @@
# Triggers: pushes and pull requests that touch the paths listed below, plus a
# daily scheduled run. Within a `paths` list a later pattern overrides an
# earlier one, which is why the scheme's own duplicate_consistency files are
# re-included right after the blanket exclusion.
on:
push:
paths:
# build if tests change
- 'test/**'
# do not build if other schemes duplicate_consistency files change
- '!test/duplicate_consistency/*.yml'
- 'test/duplicate_consistency/dilithium5aes*.yml'
# build if common files change
- 'common/**'
# build if scheme changed
- 'crypto_sign/dilithium5aes/**'
# build if workflow file changed
- '.github/workflows/sign_dilithium5aes.yml'
# Build if any files in the root change, except .md files
- '*'
- '!*.md'
pull_request:
paths:
# build if tests change
- 'test/**'
# do not build if other schemes duplicate_consistency files change
- '!test/duplicate_consistency/*.yml'
- 'test/duplicate_consistency/dilithium5aes*.yml'
# build if common files change
- 'common/**'
# build if scheme changed
- 'crypto_sign/dilithium5aes/**'
# build if workflow file changed
- '.github/workflows/sign_dilithium5aes.yml'
# Build if any files in the root change, except .md files
- '*'
- '!*.md'
schedule:
# Every day at 04:05 (GitHub evaluates cron schedules in UTC)
- cron: '5 4 * * *'

name: Test dilithium5aes

jobs:
# Native Linux job: runs the test suite inside the pqclean CI container for
# every combination of architecture (amd64, i386) and compiler (gcc, clang),
# with ccache wrapped around the compiler.
test-native:
runs-on: ubuntu-latest
container:
image: pqclean/ci-container:${{ matrix.arch }}
env:
PQCLEAN_ONLY_SCHEMES: dilithium5aes
CC: ccache ${{ matrix.cc }}
CCACHE_NOSTATS: 1
CCACHE_DIR: /ccache
CCACHE_SLOPPINESS: include_file_mtime
strategy:
matrix:
arch:
- amd64
- i386
cc:
- gcc
- clang
steps:
# Issued by a single matrix entry only; a failure here does not fail the job.
- name: Cancel Previous Runs
uses: thomwiggers/cancel-workflow-action@all_but_latest
with:
all_but_latest: true
access_token: ${{ github.token }}
continue-on-error: true
if: matrix.arch == 'amd64' && matrix.cc == 'gcc'
- uses: actions/checkout@v2
with:
submodules: true
- name: Cache ccache
uses: actions/cache@v2
env:
cache-name: cache-ccache
with:
path: /ccache
# The workflow name is taken from the `github` context. The `env` context
# only holds variables declared in the workflow, so `env.GITHUB_WORKFLOW`
# expands to an empty string and all scheme workflows would share one key.
key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
- name: Cache pip
uses: actions/cache@v2
env:
cache-name: cache-python-pip
with:
path: ~/.cache/pip
key: v1-python-pip
- name: Install python dependencies
run: |
python3 -m pip install -U -r requirements.txt
- name: Run tests
run: |
cd test
python3 -m pytest --verbose --numprocesses=auto
# Emulated job: registers qemu-user-static on the host and then runs the test
# suite inside the armhf and unstable-ppc CI containers, with gcc and clang.
# Starts only after test-native succeeds.
test-emulated:
needs:
- test-native
runs-on: ubuntu-latest
strategy:
matrix:
arch:
- armhf
- unstable-ppc
cc:
- gcc
- clang
env:
CC: ${{ matrix.cc }}
steps:
- name: Register qemu-user-static
run: |
docker run --rm --privileged multiarch/qemu-user-static:register --reset
- uses: actions/checkout@v2
with:
submodules: true
- name: Cache ccache
uses: actions/cache@v2
env:
cache-name: cache-ccache
with:
# Host-side directory; it is bind-mounted to /ccache in the container below.
path: ~/ccache
# The workflow name is taken from the `github` context. The `env` context
# only holds variables declared in the workflow, so `env.GITHUB_WORKFLOW`
# expands to an empty string and all scheme workflows would share one key.
key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
- name: Cache pip
uses: actions/cache@v2
env:
cache-name: cache-python-pip
with:
path: ~/.cache/pip
key: v1-python-pip
# $CC is expanded by the host shell before the command string reaches the
# container, so the container sees CC="ccache <matrix.cc>".
- name: Run tests in container
run: |
docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium5aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
export CCACHE_NOSTATS=1 && \
export CCACHE_DIR=/ccache && \
export CCACHE_SLOPPINESS=include_file_mtime && \
export CC=\"ccache $CC\" && \
pip3 install -U -r requirements.txt && \
cd test && \
python3 -m pytest --verbose --numprocesses=auto"
# Windows job: runs the test suite from a Visual Studio 2019 developer
# environment, once per target bitness. Starts only after test-native succeeds.
test-windows:
needs:
- test-native
strategy:
matrix:
# Selects which vcvars<bits>.bat is called in the "Run tests" step.
bits:
- 64
- 32
env:
PQCLEAN_ONLY_SCHEMES: dilithium5aes
runs-on: windows-latest
steps:
- uses: actions/checkout@v2
with:
submodules: true
# Sets the .NET SchUseStrongCrypto registry flags, then downloads AStyle.exe
# into the test directory.
- name: Setup astyle
run: |
# Setup strong crypto
Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord
Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord
Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe"
shell: powershell
- name: Setup Python
uses: actions/setup-python@main
with:
python-version: "3.x"
- name: Install python requirements
run: python -m pip install -U -r requirements.txt
# vcvars must be called in the same cmd session as pytest so the compiler
# environment it sets up is still in effect when the tests run.
- name: Run tests
run: |
call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat"
cd test
python -m pytest --verbose --numprocesses=auto
shell: cmd
# macOS job: runs the test suite once with the Xcode toolchain and once with
# Homebrew GCC 9. Starts only after test-native succeeds.
test-macos:
needs:
- test-native
env:
PQCLEAN_ONLY_SCHEMES: dilithium5aes
CCACHE_NOSTATS: 1
CCACHE_SLOPPINESS: include_file_mtime
# XCode version
DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer
strategy:
matrix:
compiler:
- clang # XCode (Apple LLVM/Clang)
- gcc9 # GNU (Homebrew)
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
with:
submodules: true
- name: Install astyle
run: |
brew install astyle
# Every `run` step gets a fresh shell, so a plain `export` would be lost
# before "Run tests". Persist PATH and CC for the following steps through
# the runner's GITHUB_PATH / GITHUB_ENV files instead.
- name: Set up GCC9 compiler
run: 'echo "/usr/local/bin" >> "$GITHUB_PATH" && echo "CC=gcc-9" >> "$GITHUB_ENV"'
if: matrix.compiler == 'gcc9'
- name: Setup Python
uses: actions/setup-python@main
with:
python-version: "3.x"
- name: Install Python dependencies
run: python -m pip install -U -r requirements.txt
- name: Run tests
run: |
cd test
python -m pytest --verbose --numprocesses=auto



+ 16
- 16
crypto_sign/dilithium2/META.yml Переглянути файл

@@ -1,11 +1,11 @@
name: Dilithium2
type: signature
claimed-nist-level: 1
length-public-key: 1184
length-secret-key: 2800
length-signature: 2044
nistkat-sha256: 23b7d52a268bbd8633d139b64a1b0e3263777cb2b074f7af0a7fd315afe94d18
testvectors-sha256: d647039ae7e1785414c64934d5ae37518f259acab95d6a6e873e9b6d3ad63dfd
claimed-nist-level: 2
length-public-key: 1312
length-secret-key: 2544
length-signature: 2420
nistkat-sha256: 9c636528bf81c03df6ad8f9471cb1b4d9097d66af825d4f60b7ff0d941ca4d37
testvectors-sha256: 166fc2481358d5a1b7a528b30af36ad069b049b5755cf63b843ce0f25f35aeb6
principal-submitters:
- Vadim Lyubashevsky
auxiliary-submitters:
@@ -17,15 +17,15 @@ auxiliary-submitters:
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
- name: avx2
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
supported_platforms:
- architecture: x86_64
operating_systems:
- Darwin
- Linux
required_flags:
- avx2
- bmi1
- popcnt
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- aes
- avx2
- popcnt

+ 3
- 4
crypto_sign/dilithium2/avx2/LICENSE Переглянути файл

@@ -1,6 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and the random number generator
we are using public-domain code from sources
and by authors listed in comments on top of
the respective files.
For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 11
- 20
crypto_sign/dilithium2/avx2/Makefile Переглянути файл

@@ -1,34 +1,27 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libdilithium2_avx2.a

SOURCES = fips202x4.c invntt.S nttconsts.c ntt.S packing.c pointwise.S poly.c \
polyvec.c reduce.S rejsample.c rounding.c sign.c stream.c
OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \
polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o
HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \
nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \
fips202x4.h shuffle.inc cdecl.inc

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \
-Wmissing-prototypes -Wredundant-decls -std=c99 \
-Wcast-align -Werror=shadow\
-mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS)

all: $(LIB)

HEADERS=align.h api.h cdecl.h consts.h fips202x4.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc
OBJECTS=consts.o fips202x4.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o symmetric-shake.o f1600x4.o invntt.o ntt.o pointwise.o shuffle.o
KECCAK4XDIR=../../../common/keccak4x
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o
KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)

CFLAGS=-mavx2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \
-Wmissing-prototypes -Wredundant-decls \
-Wpointer-arith -Wshadow \
-std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.S $(HEADERS)
$(CC) -c -o $@ $<
$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS) $(KECCAK4X)
$(AR) -r $@ $^
$(AR) -r $@ $(OBJECTS) $(KECCAK4X)

$(KECCAK4X):
$(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ)
@@ -36,5 +29,3 @@ $(KECCAK4X):
clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)
$(MAKE) -C $(KECCAK4XDIR) clean


+ 19
- 0
crypto_sign/dilithium2/avx2/align.h Переглянути файл

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGN_H
#define PQCLEAN_DILITHIUM2_AVX2_ALIGN_H

/*
 * Helper macros that declare anonymous union types overlaying a scalar array
 * with an array of __m256i. Because a union is aligned as strictly as its most
 * strictly aligned member, the scalar view inherits the alignment of __m256i.
 */

#include <immintrin.h>
#include <stdint.h>

/*
 * Union of N bytes (`coeffs`) and the same storage viewed as __m256i (`vec`).
 * One __m256i holds 32 bytes, so the vector count is N/32 rounded up.
 */
#define ALIGNED_UINT8(N) \
union { \
uint8_t coeffs[N]; \
__m256i vec[((N)+31)/32]; \
}

/*
 * Union of N int32_t values (`coeffs`) and the same storage viewed as __m256i
 * (`vec`). One __m256i holds 8 int32_t, so the vector count is N/8 rounded up.
 */
#define ALIGNED_INT32(N) \
union { \
int32_t coeffs[N]; \
__m256i vec[((N)+7)/8]; \
}

#endif

+ 0
- 22
crypto_sign/dilithium2/avx2/alignment.h Переглянути файл

@@ -1,22 +0,0 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H
#define PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H

#define ALIGNED_UINT8(N) \
union { \
uint32_t as_arr[N]; \
__m256i as_vec[(N)/32]; \
}

#define ALIGNED_UINT32(N) \
union { \
uint32_t as_arr[N]; \
__m256i as_vec[(N)/8]; \
}

#define ALIGNED_UINT64(N) \
union { \
uint64_t as_arr[N]; \
__m256i as_vec[(N)/8]; \
}

#endif //PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H

+ 10
- 17
crypto_sign/dilithium2/avx2/api.h Переглянути файл

@@ -4,26 +4,13 @@
#include <stddef.h>
#include <stdint.h>


#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1184U
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2800U
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2044U

#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1312
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2544
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2420
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2"


int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
@@ -33,6 +20,12 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif

crypto_sign/dilithium4/avx2/cdecl.inc → crypto_sign/dilithium2/avx2/cdecl.h Переглянути файл

@@ -1,5 +1,14 @@
#ifndef PQCLEAN_DILITHIUM4_AVX2_CDECL
#define PQCLEAN_DILITHIUM4_AVX2_CDECL
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL_H
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_H



#define _8XQ 0
#define _8XQINV 8
#define _8XDIV_QINV 16
#define _8XDIV 24
#define _ZETAS_QINV 32
#define _ZETAS 328

/* The C ABI on MacOS exports all symbols with a leading
* underscore. This means that any symbols we refer to from
@@ -9,10 +18,7 @@
* This define helps us get around this
*/

#if defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define _cdecl(s) _##s
#define cdecl(s) s
#endif

#endif

+ 101
- 0
crypto_sign/dilithium2/avx2/consts.c Переглянути файл

@@ -0,0 +1,101 @@
#include "consts.h"
#include "params.h"
#include <stdint.h>

// Montgomery-arithmetic constants (radix 2^32) used to fill the table below.
#define QINV 58728449 // q^(-1) mod 2^32
#define MONT (-4186625) // 2^32 mod q
#define DIV 41978 // mont^2/256
#define DIV_QINV (-8395782) // DIV * QINV mod 2^32, as a signed 32-bit value

const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata = {{
//#define _8XQ 0
Q, Q, Q, Q, Q, Q, Q, Q,

//#define _8XQINV 8
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

//#define _8XDIV_QINV 16
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV,

//#define _8XDIV 24
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV,

//#define _ZETAS_QINV 32
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244,
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077,
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561,
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417,
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735,
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904,
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771,
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600,
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139,
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433,
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547,
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852,
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995,
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424,
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315,
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951,
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031,
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878,
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606,
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568,
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583,
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093,
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172,
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187,
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462,
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722,
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279,
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342,
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272,
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682,
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363,
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473,
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426,
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762,
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494,
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853,
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238,

//#define _ZETAS 328
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451,
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905,
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855,
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103,
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928,
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549,
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672,
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005,
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928,
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771,
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969,
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969,
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922,
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430,
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856,
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961,
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995,
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100,
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149,
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738,
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098,
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455,
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634,
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424,
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622,
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115,
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233,
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154,
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838,
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642,
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107,
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782,
}
};

+ 10
- 0
crypto_sign/dilithium2/avx2/consts.h Переглянути файл

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_CONSTS_H
#define PQCLEAN_DILITHIUM2_AVX2_CONSTS_H
#include "align.h"
#include "cdecl.h"


/*
 * Constant table of 624 int32_t entries, declared through ALIGNED_INT32 so it
 * can also be accessed as __m256i vectors. Sections are addressed with the
 * offsets from cdecl.h (_8XQ, _8XQINV, _8XDIV_QINV, _8XDIV, _ZETAS_QINV,
 * _ZETAS): four groups of 8 followed by two groups of 296 (32 + 296 + 296).
 */
typedef ALIGNED_INT32(624) qdata_t;
/* The table itself; defined in consts.c. */
extern const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata;

#endif

+ 909
- 0
crypto_sign/dilithium2/avx2/f1600x4.S Переглянути файл

@@ -0,0 +1,909 @@
/* Taken from Bas Westerbaan's new 4-way SHAKE implementation
* for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/),
* but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */

#include "cdecl.h"

/* Byte-shuffle masks for vpshufb, used where a 64-bit rotation amount is a
 * multiple of 8 bits. vpshufb selects bytes within each 128-bit half, so each
 * mask holds one 16-byte pattern (two 64-bit words) written out twice. */
.data
/* 32-byte alignment: rho8 is loaded with the aligned vmovdqa. */
.p2align 5
/* Rotate every 64-bit word left by 8 bits: result byte i = source byte i-1. */
rho8:
.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14
/* Rotate every 64-bit word left by 56 bits: result byte i = source byte i+1. */
rho56:
.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8

.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4)
cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4):
vmovdqa rho8(%rip), %ymm0
movq $6, %rax
looptop:
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 192(%rdi), %ymm4, %ymm9
vpxor 384(%rdi), %ymm3, %ymm10
vpxor 576(%rdi), %ymm2, %ymm11
vpxor 768(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 0(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 96(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 320(%rdi), %ymm5, %ymm10
vpxor 512(%rdi), %ymm4, %ymm11
vpxor 704(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 32(%rdi), %ymm4, %ymm8
vpxor 224(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 608(%rdi), %ymm1, %ymm11
vpxor 640(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 128(%rdi), %ymm1, %ymm8
vpxor 160(%rdi), %ymm5, %ymm9
vpxor 352(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 736(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 64(%rdi), %ymm3, %ymm8
vpxor 256(%rdi), %ymm2, %ymm9
vpxor 448(%rdi), %ymm1, %ymm10
vpxor 480(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 448(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 512(%rdi), %ymm4, %ymm9
vpxor 224(%rdi), %ymm3, %ymm10
vpxor 736(%rdi), %ymm2, %ymm11
vpxor 448(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 8(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 576(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 640(%rdi), %ymm5, %ymm10
vpxor 352(%rdi), %ymm4, %ymm11
vpxor 64(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 192(%rdi), %ymm4, %ymm8
vpxor 704(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 128(%rdi), %ymm1, %ymm11
vpxor 480(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 768(%rdi), %ymm1, %ymm8
vpxor 320(%rdi), %ymm5, %ymm9
vpxor 32(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 256(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 384(%rdi), %ymm3, %ymm8
vpxor 96(%rdi), %ymm2, %ymm9
vpxor 608(%rdi), %ymm1, %ymm10
vpxor 160(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 608(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 352(%rdi), %ymm4, %ymm9
vpxor 704(%rdi), %ymm3, %ymm10
vpxor 256(%rdi), %ymm2, %ymm11
vpxor 608(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 16(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 736(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 480(%rdi), %ymm5, %ymm10
vpxor 32(%rdi), %ymm4, %ymm11
vpxor 384(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 512(%rdi), %ymm4, %ymm8
vpxor 64(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 768(%rdi), %ymm1, %ymm11
vpxor 160(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 768(%rdi)
vpxor 448(%rdi), %ymm1, %ymm8
vpxor 640(%rdi), %ymm5, %ymm9
vpxor 192(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 96(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 224(%rdi), %ymm3, %ymm8
vpxor 576(%rdi), %ymm2, %ymm9
vpxor 128(%rdi), %ymm1, %ymm10
vpxor 320(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 128(%rdi)
vmovdqa 0(%rdi), %ymm8
vmovdqa 32(%rdi), %ymm9
vmovdqa 64(%rdi), %ymm10
vmovdqa 96(%rdi), %ymm11
vmovdqa 128(%rdi), %ymm12
vpxor 160(%rdi), %ymm8, %ymm8
vpxor 192(%rdi), %ymm9, %ymm9
vpxor 224(%rdi), %ymm10, %ymm10
vpxor 256(%rdi), %ymm11, %ymm11
vpxor 288(%rdi), %ymm12, %ymm12
vpxor 320(%rdi), %ymm8, %ymm8
vpxor 352(%rdi), %ymm9, %ymm9
vpxor 384(%rdi), %ymm10, %ymm10
vpxor 416(%rdi), %ymm11, %ymm11
vpxor 448(%rdi), %ymm12, %ymm12
vpxor 480(%rdi), %ymm8, %ymm8
vpxor 512(%rdi), %ymm9, %ymm9
vpxor 544(%rdi), %ymm10, %ymm10
vpxor 576(%rdi), %ymm11, %ymm11
vpxor 608(%rdi), %ymm12, %ymm12
vpxor 640(%rdi), %ymm8, %ymm8
vpxor 672(%rdi), %ymm9, %ymm9
vpxor 704(%rdi), %ymm10, %ymm10
vpxor 736(%rdi), %ymm11, %ymm11
vpxor 768(%rdi), %ymm12, %ymm12
vpsllq $1, %ymm9, %ymm13
vpsllq $1, %ymm10, %ymm14
vpsllq $1, %ymm11, %ymm15
vpsllq $1, %ymm12, %ymm7
vpsllq $1, %ymm8, %ymm6
vpsrlq $63, %ymm9, %ymm5
vpsrlq $63, %ymm10, %ymm4
vpsrlq $63, %ymm11, %ymm3
vpsrlq $63, %ymm12, %ymm2
vpsrlq $63, %ymm8, %ymm1
vpor %ymm13, %ymm5, %ymm5
vpor %ymm14, %ymm4, %ymm4
vpor %ymm15, %ymm3, %ymm3
vpor %ymm7, %ymm2, %ymm2
vpor %ymm6, %ymm1, %ymm1
vpxor %ymm5, %ymm12, %ymm5
vpxor %ymm4, %ymm8, %ymm4
vpxor %ymm3, %ymm9, %ymm3
vpxor %ymm2, %ymm10, %ymm2
vpxor %ymm1, %ymm11, %ymm1
vpxor 0(%rdi), %ymm5, %ymm8
vpxor 32(%rdi), %ymm4, %ymm9
vpxor 64(%rdi), %ymm3, %ymm10
vpxor 96(%rdi), %ymm2, %ymm11
vpxor 128(%rdi), %ymm1, %ymm12
vpsllq $44, %ymm9, %ymm14
vpsllq $43, %ymm10, %ymm15
vpsllq $21, %ymm11, %ymm7
vpsllq $14, %ymm12, %ymm6
vpsrlq $20, %ymm9, %ymm9
vpsrlq $21, %ymm10, %ymm10
vpsrlq $43, %ymm11, %ymm11
vpsrlq $50, %ymm12, %ymm12
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vpbroadcastq 24(%rsi), %ymm8
vpxor %ymm8, %ymm13, %ymm13
vmovdqa %ymm13, 0(%rdi)
vmovdqa %ymm14, 32(%rdi)
vmovdqa %ymm15, 64(%rdi)
vmovdqa %ymm7, 96(%rdi)
vmovdqa %ymm6, 128(%rdi)
vpxor 256(%rdi), %ymm2, %ymm8
vpxor 288(%rdi), %ymm1, %ymm9
vpxor 160(%rdi), %ymm5, %ymm10
vpxor 192(%rdi), %ymm4, %ymm11
vpxor 224(%rdi), %ymm3, %ymm12
vpsllq $28, %ymm8, %ymm13
vpsllq $20, %ymm9, %ymm14
vpsllq $3, %ymm10, %ymm15
vpsllq $45, %ymm11, %ymm7
vpsllq $61, %ymm12, %ymm6
vpsrlq $36, %ymm8, %ymm8
vpsrlq $44, %ymm9, %ymm9
vpsrlq $61, %ymm10, %ymm10
vpsrlq $19, %ymm11, %ymm11
vpsrlq $3, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 160(%rdi)
vmovdqa %ymm14, 192(%rdi)
vmovdqa %ymm15, 224(%rdi)
vmovdqa %ymm7, 256(%rdi)
vmovdqa %ymm6, 288(%rdi)
vpxor 352(%rdi), %ymm4, %ymm8
vpxor 384(%rdi), %ymm3, %ymm9
vpxor 416(%rdi), %ymm2, %ymm10
vpxor 448(%rdi), %ymm1, %ymm11
vpxor 320(%rdi), %ymm5, %ymm12
vpsllq $1, %ymm8, %ymm13
vpsllq $6, %ymm9, %ymm14
vpsllq $25, %ymm10, %ymm15
#vpsllq $8, %ymm11, %ymm7
vpsllq $18, %ymm12, %ymm6
vpsrlq $63, %ymm8, %ymm8
vpsrlq $58, %ymm9, %ymm9
vpsrlq $39, %ymm10, %ymm10
#vpsrlq $56, %ymm11, %ymm11
vpsrlq $46, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
#vpor %ymm7, %ymm11, %ymm11
vpshufb %ymm0, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 320(%rdi)
vmovdqa %ymm14, 352(%rdi)
vmovdqa %ymm15, 384(%rdi)
vmovdqa %ymm7, 416(%rdi)
vmovdqa %ymm6, 448(%rdi)
vpxor 608(%rdi), %ymm1, %ymm8
vpxor 480(%rdi), %ymm5, %ymm9
vpxor 512(%rdi), %ymm4, %ymm10
vpxor 544(%rdi), %ymm3, %ymm11
vpxor 576(%rdi), %ymm2, %ymm12
vpsllq $27, %ymm8, %ymm13
vpsllq $36, %ymm9, %ymm14
vpsllq $10, %ymm10, %ymm15
vpsllq $15, %ymm11, %ymm7
#vpsllq $56, %ymm12, %ymm6
vpsrlq $37, %ymm8, %ymm8
vpsrlq $28, %ymm9, %ymm9
vpsrlq $54, %ymm10, %ymm10
vpsrlq $49, %ymm11, %ymm11
#vpsrlq $8, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
#vpor %ymm6, %ymm12, %ymm12
vpshufb rho56(%rip), %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 480(%rdi)
vmovdqa %ymm14, 512(%rdi)
vmovdqa %ymm15, 544(%rdi)
vmovdqa %ymm7, 576(%rdi)
vmovdqa %ymm6, 608(%rdi)
vpxor 704(%rdi), %ymm3, %ymm8
vpxor 736(%rdi), %ymm2, %ymm9
vpxor 768(%rdi), %ymm1, %ymm10
vpxor 640(%rdi), %ymm5, %ymm11
vpxor 672(%rdi), %ymm4, %ymm12
vpsllq $62, %ymm8, %ymm13
vpsllq $55, %ymm9, %ymm14
vpsllq $39, %ymm10, %ymm15
vpsllq $41, %ymm11, %ymm7
vpsllq $2, %ymm12, %ymm6
vpsrlq $2, %ymm8, %ymm8
vpsrlq $9, %ymm9, %ymm9
vpsrlq $25, %ymm10, %ymm10
vpsrlq $23, %ymm11, %ymm11
vpsrlq $62, %ymm12, %ymm12
vpor %ymm13, %ymm8, %ymm8
vpor %ymm14, %ymm9, %ymm9
vpor %ymm15, %ymm10, %ymm10
vpor %ymm7, %ymm11, %ymm11
vpor %ymm6, %ymm12, %ymm12
vpandn %ymm10, %ymm9, %ymm13
vpandn %ymm11, %ymm10, %ymm14
vpandn %ymm12, %ymm11, %ymm15
vpandn %ymm8, %ymm12, %ymm7
vpandn %ymm9, %ymm8, %ymm6
vpxor %ymm8, %ymm13, %ymm13
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm10, %ymm15, %ymm15
vpxor %ymm11, %ymm7, %ymm7
vpxor %ymm12, %ymm6, %ymm6
vmovdqa %ymm13, 640(%rdi)
vmovdqa %ymm14, 672(%rdi)
vmovdqa %ymm15, 704(%rdi)
vmovdqa %ymm7, 736(%rdi)
vmovdqa %ymm6, 768(%rdi)
addq $32, %rsi
subq $1, %rax
jnz looptop
ret

+ 173
- 187
crypto_sign/dilithium2/avx2/fips202x4.c Переглянути файл

@@ -1,233 +1,219 @@
#include <immintrin.h>
#include <stdint.h>

#include "fips202.h"
#include "fips202x4.h"
#include "params.h"
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define NROUNDS 24
/*
 * ROL(a, offset): rotate the 64-bit unsigned value `a` left by `offset` bits.
 *
 * Both shift counts are reduced modulo 64, so offset == 0 (or 64) returns `a`
 * unchanged instead of evaluating `a >> 64`, which is undefined behaviour for
 * a 64-bit operand. The two halves never overlap, so they are combined with
 * OR; for offsets 1..63 the result is identical to the previous XOR form.
 *
 * `a` is expected to have a 64-bit unsigned type. `offset` is evaluated twice,
 * so it must not have side effects.
 */
#define ROL(a, offset) (((a) << ((offset) & 63)) | ((a) >> ((64 - (offset)) & 63)))

/*
 * Read eight bytes starting at x and assemble them into a 64-bit word,
 * treating x[0] as the least significant byte (little-endian order).
 */
static uint64_t load64(const uint8_t *x) {
    uint64_t word = 0;
    size_t k = 8;

    /* Walk from the most significant byte down, shifting earlier bytes up. */
    while (k-- > 0) {
        word = (word << 8) | (uint64_t)x[k];
    }

    return word;
}

/*
 * Write the 64-bit word u to the eight bytes starting at x, least significant
 * byte first (little-endian order).
 */
static void store64(uint8_t *x, uint64_t u) {
    uint8_t *dst = x;
    uint8_t *const end = x + 8;

    /* Emit the low byte, then shift the next byte into place. */
    while (dst != end) {
        *dst++ = (uint8_t)(u & 0xFF);
        u >>= 8;
    }
}

/* Use implementation from the Keccak Code Package */
extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s);
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds

static void keccak_absorb4x(__m256i *s,
uint8_t r,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen,
uint8_t p) {
/*
 * Keccak round constants: one 64-bit word per round, listed in round order
 * (NROUNDS entries). This table is what the code in this file hands to
 * PQCLEAN_DILITHIUM2_AVX2_f1600x4 as its `rc` argument.
 */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
    UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
    UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000),
    UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
    UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088),
    UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a),
    UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b),
    UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
    UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
    UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
    UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};

static void keccakx4_absorb_once(__m256i s[25],
unsigned int r,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen,
uint8_t p) {
size_t i;
uint8_t t0[200];
uint8_t t1[200];
uint8_t t2[200];
uint8_t t3[200];
uint64_t *ss = (uint64_t *)s;
uint64_t pos = 0;
__m256i t, idx;

for (i = 0; i < 25; ++i) {
s[i] = _mm256_xor_si256(s[i], s[i]);
s[i] = _mm256_setzero_si256();
}

while (mlen >= r) {
idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);
while (inlen >= r) {
for (i = 0; i < r / 8; ++i) {
ss[4 * i + 0] ^= load64(m0 + 8 * i);
ss[4 * i + 1] ^= load64(m1 + 8 * i);
ss[4 * i + 2] ^= load64(m2 + 8 * i);
ss[4 * i + 3] ^= load64(m3 + 8 * i);
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
s[i] = _mm256_xor_si256(s[i], t);
pos += 8;
}
inlen -= r;

KeccakF1600_StatePermute4x(s);
mlen -= r;
m0 += r;
m1 += r;
m2 += r;
m3 += r;
PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants);
}

for (i = 0; i < r; ++i) {
t0[i] = 0;
t1[i] = 0;
t2[i] = 0;
t3[i] = 0;
}
for (i = 0; i < mlen; ++i) {
t0[i] = m0[i];
t1[i] = m1[i];
t2[i] = m2[i];
t3[i] = m3[i];
for (i = 0; i < inlen / 8; ++i) {
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
s[i] = _mm256_xor_si256(s[i], t);
pos += 8;
}
inlen -= 8 * i;

t0[i] = p;
t1[i] = p;
t2[i] = p;
t3[i] = p;

t0[r - 1] |= 128;
t1[r - 1] |= 128;
t2[r - 1] |= 128;
t3[r - 1] |= 128;

for (i = 0; i < r / 8; ++i) {
ss[4 * i + 0] ^= load64(t0 + 8 * i);
ss[4 * i + 1] ^= load64(t1 + 8 * i);
ss[4 * i + 2] ^= load64(t2 + 8 * i);
ss[4 * i + 3] ^= load64(t3 + 8 * i);
if (inlen) {
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1));
t = _mm256_and_si256(t, idx);
s[i] = _mm256_xor_si256(s[i], t);
}
}

t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen);
s[i] = _mm256_xor_si256(s[i], t);
t = _mm256_set1_epi64x((long long)(1ULL << 63));
s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t);
}

static void keccak_squeezeblocks4x(uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
static void keccakx4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
uint8_t r,
__m256i *s) {
uint64_t *ss = (uint64_t *)s;
unsigned int r,
__m256i s[25]) {
unsigned int i;
__m128d t;

while (nblocks > 0) {
KeccakF1600_StatePermute4x(s);
for (size_t i = 0; i < r / 8; ++i) {
store64(h0 + 8 * i, ss[4 * i + 0]);
store64(h1 + 8 * i, ss[4 * i + 1]);
store64(h2 + 8 * i, ss[4 * i + 2]);
store64(h3 + 8 * i, ss[4 * i + 3]);
PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants);
for (i = 0; i < r / 8; ++i) {
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
_mm_storel_pd((double *)&out0[8 * i], t);
_mm_storeh_pd((double *)&out1[8 * i], t);
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1));
_mm_storel_pd((double *)&out2[8 * i], t);
_mm_storeh_pd((double *)&out3[8 * i], t);
}

h0 += r;
h1 += r;
h2 += r;
h3 += r;
out0 += r;
out1 += r;
out2 += r;
out3 += r;
--nblocks;
}

}

void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen) {
keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F);
/*
 * Four-way SHAKE128 absorb.
 *
 * Forwards to keccakx4_absorb_once with the vector state held in state->s,
 * the rate SHAKE128_RATE and the byte 0x1F as the helper's `p` argument
 * (presumably the SHAKE domain-separation/padding byte - confirm against
 * FIPS 202).
 *
 * state          - four-way Keccak state to absorb into.
 * in0 .. in3     - the four input buffers, one per parallel instance.
 * inlen          - length in bytes, shared by all four inputs.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t nblocks,
__m256i *s) {
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s);
/*
 * Four-way SHAKE128 squeeze of whole blocks.
 *
 * Forwards to keccakx4_squeezeblocks with the rate SHAKE128_RATE, so each of
 * the four output buffers receives nblocks * SHAKE128_RATE bytes.
 *
 * out0 .. out3   - the four output buffers, one per parallel instance.
 * nblocks        - number of full rate-sized blocks to produce per output.
 * state          - four-way Keccak state; it is permuted by the helper.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s);
}

void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen) {
keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F);
/*
 * Four-way SHAKE256 absorb.
 *
 * Forwards to keccakx4_absorb_once with the vector state held in state->s,
 * the rate SHAKE256_RATE and the byte 0x1F as the helper's `p` argument
 * (presumably the SHAKE domain-separation/padding byte - confirm against
 * FIPS 202).
 *
 * state          - four-way Keccak state to absorb into.
 * in0 .. in3     - the four input buffers, one per parallel instance.
 * inlen          - length in bytes, shared by all four inputs.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t nblocks,
__m256i *s) {
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s);
/*
 * Four-way SHAKE256 squeeze of whole blocks.
 *
 * Forwards to keccakx4_squeezeblocks with the rate SHAKE256_RATE, so each of
 * the four output buffers receives nblocks * SHAKE256_RATE bytes.
 *
 * out0 .. out3   - the four output buffers, one per parallel instance.
 * nblocks        - number of full rate-sized blocks to produce per output.
 * state          - four-way Keccak state; it is permuted by the helper.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s);
}

void PQCLEAN_DILITHIUM2_AVX2_shake128_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen) {
size_t nblocks = hlen / SHAKE128_RATE;
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
unsigned int i;
size_t nblocks = outlen / SHAKE128_RATE;
uint8_t t[4][SHAKE128_RATE];
__m256i s[25];
PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen);
PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s);
h0 += nblocks * SHAKE128_RATE;
h1 += nblocks * SHAKE128_RATE;
h2 += nblocks * SHAKE128_RATE;
h3 += nblocks * SHAKE128_RATE;
hlen -= nblocks * SHAKE128_RATE;
if (hlen) {
PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s);
for (size_t i = 0; i < hlen; ++i) {
h0[i] = t[0][i];
h1[i] = t[1][i];
h2[i] = t[2][i];
h3[i] = t[3][i];
keccakx4_state state;
PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen);
PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);
out0 += nblocks * SHAKE128_RATE;
out1 += nblocks * SHAKE128_RATE;
out2 += nblocks * SHAKE128_RATE;
out3 += nblocks * SHAKE128_RATE;
outlen -= nblocks * SHAKE128_RATE;
if (outlen) {
PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state);
for (i = 0; i < outlen; ++i) {
out0[i] = t[0][i];
out1[i] = t[1][i];
out2[i] = t[2][i];
out3[i] = t[3][i];
}
}
}

void PQCLEAN_DILITHIUM2_AVX2_shake256_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen) {
size_t nblocks = hlen / SHAKE256_RATE;
void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
unsigned int i;
size_t nblocks = outlen / SHAKE256_RATE;
uint8_t t[4][SHAKE256_RATE];
__m256i s[25];
PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen);
PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s);
h0 += nblocks * SHAKE256_RATE;
h1 += nblocks * SHAKE256_RATE;
h2 += nblocks * SHAKE256_RATE;
h3 += nblocks * SHAKE256_RATE;
hlen -= nblocks * SHAKE256_RATE;
if (hlen) {
PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s);
for (size_t i = 0; i < hlen; ++i) {
h0[i] = t[0][i];
h1[i] = t[1][i];
h2[i] = t[2][i];
h3[i] = t[3][i];
keccakx4_state state;
PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen);
PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);
out0 += nblocks * SHAKE256_RATE;
out1 += nblocks * SHAKE256_RATE;
out2 += nblocks * SHAKE256_RATE;
out3 += nblocks * SHAKE256_RATE;
outlen -= nblocks * SHAKE256_RATE;
if (outlen) {
PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state);
for (i = 0; i < outlen; ++i) {
out0[i] = t[0][i];
out1[i] = t[1][i];
out2[i] = t[2][i];
out3[i] = t[3][i];
}
}
}

+ 49
- 51
crypto_sign/dilithium2/avx2/fips202x4.h Переглянути файл

@@ -5,62 +5,60 @@
#include <stddef.h>
#include <stdint.h>

#include "params.h"
/*
 * State of four Keccak instances processed in parallel.
 *
 * s holds 25 256-bit vectors. The squeeze code in fips202x4.c writes the four
 * 64-bit elements of each vector to out0, out1, out2 and out3 respectively,
 * i.e. vector i carries 64-bit word i of each of the four instances.
 */
typedef struct {
__m256i s[25];
} keccakx4_state;

void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen);
/*
 * Permutation applied to a four-way Keccak state, updating it in place.
 *
 * s  - the 25-vector state (the `s` member of keccakx4_state).
 * rc - table of 64-bit round constants; fips202x4.c passes its 24-entry
 *      KeccakF_RoundConstants array.
 *
 * NOTE(review): presumably implemented in f1600x4.S; the AVX2 assembly there
 * accesses the state with vmovdqa, which would require s to be 32-byte
 * aligned - confirm.
 */
void PQCLEAN_DILITHIUM2_AVX2_f1600x4(__m256i *s, const uint64_t *rc);

void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t nblocks,
__m256i *s);
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen);
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state);

void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t nblocks,
__m256i *s);
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

void PQCLEAN_DILITHIUM2_AVX2_shake128_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen);
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state);

void PQCLEAN_DILITHIUM2_AVX2_shake256_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
size_t hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen);
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen);

#endif

+ 233
- 275
crypto_sign/dilithium2/avx2/invntt.S Переглянути файл

@@ -1,282 +1,240 @@
#include "cdecl.h"
.include "shuffle.inc"
#include "cdecl.inc"

.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
vpaddd %ymm2,%ymm\l0,%ymm12
vpaddd %ymm2,%ymm\l1,%ymm13
vpaddd %ymm2,%ymm\l2,%ymm14

vpsubd %ymm\h0,%ymm12,%ymm12
vpsubd %ymm\h1,%ymm13,%ymm13
vpsubd %ymm\h2,%ymm14,%ymm14

vpmuludq %ymm\z0,%ymm12,%ymm12
vpmuludq %ymm\z0,%ymm13,%ymm13
vpaddd %ymm2,%ymm\l3,%ymm15

vpmuludq %ymm\z1,%ymm14,%ymm14
vpsubd %ymm\h3,%ymm15,%ymm15
vpaddd %ymm\l0,%ymm\h0,%ymm\l0

vpmuludq %ymm\z1,%ymm15,%ymm15
vpaddd %ymm\l1,%ymm\h1,%ymm\l1
vpaddd %ymm\l2,%ymm\h2,%ymm\l2

vpaddd %ymm\l3,%ymm\h3,%ymm\l3

vpmuludq %ymm0,%ymm12,%ymm\h0
vpmuludq %ymm0,%ymm13,%ymm\h1
vpmuludq %ymm0,%ymm14,%ymm\h2
vpmuludq %ymm0,%ymm15,%ymm\h3
vpmuludq %ymm1,%ymm\h0,%ymm\h0
vpmuludq %ymm1,%ymm\h1,%ymm\h1
vpmuludq %ymm1,%ymm\h2,%ymm\h2
vpmuludq %ymm1,%ymm\h3,%ymm\h3
vpaddq %ymm12,%ymm\h0,%ymm\h0
vpaddq %ymm13,%ymm\h1,%ymm\h1
vpaddq %ymm14,%ymm\h2,%ymm\h2
vpaddq %ymm15,%ymm\h3,%ymm\h3
vpsrlq $32,%ymm\h0,%ymm\h0
vpsrlq $32,%ymm\h1,%ymm\h1
vpsrlq $32,%ymm\h2,%ymm\h2
vpsrlq $32,%ymm\h3,%ymm\h3

/*
 * Inverse-NTT butterfly on two ymm vectors of eight 32-bit coefficients.
 *   l, h     : ymm register numbers of the two inputs; both are overwritten.
 *   zl0, zl1 : ymm numbers of the multipliers for the first product, applied
 *              to the even / odd 32-bit lanes respectively (callers in this
 *              file load them from the _ZETAS_QINV table).
 *   zh0, zh1 : ymm numbers of the multipliers for the second product, applied
 *              to the even / odd lanes (callers load them from _ZETAS).
 * On exit l holds l + h and h holds the reduced product of (h - l) with the
 * twiddle. ymm12-ymm14 are scratch. ymm0 is read as a multiplier; the entry
 * point loads it from _8XQ.
 * NOTE(review): the sequence looks like a signed Montgomery multiplication
 * (high(t*z) - high(low(t*z*qinv)*q)); confirm against the constants table.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* ymm12 = h - l (the value that gets multiplied); l = l + h */
vpsubd %ymm\l,%ymm\h,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* first product: even lanes of the difference times zl0 -> ymm13;
   odd lanes (moved into even positions by vmovshdup) times zl1 -> ymm14 */
vpmuldq %ymm\zl0,%ymm12,%ymm13
vmovshdup %ymm12,%ymm\h
vpmuldq %ymm\zl1,%ymm\h,%ymm14

/* second product: 64-bit products of the same lanes with zh0 / zh1 */
vpmuldq %ymm\zh0,%ymm12,%ymm12
vpmuldq %ymm\zh1,%ymm\h,%ymm\h

/* vpmuldq only reads the low 32 bits of each 64-bit lane, so this multiplies
   the low halves of the first products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* 32-bit subtract; only the high dword of every 64-bit lane is used below */
vpsubd %ymm13,%ymm12,%ymm12
vpsubd %ymm14,%ymm\h,%ymm\h

/* recombine: high dwords of the even-lane results are moved to the even
   positions, odd-lane results are taken from the odd dwords (mask 0xAA) */
vmovshdup %ymm12,%ymm12
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h
.endm

.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
#consts
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm6
vmovdqa 32(%rsi),%ymm7
vmovdqa 64(%rsi),%ymm5
vmovdqa 96(%rsi),%ymm10

#reorder
shuffle8 6,5,8,5
shuffle8 7,10,6,10

shuffle4 8,6,4,6
shuffle4 5,10,8,10

vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vpsrlq $32,%ymm10,%ymm11

level0:
vpmovzxdq (%rdx),%ymm3
vpmovzxdq 16(%rdx),%ymm15
vpaddd %ymm2,%ymm4,%ymm12
vpaddd %ymm2,%ymm6,%ymm13
vpaddd %ymm2,%ymm8,%ymm14

vpsubd %ymm5,%ymm12,%ymm12
vpsubd %ymm7,%ymm13,%ymm13
vpsubd %ymm9,%ymm14,%ymm14

vpmuludq %ymm3,%ymm12,%ymm12
vpmuludq %ymm15,%ymm13,%ymm13
vpaddd %ymm2,%ymm10,%ymm15

vpsubd %ymm11,%ymm15,%ymm15
vpaddd %ymm4,%ymm5,%ymm4
vpaddd %ymm6,%ymm7,%ymm6
vpmovzxdq 32(%rdx),%ymm5
vpmovzxdq 48(%rdx),%ymm7

vpmuludq %ymm5,%ymm14,%ymm14
vpmuludq %ymm7,%ymm15,%ymm15
vpaddd %ymm8,%ymm9,%ymm8

vpaddd %ymm10,%ymm11,%ymm10

vpmuludq %ymm0,%ymm12,%ymm5
vpmuludq %ymm0,%ymm13,%ymm7
vpmuludq %ymm0,%ymm14,%ymm9
vpmuludq %ymm0,%ymm15,%ymm11
vpmuludq %ymm1,%ymm5,%ymm5
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm9,%ymm9
vpmuludq %ymm1,%ymm11,%ymm11
vpaddq %ymm12,%ymm5,%ymm5
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm9,%ymm9
vpaddq %ymm15,%ymm11,%ymm11
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm7,%ymm7
vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11

level1:
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpmovzxdq 64(%rdx),%ymm15
vpmovzxdq 80(%rdx),%ymm3

butterfly 4,5,8,9,6,7,10,11

level2:
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpmovzxdq 96(%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11,3,3

#shuffle
shuffle4 4,5,3,5
shuffle4 6,7,4,7
shuffle4 8,9,6,9
shuffle4 10,11,8,11

level3:
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 112(%rdx),%ymm14
vpbroadcastd 116(%rdx),%ymm15
vpblendd $0xF0,%ymm15,%ymm14,%ymm10

butterfly 3,4,6,8,5,7,9,11,10,10

#shuffle
shuffle8 3,4,10,4
shuffle8 6,8,3,8
shuffle8 5,7,6,7
shuffle8 9,11,5,11

level4:
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 120(%rdx),%ymm9

butterfly 10,3,6,5,4,8,7,11,9,9

#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm5,96(%rdi)
vmovdqa %ymm4,128(%rdi)
vmovdqa %ymm8,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)
/*
 * Inverse NTT levels 0-5 for one 256-byte block of the coefficient array.
 *   off : block index; the entry point expands this macro for off = 0..3.
 * Loads eight ymm vectors from 256*off(%rdi), runs six butterfly levels with
 * twiddles read relative to %rsi (tables _ZETAS_QINV and _ZETAS), and writes
 * the eight vectors back to the same 256-byte block.
 * Writes ymm1-ymm15 (ymm12-ymm14 through butterfly); ymm0 is only read.
 * shuffle2 / shuffle4 / shuffle8 come from shuffle.inc, which is not part of
 * this file.
 */
.macro levels0t5 off
/* load the block: eight consecutive 32-byte vectors into ymm4..ymm11 */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

/* level 0 */
/* Each register pair gets its own twiddle vector. vpermq $0x1B loads the
   vector with its four 64-bit quadwords in reverse order; vmovshdup then
   copies the odd dwords into the even positions so that ymm1/ymm2 serve the
   even lanes and ymm3/ymm15 the odd lanes inside butterfly. */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,5,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 6,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,9,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 10,11,1,3,2,15

/* level 1 */
/* one twiddle vector is shared by two butterflies (table base 168) */
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,6,1,3,2,15
butterfly 5,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,10,1,3,2,15
butterfly 9,11,1,3,2,15

/* level 2 */
/* one twiddle vector is shared by all four butterflies (table base 104) */
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,8,1,3,2,15
butterfly 5,9,1,3,2,15
butterfly 6,10,1,3,2,15
butterfly 7,11,1,3,2,15

/* level 3 */
/* re-pair the registers with shuffle2; from here on the working set is
   ymm3..ymm11 and the butterflies use the default twiddle registers
   ymm1 (from _ZETAS_QINV) and ymm2 (from _ZETAS) for all lanes */
shuffle2 4,5,3,5
shuffle2 6,7,4,7
shuffle2 8,9,6,9
shuffle2 10,11,8,11

vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2
butterfly 3,5
butterfly 4,7
butterfly 6,9
butterfly 8,11

/* level 4 */
shuffle4 3,4,10,4
shuffle4 6,8,3,8
shuffle4 5,7,6,7
shuffle4 9,11,5,11

vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2
butterfly 10,4
butterfly 3,8
butterfly 6,7
butterfly 5,11

/* level 5 */
shuffle8 10,3,9,3
shuffle8 6,5,10,5
shuffle8 4,8,6,8
shuffle8 7,11,4,11

/* a single twiddle (table index 7-off) is broadcast to every lane */
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2
butterfly 9,3
butterfly 10,5
butterfly 6,8
butterfly 4,11

/* store back to the same block; the register order differs from the load
   order because of the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm10,256*\off+ 32(%rdi)
vmovdqa %ymm6,256*\off+ 64(%rdi)
vmovdqa %ymm4,256*\off+ 96(%rdi)
vmovdqa %ymm3,256*\off+128(%rdi)
vmovdqa %ymm5,256*\off+160(%rdi)
vmovdqa %ymm8,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

ret
/*
 * Inverse NTT levels 6-7 followed by a final constant multiplication.
 *   off : column index; the entry point expands this macro for off = 0..3.
 * Loads eight ymm vectors that lie 128 bytes apart, starting at
 * 32*off(%rdi), runs two butterfly levels with broadcast twiddles, and
 * stores all eight vectors back to the addresses they were loaded from.
 * ymm8-ymm11 are stored exactly as they leave the level-7 butterfly; only
 * ymm4-ymm7 go through the extra multiplication by the _8XDIV_QINV / _8XDIV
 * constants.
 * Writes ymm1, ymm2 and ymm4-ymm15; ymm0 is only read.
 */
.macro levels6t7 off
vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* level 6 */
/* twiddle index 3 for the lower four vectors, index 2 for the upper four */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

/* level 7 */
/* a single twiddle (index 0) is used for all four butterflies */
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* the upper half is finished: store it now, which also frees ymm8/ymm9
   for use as scratch registers below */
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)

/* multiply ymm4 and ymm5 by the _8XDIV constant using the same
   multiply-and-reduce sequence as in butterfly (ymm1 = _8XDIV_QINV,
   ymm2 = _8XDIV, ymm0 as the reduction multiplier); even and odd dwords
   are processed separately and re-blended with mask 0xAA */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1
vmovdqa (_8XDIV)*4(%rsi),%ymm2
vpmuldq %ymm1,%ymm4,%ymm12
vpmuldq %ymm1,%ymm5,%ymm13
vmovshdup %ymm4,%ymm8
vmovshdup %ymm5,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm4,%ymm4
vpmuldq %ymm2,%ymm5,%ymm5
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm4,%ymm4
vpsubd %ymm13,%ymm5,%ymm5
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm4,%ymm4
vmovshdup %ymm5,%ymm5
vpblendd $0xAA,%ymm8,%ymm4,%ymm4
vpblendd $0xAA,%ymm9,%ymm5,%ymm5

/* same sequence for ymm6 and ymm7 */
vpmuldq %ymm1,%ymm6,%ymm12
vpmuldq %ymm1,%ymm7,%ymm13
vmovshdup %ymm6,%ymm8
vmovshdup %ymm7,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm6,%ymm6
vpmuldq %ymm2,%ymm7,%ymm7
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm6,%ymm6
vpsubd %ymm13,%ymm7,%ymm7
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm6,%ymm6
vmovshdup %ymm7,%ymm7
vpblendd $0xAA,%ymm8,%ymm6,%ymm6
vpblendd $0xAA,%ymm9,%ymm7,%ymm7

/* store the scaled lower half */
vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
.endm

.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
#consts
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
vmovdqa 256(%rsi),%ymm5
vmovdqa 512(%rsi),%ymm6
vmovdqa 768(%rsi),%ymm7
vmovdqa 1024(%rsi),%ymm8
vmovdqa 1280(%rsi),%ymm9
vmovdqa 1536(%rsi),%ymm10
vmovdqa 1792(%rsi),%ymm11

level5:
vpbroadcastd (%rdx),%ymm3
vpbroadcastd 4(%rdx),%ymm15
vpaddd %ymm2,%ymm4,%ymm12
vpaddd %ymm2,%ymm6,%ymm13
vpaddd %ymm2,%ymm8,%ymm14

vpsubd %ymm5,%ymm12,%ymm12
vpsubd %ymm7,%ymm13,%ymm13
vpsubd %ymm9,%ymm14,%ymm14

vpmuludq %ymm3,%ymm12,%ymm12
vpmuludq %ymm15,%ymm13,%ymm13
vpaddd %ymm2,%ymm10,%ymm15

vpsubd %ymm11,%ymm15,%ymm15
vpaddd %ymm4,%ymm5,%ymm4
vpaddd %ymm6,%ymm7,%ymm6
vpbroadcastd 8(%rdx),%ymm5
vpbroadcastd 12(%rdx),%ymm7

vpmuludq %ymm5,%ymm14,%ymm14
vpmuludq %ymm7,%ymm15,%ymm15
vpaddd %ymm8,%ymm9,%ymm8

vpaddd %ymm10,%ymm11,%ymm10

vpmuludq %ymm0,%ymm12,%ymm5
vpmuludq %ymm0,%ymm13,%ymm7
vpmuludq %ymm0,%ymm14,%ymm9
vpmuludq %ymm0,%ymm15,%ymm11
vpmuludq %ymm1,%ymm5,%ymm5
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm9,%ymm9
vpmuludq %ymm1,%ymm11,%ymm11
vpaddq %ymm12,%ymm5,%ymm5
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm9,%ymm9
vpaddq %ymm15,%ymm11,%ymm11
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm7,%ymm7
vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11

level6:
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 16(%rdx),%ymm15
vpbroadcastd 20(%rdx),%ymm3

butterfly 4,5,8,9,6,7,10,11

level7:
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 24(%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11,3,3

#consts
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3

vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
vpmuludq %ymm3,%ymm6,%ymm6
vpmuludq %ymm3,%ymm7,%ymm7
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm0,%ymm6,%ymm14
vpmuludq %ymm0,%ymm7,%ymm15
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm12,%ymm4,%ymm4
vpaddq %ymm13,%ymm5,%ymm5
vpaddq %ymm14,%ymm6,%ymm6
vpaddq %ymm15,%ymm7,%ymm7
vpsrlq $32,%ymm4,%ymm4
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7

#store
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6
vpermd %ymm7,%ymm3,%ymm7
vpermd %ymm8,%ymm3,%ymm8
vpermd %ymm9,%ymm3,%ymm9
vpermd %ymm10,%ymm3,%ymm10
vpermd %ymm11,%ymm3,%ymm11
vmovdqa %xmm4,(%rdi)
vmovdqa %xmm5,128(%rdi)
vmovdqa %xmm6,256(%rdi)
vmovdqa %xmm7,384(%rdi)
vmovdqa %xmm8,512(%rdi)
vmovdqa %xmm9,640(%rdi)
vmovdqa %xmm10,768(%rdi)
vmovdqa %xmm11,896(%rdi)
.text
/*
 * Entry point of the in-place inverse NTT. ntt.h declares it as
 *   void PQCLEAN_DILITHIUM2_AVX2_invntt_avx(__m256i *a, const __m256i *qdata);
 * The macros below address the coefficient array through %rdi and the
 * constants table through %rsi.
 * The symbol is exported under two spellings, cdecl(name) and _cdecl(name);
 * what these expand to is defined by the included cdecl header -- presumably
 * one of them carries the leading underscore some platforms require (verify
 * against that header).
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx):
/* ymm0 = the _8XQ constant vector; it stays live for every butterfly */
vmovdqa _8XQ*4(%rsi),%ymm0

/* levels 0-5, one 256-byte block of the array per expansion */
levels0t5 0
levels0t5 1
levels0t5 2
levels0t5 3

/* levels 6-7 and the final scaling, one 32-byte column per expansion */
levels6t7 0
levels6t7 1
levels6t7 2
levels6t7 3

ret

+ 177
- 157
crypto_sign/dilithium2/avx2/ntt.S Переглянути файл

@@ -1,179 +1,199 @@
#include "cdecl.h"
.include "shuffle.inc"
#include "cdecl.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
#mul
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1
vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2
vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3

#reduce
vpmuludq %ymm0,%ymm\rh0,%ymm12
vpmuludq %ymm0,%ymm\rh1,%ymm13
vpmuludq %ymm0,%ymm\rh2,%ymm14
vpmuludq %ymm0,%ymm\rh3,%ymm15
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm\rh0,%ymm12,%ymm12
vpaddq %ymm\rh1,%ymm13,%ymm13
vpaddq %ymm\rh2,%ymm14,%ymm14
vpaddq %ymm\rh3,%ymm15,%ymm15
vpsrlq $32,%ymm12,%ymm12
vpsrlq $32,%ymm13,%ymm13
vpsrlq $32,%ymm14,%ymm14
vpsrlq $32,%ymm15,%ymm15

#update
vpaddd %ymm2,%ymm\rl0,%ymm\rh0
vpaddd %ymm2,%ymm\rl1,%ymm\rh1
vpaddd %ymm2,%ymm\rl2,%ymm\rh2
vpaddd %ymm2,%ymm\rl3,%ymm\rh3
vpaddd %ymm12,%ymm\rl0,%ymm\rl0
vpaddd %ymm13,%ymm\rl1,%ymm\rl1
vpaddd %ymm14,%ymm\rl2,%ymm\rl2
vpaddd %ymm15,%ymm\rl3,%ymm\rl3
vpsubd %ymm12,%ymm\rh0,%ymm\rh0
vpsubd %ymm13,%ymm\rh1,%ymm\rh1
vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm

.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
#consts
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2

level0:
#zetas
vpbroadcastd (%rdx),%ymm3

#load
vpmovzxdq (%rsi),%ymm4
vpmovzxdq 128(%rsi),%ymm5
vpmovzxdq 256(%rsi),%ymm6
vpmovzxdq 384(%rsi),%ymm7
vpmovzxdq 512(%rsi),%ymm8
vpmovzxdq 640(%rsi),%ymm9
vpmovzxdq 768(%rsi),%ymm10
vpmovzxdq 896(%rsi),%ymm11

butterfly 4,5,6,7,8,9,10,11

level1:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13

butterfly 4,5,8,9,6,7,10,11,12,12,13,13

level2:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpbroadcastd 12(%rdx),%ymm12
vpbroadcastd 16(%rdx),%ymm13
vpbroadcastd 20(%rdx),%ymm14
vpbroadcastd 24(%rdx),%ymm15

butterfly 4,6,8,10,5,7,9,11,12,13,14,15

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,256(%rdi)
vmovdqa %ymm6,512(%rdi)
vmovdqa %ymm7,768(%rdi)
vmovdqa %ymm8,1024(%rdi)
vmovdqa %ymm9,1280(%rdi)
vmovdqa %ymm10,1536(%rdi)
vmovdqa %ymm11,1792(%rdi)
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
vpmuldq %ymm\zl0,%ymm\h,%ymm13
vmovshdup %ymm\h,%ymm12
vpmuldq %ymm\zl1,%ymm12,%ymm14

ret
vpmuldq %ymm\zh0,%ymm\h,%ymm\h
vpmuldq %ymm\zh1,%ymm12,%ymm12

vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

vmovshdup %ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h

vpsubd %ymm\h,%ymm\l,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
#consts
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
vmovdqa 32(%rsi),%ymm5
vmovdqa 64(%rsi),%ymm6
vmovdqa 96(%rsi),%ymm7
vmovdqa 128(%rsi),%ymm8
vmovdqa 160(%rsi),%ymm9
vmovdqa 192(%rsi),%ymm10
vmovdqa 224(%rsi),%ymm11

level3:
#zetas
vpbroadcastd (%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11

level4:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13
vpblendd $0xF0,%ymm13,%ymm12,%ymm12
vmovshdup %ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13

vpaddd %ymm13,%ymm12,%ymm\h
vpsubd %ymm13,%ymm\l,%ymm\l
.endm

/*
 * Forward NTT levels 0-1 for one 32-byte column of the coefficient array.
 *   off : column index; the entry point expands this macro for off = 0..3.
 * Loads eight ymm vectors that lie 128 bytes apart, starting at
 * 32*off(%rdi), applies two butterfly levels with broadcast twiddles read
 * relative to %rsi (tables _ZETAS_QINV and _ZETAS), and stores the vectors
 * back to the addresses they were loaded from.
 * ymm1 / ymm2 carry the twiddle pair that butterfly uses by default.
 */
.macro levels0t1 off
/* level 0 */
/* one twiddle (table index 1) for all four butterflies */
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2

vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* pair each of the first four vectors with the one 512 bytes further on */
butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* level 1 */
/* table index 2 for the lower four vectors, index 3 for the upper four */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

/* write all eight vectors back in place */
vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)
.endm

.macro levels2t7 off
/* level 2 */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

butterfly 3,8,4,9,5,10,6,11,12,12,12,12
/* level 3 */
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2

level5:
#zetas
vpmovzxdq 12(%rdx),%ymm12
butterfly 3,5
butterfly 8,10
butterfly 4,6
butterfly 9,11

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

butterfly 7,5,3,10,8,6,4,11,12,12,12,12

level6:
#zetas
vpmovzxdq 28(%rdx),%ymm12
vpmovzxdq 44(%rdx),%ymm13

butterfly 7,5,8,6,3,10,4,11,12,12,13,13

level7:
#zetas
vpmovzxdq 60(%rdx),%ymm12
vpmovzxdq 76(%rdx),%ymm13
vpmovzxdq 92(%rdx),%ymm14
vpmovzxdq 108(%rdx),%ymm15

butterfly 7,3,8,4,5,10,6,11,12,13,14,15

#store
vpsllq $32,%ymm5,%ymm5
vpsllq $32,%ymm10,%ymm10
vpsllq $32,%ymm6,%ymm6
vpsllq $32,%ymm11,%ymm11
vpblendd $0xAA,%ymm5,%ymm7,%ymm7
vpblendd $0xAA,%ymm10,%ymm3,%ymm3
vpblendd $0xAA,%ymm6,%ymm8,%ymm8
vpblendd $0xAA,%ymm11,%ymm4,%ymm4
/* level 4 */
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2

butterfly 7,8
butterfly 5,6
butterfly 3,4
butterfly 10,11

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

/* level 5 */
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15

butterfly 9,5,1,10,2,15
butterfly 8,4,1,10,2,15
butterfly 7,3,1,10,2,15
butterfly 6,11,1,10,2,15

/* level 6 */
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,7,1,10,2,15
butterfly 8,6,1,10,2,15

vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,3,1,10,2,15
butterfly 4,11,1,10,2,15

/* level 7 */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,8,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 7,6,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,4,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 3,11,1,10,2,15

vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm8,256*\off+ 32(%rdi)
vmovdqa %ymm7,256*\off+ 64(%rdi)
vmovdqa %ymm6,256*\off+ 96(%rdi)
vmovdqa %ymm5,256*\off+128(%rdi)
vmovdqa %ymm4,256*\off+160(%rdi)
vmovdqa %ymm3,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

shuffle4 7,3,5,3
shuffle4 8,4,7,4
.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx):
vmovdqa _8XQ*4(%rsi),%ymm0

shuffle8 5,7,6,7
shuffle8 3,4,5,4
levels0t1 0
levels0t1 1
levels0t1 2
levels0t1 3

vmovdqa %ymm6,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm4,96(%rdi)
levels2t7 0
levels2t7 1
levels2t7 2
levels2t7 3

ret


+ 8
- 30
crypto_sign/dilithium2/avx2/ntt.h Переглянути файл

@@ -1,36 +1,14 @@
#ifndef NTT_H
#define NTT_H
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTT_H
#define PQCLEAN_DILITHIUM2_AVX2_NTT_H

#include <stdint.h>
#include <immintrin.h>

#include "nttconsts.h"
#include "params.h"
void PQCLEAN_DILITHIUM2_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);
void PQCLEAN_DILITHIUM2_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);

void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(
uint64_t *tmp,
const uint32_t *a,
const uint32_t *zetas
);
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(
uint32_t *a,
const uint64_t *tmp,
const uint32_t *zetas
);
void PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx(__m256i *a);

void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(
uint64_t *tmp,
const uint32_t *a,
const uint32_t *zetas_inv
);
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(
uint32_t *a,
const uint64_t *tmp,
const uint32_t *zetas_inv
);

void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(
uint32_t *c, const uint32_t *a, const uint32_t *b);
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(
uint32_t *c, const uint32_t *a, const uint32_t *b);
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata);

#endif

+ 0
- 80
crypto_sign/dilithium2/avx2/nttconsts.c Переглянути файл

@@ -1,80 +0,0 @@
#include "nttconsts.h"

#define QINV 4236238847 // -q^(-1) mod 2^32
#define MONT 4193792ULL
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

#undef QINV
#undef MONT
#undef DIV


const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas = {
.as_arr = {
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776,
3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667,
5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191,
3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439,
7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422,
1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579,
8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390,
7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620,
5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868,
3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076,
6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435,
5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599,
3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165,
7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031,
6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064,
2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112,
162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237,
8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977,
1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735,
6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892,
5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443,
7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090,
4834730, 7018208, 1976782
}
};

const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv = {
.as_arr = {
6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985,
4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748,
2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646,
1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087,
177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422,
6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573,
5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061,
6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386,
1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252,
1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818,
1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496,
7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525,
6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443,
7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093,
5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647,
4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669,
43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330,
1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900,
6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326,
4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745,
3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293,
518909, 2608894, 3975713
}
};

+ 0
- 27
crypto_sign/dilithium2/avx2/nttconsts.h Переглянути файл

@@ -1,27 +0,0 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H
#define PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H

#include <immintrin.h>
#include <stdint.h>

#include "alignment.h"
#include "params.h"

typedef ALIGNED_UINT32(8) aligned_uint32x8_t;

typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv;

extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;

#endif //PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H


+ 108
- 144
crypto_sign/dilithium2/avx2/packing.c Переглянути файл

@@ -3,6 +3,7 @@
#include "poly.h"
#include "polyvec.h"


/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_pk
*
@@ -12,17 +13,18 @@
* - const uint8_t rho[]: byte array containing rho
* - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(
uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES],
const uint8_t rho[SEEDBYTES],
const polyveck *t1) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES],
const uint8_t rho[SEEDBYTES],
const polyveck *t1) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
pk[i] = rho[i];
}
pk += SEEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]);
}
}

@@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM2_AVX2_pack_pk(
* - const polyveck *t1: pointer to output vector t1
* - uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(
uint8_t rho[SEEDBYTES],
polyveck *t1,
const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES],
polyveck *t1,
const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
rho[i] = pk[i];
}
pk += SEEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sk
*
* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
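* Description: Bit-pack secret key. Bytes are written in the order (rho, key, tr, s1, s2, t0); note that this differs from the argument order (rho, tr, key, t0, s1, s2).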
*
* Arguments: - uint8_t sk[]: output byte array
* - const uint8_t rho[]: byte array containing rho
* - const uint8_t key[]: byte array containing key
* - const uint8_t tr[]: byte array containing tr
* - const uint8_t key[]: byte array containing key
* - const polyveck *t0: pointer to vector t0
* - const polyvecl *s1: pointer to vector s1
* - const polyveck *s2: pointer to vector s2
* - const polyveck *t0: pointer to vector t0
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(
uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t key[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
sk[i] = rho[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < SEEDBYTES; ++i) {
for (i = 0; i < SEEDBYTES; ++i) {
sk[i] = key[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < CRHBYTES; ++i) {
for (i = 0; i < CRHBYTES; ++i) {
sk[i] = tr[i];
}
sk += CRHBYTES;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]);
for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]);
}
sk += L * POLETA_SIZE_PACKED;
sk += L * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]);
}
sk += K * POLETA_SIZE_PACKED;
sk += K * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sk
*
* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0).
* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2).
*
* Arguments: - const uint8_t rho[]: output byte array for rho
* - const uint8_t key[]: output byte array for key
* - const uint8_t tr[]: output byte array for tr
* - const uint8_t key[]: output byte array for key
* - const polyveck *t0: pointer to output vector t0
* - const polyvecl *s1: pointer to output vector s1
* - const polyveck *s2: pointer to output vector s2
* - const polyveck *r0: pointer to output vector t0
* - uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(
uint8_t rho[SEEDBYTES],
uint8_t key[SEEDBYTES],
uint8_t tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
rho[i] = sk[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < SEEDBYTES; ++i) {
for (i = 0; i < SEEDBYTES; ++i) {
key[i] = sk[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < CRHBYTES; ++i) {
for (i = 0; i < CRHBYTES; ++i) {
tr[i] = sk[i];
}
sk += CRHBYTES;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED);
for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES);
}
sk += L * POLETA_SIZE_PACKED;
sk += L * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES);
}
sk += K * POLETA_SIZE_PACKED;
sk += K * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sig
*
* Description: Bit-pack signature sig = (z, h, c).
* Description: Bit-pack signature sig = (c, z, h).
*
* Arguments: - uint8_t sig[]: output byte array
* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM2_AVX2_challenge hash length SEEDBYTES
* - const polyvecl *z: pointer to vector z
* - const polyveck *h: pointer to hint vector h
* - const poly *c: pointer to challenge polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(
uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
const polyvecl *z,
const polyveck *h,
const poly *c) {
size_t k;
uint64_t signs, mask;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]);
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES],
const uint8_t c[SEEDBYTES],
const polyvecl *z,
const polyveck *h) {
unsigned int i, j, k;

for (i = 0; i < SEEDBYTES; ++i) {
sig[i] = c[i];
}
sig += L * POLZ_SIZE_PACKED;
sig += SEEDBYTES;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]);
}
sig += L * POLYZ_PACKEDBYTES;

/* Encode h */
for (i = 0; i < OMEGA + K; ++i) {
sig[i] = 0;
}

k = 0;
for (size_t i = 0; i < K; ++i) {
for (size_t j = 0; j < N; ++j) {
for (i = 0; i < K; ++i) {
for (j = 0; j < N; ++j) {
if (h->vec[i].coeffs[j] != 0) {
sig[k++] = (uint8_t)j;
sig[k++] = (uint8_t) j;
}
}

sig[OMEGA + i] = (uint8_t)k;
}
while (k < OMEGA) {
sig[k++] = 0;
}
sig += OMEGA + K;

/* Encode c */
signs = 0;
mask = 1;
for (size_t i = 0; i < N / 8; ++i) {
sig[i] = 0;
for (size_t j = 0; j < 8; ++j) {
if (c->coeffs[8 * i + j] != 0) {
sig[i] |= (uint8_t)(1u << j);
if (c->coeffs[8 * i + j] == (Q - 1)) {
signs |= mask;
}
mask <<= 1;
}
}
}
sig += N / 8;
for (size_t i = 0; i < 8; ++i) {
sig[i] = (uint8_t)(signs >> 8u * i);
sig[OMEGA + i] = (uint8_t) k;
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sig
*
* Description: Unpack signature sig = (z, h, c).
* Description: Unpack signature sig = (c, z, h).
*
* Arguments: - polyvecl *z: pointer to output vector z
* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM2_AVX2_challenge hash
* - polyvecl *z: pointer to output vector z
* - polyveck *h: pointer to output hint vector h
* - poly *c: pointer to output challenge polynomial
* - const uint8_t sig[]: byte array containing
* bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(
polyvecl *z,
polyveck *h,
poly *c,
const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) {
size_t k;
uint64_t signs;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED);
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES],
polyvecl *z,
polyveck *h,
const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) {
unsigned int i, j, k;

for (i = 0; i < SEEDBYTES; ++i) {
c[i] = sig[i];
}
sig += L * POLZ_SIZE_PACKED;
sig += SEEDBYTES;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES);
}
sig += L * POLYZ_PACKEDBYTES;

/* Decode h */
k = 0;
for (size_t i = 0; i < K; ++i) {
for (size_t j = 0; j < N; ++j) {
for (i = 0; i < K; ++i) {
for (j = 0; j < N; ++j) {
h->vec[i].coeffs[j] = 0;
}

@@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(
return 1;
}

for (size_t j = k; j < sig[OMEGA + i]; ++j) {
for (j = k; j < sig[OMEGA + i]; ++j) {
/* Coefficients are ordered for strong unforgeability */
if (j > k && sig[j] <= sig[j - 1]) {
return 1;
@@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(
}

/* Extra indices are zero for strong unforgeability */
for (size_t j = k; j < OMEGA; ++j) {
for (j = k; j < OMEGA; ++j) {
if (sig[j]) {
return 1;
}
}

sig += OMEGA + K;

/* Decode c */
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = 0;
}

signs = 0;
for (size_t i = 0; i < 8; ++i) {
signs |= (uint64_t)sig[N / 8 + i] << 8 * i;
}

/* Extra sign bits are zero for strong unforgeability */
if (signs >> 60) {
return 1;
}

for (size_t i = 0; i < N / 8; ++i) {
for (size_t j = 0; j < 8; ++j) {
if ((sig[i] >> j) & 0x01) {
c->coeffs[8 * i + j] = 1;
c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1));
signs >>= 1;
}
}
}

return 0;
}

+ 23
- 34
crypto_sign/dilithium2/avx2/packing.h Переглянути файл

@@ -1,42 +1,31 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM2_AVX2_PACKING_H

#include "api.h"
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);

void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2);

void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);

void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_pack_pk(
uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES],
const uint8_t rho[SEEDBYTES],
const polyveck *t1);
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(
uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t key[SEEDBYTES],
const uint8_t tr[SEEDBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(
uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES],
const polyvecl *z, const polyveck *h, const poly *c);
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(
uint8_t rho[SEEDBYTES],
polyveck *t1,
const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(
uint8_t rho[SEEDBYTES],
uint8_t key[SEEDBYTES],
uint8_t tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk);
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(
polyvecl *z,
polyveck *h,
poly *c,
const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]);
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]);

#endif

+ 26
- 14
crypto_sign/dilithium2/avx2/params.h Переглянути файл

@@ -2,28 +2,40 @@
#define PQCLEAN_DILITHIUM2_AVX2_PARAMS_H



#define SEEDBYTES 32
#define CRHBYTES 48
#define N 256
#define Q 8380417
#define QBITS 23
#define D 14
#define GAMMA1 ((Q - 1)/16)
#define GAMMA2 (GAMMA1/2)
#define ALPHA (2*GAMMA2)
#define D 13
#define ROOT_OF_UNITY 1753

#define K 4
#define L 3
#define ETA 6
#define SETABITS 4
#define BETA 325
#define L 4
#define ETA 2
#define TAU 39
#define BETA 78
#define GAMMA1 (1 << 17)
#define GAMMA2 ((Q-1)/88)
#define OMEGA 80
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2"


#define POLYT1_PACKEDBYTES 320
#define POLYT0_PACKEDBYTES 416
#define POLYVECH_PACKEDBYTES (OMEGA + K)

#define POLYZ_PACKEDBYTES 576

#define POLYW1_PACKEDBYTES 192

#define POLYETA_PACKEDBYTES 96

#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8)
#define POLT0_SIZE_PACKED ((N*D)/8)
#define POLETA_SIZE_PACKED ((N*SETABITS)/8)
#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8)
#define POLW1_SIZE_PACKED ((N*4)/8)
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+ L*POLYETA_PACKEDBYTES \
+ K*POLYETA_PACKEDBYTES \
+ K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif

+ 80
- 71
crypto_sign/dilithium2/avx2/pointwise.S Переглянути файл

@@ -1,11 +1,14 @@
#include "params.h"
#include "cdecl.inc"
#include "cdecl.h"

.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
#consts
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop1:
@@ -18,41 +21,41 @@ vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm6,%ymm7
vmovshdup %ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vpsrlq $32,%ymm14,%ymm15
vmovshdup %ymm14,%ymm15

#mul
vpmuludq %ymm2,%ymm10,%ymm2
vpmuludq %ymm3,%ymm11,%ymm3
vpmuludq %ymm4,%ymm12,%ymm4
vpmuludq %ymm5,%ymm13,%ymm5
vpmuludq %ymm6,%ymm14,%ymm6
vpmuludq %ymm7,%ymm15,%ymm7
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5
vpmuldq %ymm6,%ymm14,%ymm6
vpmuldq %ymm7,%ymm15,%ymm7

#reduce
vpmuludq %ymm0,%ymm2,%ymm10
vpmuludq %ymm0,%ymm3,%ymm11
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm0,%ymm6,%ymm14
vpmuludq %ymm0,%ymm7,%ymm15
vpmuludq %ymm1,%ymm10,%ymm10
vpmuludq %ymm1,%ymm11,%ymm11
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm2,%ymm10,%ymm2
vpaddq %ymm3,%ymm11,%ymm3
vpaddq %ymm4,%ymm12,%ymm4
vpaddq %ymm5,%ymm13,%ymm5
vpaddq %ymm6,%ymm14,%ymm6
vpaddq %ymm7,%ymm15,%ymm7
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm0,%ymm6,%ymm14
vpmuldq %ymm0,%ymm7,%ymm15
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpmuldq %ymm1,%ymm14,%ymm14
vpmuldq %ymm1,%ymm15,%ymm15
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsubq %ymm14,%ymm6,%ymm6
vpsubq %ymm15,%ymm7,%ymm7
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vpsrlq $32,%ymm6,%ymm6
vmovshdup %ymm6,%ymm6

#store
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
@@ -67,7 +70,7 @@ add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1
jb _looptop1

vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
@@ -75,30 +78,30 @@ vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuludq %ymm2,%ymm10,%ymm2
vpmuludq %ymm3,%ymm11,%ymm3
vpmuludq %ymm4,%ymm12,%ymm4
vpmuludq %ymm5,%ymm13,%ymm5
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5

#reduce
vpmuludq %ymm0,%ymm2,%ymm10
vpmuludq %ymm0,%ymm3,%ymm11
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm1,%ymm10,%ymm10
vpmuludq %ymm1,%ymm11,%ymm11
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpaddq %ymm2,%ymm10,%ymm2
vpaddq %ymm3,%ymm11,%ymm3
vpaddq %ymm4,%ymm12,%ymm4
vpaddq %ymm5,%ymm13,%ymm5
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vmovshdup %ymm4,%ymm4

#store
vpblendd $0x55,%ymm2,%ymm3,%ymm2
@@ -116,14 +119,14 @@ vmovdqa \off(%rdx),%ymm10
vmovdqa \off+32(%rdx),%ymm12
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuludq %ymm6,%ymm10,%ymm6
vpmuludq %ymm7,%ymm11,%ymm7
vpmuludq %ymm8,%ymm12,%ymm8
vpmuludq %ymm9,%ymm13,%ymm9
vpmuldq %ymm6,%ymm10,%ymm6
vpmuldq %ymm7,%ymm11,%ymm7
vpmuldq %ymm8,%ymm12,%ymm8
vpmuldq %ymm9,%ymm13,%ymm9
.endm

.macro acc
@@ -134,10 +137,12 @@ vpaddq %ymm9,%ymm5,%ymm5
.endm

.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
#consts
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop2:
@@ -155,23 +160,27 @@ acc
pointwise 2048
acc

pointwise 3072
acc




#reduce
vpmuludq %ymm0,%ymm2,%ymm6
vpmuludq %ymm0,%ymm3,%ymm7
vpmuludq %ymm0,%ymm4,%ymm8
vpmuludq %ymm0,%ymm5,%ymm9
vpmuludq %ymm1,%ymm6,%ymm6
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm8,%ymm8
vpmuludq %ymm1,%ymm9,%ymm9
vpaddq %ymm2,%ymm6,%ymm2
vpaddq %ymm3,%ymm7,%ymm3
vpaddq %ymm4,%ymm8,%ymm4
vpaddq %ymm5,%ymm9,%ymm5
vpmuldq %ymm0,%ymm2,%ymm6
vpmuldq %ymm0,%ymm3,%ymm7
vpmuldq %ymm0,%ymm4,%ymm8
vpmuldq %ymm0,%ymm5,%ymm9
vpmuldq %ymm1,%ymm6,%ymm6
vpmuldq %ymm1,%ymm7,%ymm7
vpmuldq %ymm1,%ymm8,%ymm8
vpmuldq %ymm1,%ymm9,%ymm9
vpsubq %ymm6,%ymm2,%ymm2
vpsubq %ymm7,%ymm3,%ymm3
vpsubq %ymm8,%ymm4,%ymm4
vpsubq %ymm9,%ymm5,%ymm5
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vmovshdup %ymm4,%ymm4

#store
vpblendd $0xAA,%ymm3,%ymm2,%ymm2


+ 665
- 539
crypto_sign/dilithium2/avx2/poly.c
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 35
- 39
crypto_sign/dilithium2/avx2/poly.h Переглянути файл

@@ -1,19 +1,14 @@
#ifndef POLY_H
#define POLY_H

#include <immintrin.h>
#include <stdint.h>

#include "alignment.h"
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLY_H
#define PQCLEAN_DILITHIUM2_AVX2_POLY_H
#include "align.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

typedef union {
uint32_t coeffs[N];
__m256i coeffs_x8[N / 8];
} poly;
typedef ALIGNED_INT32(N) poly;

void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a);

void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b);
@@ -21,63 +16,64 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a);

void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h);
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h);

int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, int32_t B);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t *seed,
const uint8_t seed[SEEDBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t *seed,
const uint8_t seed[SEEDBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1(poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0,
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t *seed,
const uint8_t seed[CRHBYTES],
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);

void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t *a);
void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t *a);
void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]);

void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t *a);
void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a);

void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t *r, const poly *a);
#endif

+ 218
- 67
crypto_sign/dilithium2/avx2/polyvec.c Переглянути файл

@@ -1,14 +1,103 @@
#include <stdint.h>

#include "consts.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

#define UNUSED(x) (void)x

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
*              random coefficients a_{i,j} by performing rejection
*              sampling on the output stream of SHAKE128(rho|j|i)
*              or AES256CTR(rho,j|i). The work is delegated row by row
*              to the four per-row expansion helpers (row0..row3).
*
* Arguments:   - polyvecl mat[K]: output matrix; rows 0..3 are written
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
    /* Each helper takes a second row pointer; NULL is passed for it here. */
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(mat + 0, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(mat + 1, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(mat + 2, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(mat + 3, NULL, rho);
}

/*
 * Expand row 0 of the matrix into rowa from seed rho.
 * The four polynomials are sampled together with nonces 0..3 and then each
 * is passed through poly_nttunpack. rowb is accepted but not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const col = rowa->vec;
    unsigned int j;

    UNUSED(rowb);
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(col, col + 1, col + 2, col + 3, rho, 0, 1, 2, 3);
    for (j = 0; j < 4; ++j) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(col + j);
    }
}

/*
 * Expand row 1 of the matrix into rowa from seed rho.
 * The four polynomials are sampled together with nonces 256..259
 * (i.e. (1 << 8) + column) and then each is passed through poly_nttunpack.
 * rowb is accepted but not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const col = rowa->vec;
    unsigned int j;

    UNUSED(rowb);
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(col, col + 1, col + 2, col + 3, rho, 256, 257, 258, 259);
    for (j = 0; j < 4; ++j) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(col + j);
    }
}

/*
 * Expand row 2 of the matrix into rowa from seed rho.
 * The four polynomials are sampled together with nonces 512..515
 * (i.e. (2 << 8) + column) and then each is passed through poly_nttunpack.
 * rowb is accepted but not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const col = rowa->vec;
    unsigned int j;

    UNUSED(rowb);
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(col, col + 1, col + 2, col + 3, rho, 512, 513, 514, 515);
    for (j = 0; j < 4; ++j) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(col + j);
    }
}

/*
 * Expand row 3 of the matrix into rowa from seed rho.
 * The four polynomials are sampled together with nonces 768..771
 * (i.e. (3 << 8) + column) and then each is passed through poly_nttunpack.
 * rowb is accepted but not used.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    poly *const col = rowa->vec;
    unsigned int j;

    UNUSED(rowb);
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(col, col + 1, col + 2, col + 3, rho, 768, 769, 770, 771);
    for (j = 0; j < 4; ++j) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(col + j);
    }
}


/*
 * Matrix-vector product, one output polynomial per matrix row:
 * for each of the K rows, t->vec[row] receives the result of
 * polyvecl_pointwise_acc_montgomery applied to mat[row] and v.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    unsigned int row = 0;

    while (row < K) {
        PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(t->vec + row, mat + row, v);
        ++row;
    }
}

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*
 * Sample every polynomial of a length-L vector with poly_uniform_eta.
 * Polynomial idx is sampled from seed with nonce + idx; the nonce is a
 * uint16_t and is advanced by one after each polynomial.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx;

    for (idx = 0; idx < L; ++idx) {
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(v->vec + idx, seed, nonce);
        ++nonce;
    }
}

/*
 * Sample every polynomial of a length-L vector with poly_uniform_gamma1.
 * Polynomial i is sampled from seed with the 16-bit nonce L * nonce + i,
 * so distinct values of the nonce argument give disjoint per-polynomial
 * nonces (as long as the product does not wrap modulo 2^16).
 *
 * NOTE(review): seed is declared with SEEDBYTES here, but it is forwarded
 * to poly_uniform_gamma1, which poly.h declares with seed[CRHBYTES].
 * Callers presumably pass a CRHBYTES-sized buffer -- confirm and align the
 * declared bound in both the definition and polyvec.h.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int i;

    for (i = 0; i < L; ++i) {
        /* L * nonce + i is computed as unsigned int; narrow it explicitly
         * to the callee's uint16_t nonce instead of relying on an implicit
         * conversion (same value, no narrowing warning). */
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t)(L * nonce + i));
    }
}

/* Apply poly_reduce to each of the L polynomials of v, in place. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_AVX2_poly_reduce(v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze
*
@@ -18,7 +107,9 @@
* Arguments: - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) {
for (size_t i = 0; i < L; ++i) {
unsigned int i;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]);
}
}
@@ -34,7 +125,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) {
* - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
for (size_t i = 0; i < L; ++i) {
unsigned int i;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
}
}
@@ -48,44 +141,60 @@ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const
* Arguments: - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v) {
for (size_t i = 0; i < L; ++i) {
unsigned int i;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]);
}
}

/* Apply poly_invntt_tomont to each of the L polynomials of v, in place. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(v->vec + idx);
        ++idx;
    }
}

/*
 * Multiply every polynomial of v by the single polynomial a:
 * r->vec[idx] = poly_pointwise_montgomery(a, v->vec[idx]) for idx in [0, L).
 */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(r->vec + idx, a, v->vec + idx);
        ++idx;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
* resulting vector by 2^{-32} and add (accumulate) polynomials
* in it. Input/output vectors are in NTT domain representation.
* Input coefficients are assumed to be less than 22*Q. Output
* coefficients are less than 2*L*Q.
*
* Arguments: - poly *w: output polynomial
* - const polyvecl *u: pointer to first input vector
* - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
const polyvecl *u,
const polyvecl *v) {
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->coeffs, u->vec->coeffs, v->vec->coeffs);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) {
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM2_AVX2_qdata.vec);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
* Assumes input coefficients to be standard representatives.
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce().
*
* Arguments: - const polyvecl *v: pointer to vector
* - uint32_t B: norm bound
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
for (size_t i = 0; i < L; ++i) {
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
unsigned int i;

for (i = 0; i < L; ++i) {
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
@@ -98,37 +207,48 @@ int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound)
/************ Vectors of polynomials of length K **************/
/**************************************************************/

/*
 * Sample every polynomial of a length-K vector with poly_uniform_eta.
 * Polynomial idx is sampled from seed with nonce + idx; the nonce is a
 * uint16_t and is advanced by one after each polynomial.
 */
void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx;

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(v->vec + idx, seed, nonce);
        ++nonce;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
* to representatives in [0,2*Q[.
* to representatives in [-6283009,6283007].
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&v->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq
*
* Description: For all coefficients of polynomials in vector of length K
* subtract Q if coefficient is bigger than Q.
* add Q if coefficient is negative.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_csubq(&v->vec[i]);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&v->vec[i]);
}
}

/*************************************************
* Name: polyveck_freeze
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze
*
* Description: Reduce coefficients of polynomials in vector of length K
* to standard representatives.
@@ -136,7 +256,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) {
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]);
}
}
@@ -152,7 +274,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) {
* - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
}
}
@@ -161,8 +285,7 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_sub
*
* Description: Subtract vectors of polynomials of length K.
* Assumes coefficients of polynomials in second input vector
* to be less than 2*Q. No modular reduction is performed.
* No modular reduction is performed.
*
* Arguments: - polyveck *w: pointer to output vector
* - const polyveck *u: pointer to first input vector
@@ -170,7 +293,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const
* subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]);
}
}
@@ -179,12 +304,14 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl
*
* Description: Multiply vector of polynomials of Length K by 2^D without modular
* reduction. Assumes input coefficients to be less than 2^{32-D}.
* reduction. Assumes input coefficients to be less than 2^{31-D}.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&v->vec[i]);
}
}
@@ -198,13 +325,15 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) {
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
* in vector of length K. Input coefficients need to be less
@@ -212,9 +341,19 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) {
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&v->vec[i]);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&v->vec[i]);
}
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery
*
* Description: Apply PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery to the
*              single polynomial a and each of the K polynomials of v, in
*              ascending index order, storing result j in r->vec[j].
*
* Arguments:   - polyveck *r: pointer to output vector
*              - const poly *a: pointer to the polynomial used in every call
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    unsigned int row = 0;

    while (row < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&r->vec[row], a, &v->vec[row]);
        row += 1;
    }
}

@@ -222,16 +361,18 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) {
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
* Assumes input coefficients to be standard representatives.
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce().
*
* Arguments: - const polyveck *v: pointer to vector
* - uint32_t B: norm bound
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials are strictly smaller than B and 1
* otherwise.
* Returns 0 if the norms of all polynomials are strictly smaller than
* B <= (Q-1)/8 and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) {
for (size_t i = 0; i < K; ++i) {
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) {
unsigned int i;

for (i = 0; i < K; ++i) {
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
@@ -244,18 +385,20 @@ int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound)
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute a0, a1 such that a mod Q = a1*2^D + a0
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
* standard representatives.
*
* Arguments: - polyveck *v1: pointer to output vector of polynomials with
* coefficients a1
* - polyveck *v0: pointer to output vector of polynomials with
* coefficients Q + a0
* coefficients a0
* - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
}
}
@@ -264,7 +407,7 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0
* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
* Assumes coefficients to be standard representatives.
@@ -272,12 +415,13 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co
* Arguments: - polyveck *v1: pointer to output vector of polynomials with
* coefficients a1
* - polyveck *v0: pointer to output vector of polynomials with
* coefficients Q + a0
* coefficients a0
* - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(
polyveck *v1, polyveck *v0, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]);
}
}
@@ -287,37 +431,44 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(
*
* Description: Compute hint vector.
*
* Arguments: - polyveck *h: pointer to output vector
* Arguments: - uint8_t *hint: pointer to output hint array
* - const polyveck *v0: pointer to low part of input vector
* - const polyveck *v1: pointer to high part of input vector
*
* Returns number of 1 bits.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(
polyveck *h,
const polyveck *v0,
const polyveck *v1) {
uint32_t s = 0;

for (size_t i = 0; i < K; ++i) {
s += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]);
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) {
unsigned int i, n = 0;

for (i = 0; i < K; ++i) {
n += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]);
}

return s;
return n;
}

/*************************************************
* Name: polyveck_use_hint
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector.
*
* Arguments: - polyveck *w: pointer to output vector of polynomials with
* corrected high bits
* - const polyveck *v: pointer to input vector
* - const polyveck *u: pointer to input vector
* - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) {
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]);
}
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1
*
* Description: Serialize the K polynomials of w1 back to back by calling
*              PQCLEAN_DILITHIUM2_AVX2_polyw1_pack once per polynomial.
*              Polynomial j is written at byte offset j*POLYW1_PACKEDBYTES.
*
* Arguments:   - uint8_t r[]: output buffer of K*POLYW1_PACKEDBYTES bytes
*              - const polyveck *w1: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) {
    uint8_t *out = r;
    unsigned int row;

    for (row = 0; row < K; row++) {
        PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(out, &w1->vec[row]);
        /* Step to the slot reserved for the next polynomial. */
        out += POLYW1_PACKEDBYTES;
    }
}

+ 42
- 28
crypto_sign/dilithium2/avx2/polyvec.h Переглянути файл

@@ -1,58 +1,72 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H
#define PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H

#include <stdint.h>

#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
poly vec[L];
} polyvecl;

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(
poly *w, const polyvecl *u, const polyvecl *v);
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v);

int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B);

/* Vectors of polynomials of length K */
typedef struct {
poly vec[K];
} polyveck;

void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(
polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(
polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v);

int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(
const polyveck *v, uint32_t B);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(
polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(
polyveck *v1, polyveck *v0, const polyveck *v);
uint32_t PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(
polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(
polyveck *w, const polyveck *v, const polyveck *h);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t B);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]);

void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif

+ 0
- 93
crypto_sign/dilithium2/avx2/reduce.S Переглянути файл

@@ -1,93 +0,0 @@
#include "cdecl.inc"

# Routine: PQCLEAN_DILITHIUM2_AVX2_reduce_avx
# Purpose: in-place update of 1024 bytes of 32-bit lanes starting at the
#          address held in rdi (the first pointer argument under the
#          System V AMD64 calling convention). The loop runs for eight
#          iterations of four ymm registers, i.e. 128 bytes per iteration.
#          Per lane: a = (a & c) - (a >> 23) + ((a >> 23) << 13), with a
#          logical right shift, where c is the lane value loaded from
#          PQCLEAN_DILITHIUM2_AVX2_8x23ones.
#          NOTE(review): the table name suggests c = 2^23-1 in every lane and
#          the formula then matches a partial reduction modulo
#          Q = 2^23 - 2^13 + 1; both constants are defined outside this
#          file, so confirm there.
# Notes:   vmovdqa is the aligned move, so the buffer and the constant
#          table must be 32-byte aligned.
#          Overwrites eax and ymm0-ymm8; rdi is left advanced by 1024.
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
#consts
# ymm0 holds the and-mask c for the whole routine
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0

# eax is the iteration counter, starting at zero
xor %eax,%eax
_looptop_rdc32:
#load
# four vectors of eight 32-bit lanes each
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7

#reduce
# t = a >> 23 (logical shift), kept in the even-numbered registers
vpsrld $23,%ymm1,%ymm2
vpsrld $23,%ymm3,%ymm4
vpsrld $23,%ymm5,%ymm6
vpsrld $23,%ymm7,%ymm8
# a = a & c
vpand %ymm0,%ymm1,%ymm1
vpand %ymm0,%ymm3,%ymm3
vpand %ymm0,%ymm5,%ymm5
vpand %ymm0,%ymm7,%ymm7
# a = a - t
vpsubd %ymm2,%ymm1,%ymm1
vpsubd %ymm4,%ymm3,%ymm3
vpsubd %ymm6,%ymm5,%ymm5
vpsubd %ymm8,%ymm7,%ymm7
# t = t << 13
vpslld $13,%ymm2,%ymm2
vpslld $13,%ymm4,%ymm4
vpslld $13,%ymm6,%ymm6
vpslld $13,%ymm8,%ymm8
# a = a + t
vpaddd %ymm2,%ymm1,%ymm1
vpaddd %ymm4,%ymm3,%ymm3
vpaddd %ymm6,%ymm5,%ymm5
vpaddd %ymm8,%ymm7,%ymm7

#store
# results overwrite the inputs
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm7,96(%rdi)

# advance to the next 128 bytes and repeat until eax reaches 8
add $128,%rdi
add $1,%eax
cmp $8,%eax
jb _looptop_rdc32

ret

# Routine: PQCLEAN_DILITHIUM2_AVX2_csubq_avx
# Purpose: in-place update of 1024 bytes of 32-bit lanes starting at the
#          address held in rdi (the first pointer argument under the
#          System V AMD64 calling convention). The loop runs for eight
#          iterations of four ymm registers, i.e. 128 bytes per iteration.
#          Per lane: t = a - q, then a = t + (q & (t >> 31)) with an
#          arithmetic shift, so q is subtracted and added back whenever the
#          difference is negative as a signed 32-bit value. q is the lane
#          value loaded from PQCLEAN_DILITHIUM2_AVX2_8xq, a table defined
#          outside this file.
# Notes:   vmovdqa is the aligned move, so the buffer and the constant
#          table must be 32-byte aligned.
#          Overwrites eax and ymm0-ymm8; rdi is left advanced by 1024.
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
#consts
# ymm0 holds q in every lane for the whole routine
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0

# eax is the iteration counter, starting at zero
xor %eax,%eax
_looptop_csubq:
#load
# four vectors of eight 32-bit lanes each
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7

#conditional subtraction of q
# a = a - q
vpsubd %ymm0,%ymm1,%ymm1
vpsubd %ymm0,%ymm3,%ymm3
vpsubd %ymm0,%ymm5,%ymm5
vpsubd %ymm0,%ymm7,%ymm7
# mask = a >> 31 (arithmetic shift): all ones when a is negative, else zero
vpsrad $31,%ymm1,%ymm2
vpsrad $31,%ymm3,%ymm4
vpsrad $31,%ymm5,%ymm6
vpsrad $31,%ymm7,%ymm8
# mask = mask & q: q for negative lanes, zero for the others
vpand %ymm0,%ymm2,%ymm2
vpand %ymm0,%ymm4,%ymm4
vpand %ymm0,%ymm6,%ymm6
vpand %ymm0,%ymm8,%ymm8
# a = a + mask: undo the subtraction where it went negative
vpaddd %ymm2,%ymm1,%ymm1
vpaddd %ymm4,%ymm3,%ymm3
vpaddd %ymm6,%ymm5,%ymm5
vpaddd %ymm8,%ymm7,%ymm7

#store
# results overwrite the inputs
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm7,96(%rdi)

# advance to the next 128 bytes and repeat until eax reaches 8
add $128,%rdi
add $1,%eax
cmp $8,%eax
jb _looptop_csubq

ret

+ 0
- 9
crypto_sign/dilithium2/avx2/reduce.h Переглянути файл

@@ -1,9 +0,0 @@
#ifndef REDUCE_H
#define REDUCE_H

#include <stdint.h>

void PQCLEAN_DILITHIUM2_AVX2_reduce_avx(uint32_t a[N]);
void PQCLEAN_DILITHIUM2_AVX2_csubq_avx(uint32_t a[N]);

#endif

+ 111
- 144
crypto_sign/dilithium2/avx2/rejsample.c Переглянути файл

@@ -1,9 +1,10 @@
#include <immintrin.h>

#include "params.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>

static const uint8_t idx[256][8] = {
const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
@@ -262,178 +263,144 @@ static const uint8_t idx[256][8] = {
{ 0, 1, 2, 3, 4, 5, 6, 7}
};

uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_uniform(
uint32_t *r,
size_t len,
const uint8_t *buf,
size_t buflen) {
uint32_t ctr, pos, vec[8];
__m256i d, tmp;
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) {
unsigned int ctr, pos;
uint32_t good;
__m256i d, tmp;
const __m256i bound = _mm256_set1_epi32(Q);
const __m256i mask = _mm256_set1_epi32(0x7FFFFF);
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10,
-1, 9, 8, 7, -1, 6, 5, 4,
-1, 11, 10, 9, -1, 8, 7, 6,
-1, 5, 4, 3, -1, 2, 1, 0);

ctr = pos = 0;
while (ctr + 8 <= len && pos + 24 <= buflen) {
for (size_t i = 0; i < 8; i++) {
vec[i] = buf[pos++];
vec[i] |= (uint32_t)buf[pos++] << 8;
vec[i] |= (uint32_t)buf[pos++] << 16;
vec[i] &= 0x7FFFFF;
}
while (pos <= REJ_UNIFORM_BUFLEN - 24) {
d = _mm256_loadu_si256((__m256i *)&buf[pos]);
d = _mm256_permute4x64_epi64(d, 0x94);
d = _mm256_shuffle_epi8(d, idx8);
d = _mm256_and_si256(d, mask);
pos += 24;

d = _mm256_loadu_si256((__m256i_u *)vec);
tmp = _mm256_cmpgt_epi32(bound, d);
tmp = _mm256_sub_epi32(d, bound);
good = _mm256_movemask_ps((__m256)tmp);

__m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]);
tmp = _mm256_cvtepu8_epi32(rid);
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]));
d = _mm256_permutevar8x32_epi32(d, tmp);
_mm256_storeu_si256((__m256i_u *)&r[ctr], d);
ctr += __builtin_popcount(good);
}

while (ctr < len && pos + 3 <= buflen) {
vec[0] = buf[pos++];
vec[0] |= (uint32_t)buf[pos++] << 8;
vec[0] |= (uint32_t)buf[pos++] << 16;
vec[0] &= 0x7FFFFF;
_mm256_storeu_si256((__m256i *)&r[ctr], d);
ctr += _mm_popcnt_u32(good);

if (vec[0] < Q) {
r[ctr++] = vec[0];
if (ctr > N - 8) {
break;
}
}

return ctr;
}

uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_eta(
uint32_t *r,
size_t len,
const uint8_t *buf,
size_t buflen) {
uint32_t ctr, pos;
uint8_t vec[32];
__m256i tmp0, tmp1;
__m128i d0, d1, rid;
uint32_t good;
const __m256i bound = _mm256_set1_epi8(2 * ETA + 1);
const __m256i off = _mm256_set1_epi32(Q + ETA);
uint32_t t;
while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) {
t = buf[pos++];
t |= (uint32_t)buf[pos++] << 8;
t |= (uint32_t)buf[pos++] << 16;
t &= 0x7FFFFF;

ctr = pos = 0;
while (ctr + 32 <= len && pos + 16 <= buflen) {
for (size_t i = 0; i < 16; i++) {
vec[2 * i + 0] = buf[pos] & 0x0F;
vec[2 * i + 1] = buf[pos++] >> 4;
}

tmp0 = _mm256_loadu_si256((__m256i_u *)vec);
tmp1 = _mm256_cmpgt_epi8(bound, tmp0);
good = _mm256_movemask_epi8(tmp1);

d0 = _mm256_castsi256_si128(tmp0);
rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]);
d1 = _mm_shuffle_epi8(d0, rid);
tmp1 = _mm256_cvtepu8_epi32(d1);
tmp1 = _mm256_sub_epi32(off, tmp1);
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
ctr += __builtin_popcount(good & 0xFF);

d0 = _mm_bsrli_si128(d0, 8);
rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]);
d1 = _mm_shuffle_epi8(d0, rid);
tmp1 = _mm256_cvtepu8_epi32(d1);
tmp1 = _mm256_sub_epi32(off, tmp1);
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
ctr += __builtin_popcount((good >> 8) & 0xFF);

d0 = _mm256_extracti128_si256(tmp0, 1);
rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]);
d1 = _mm_shuffle_epi8(d0, rid);
tmp1 = _mm256_cvtepu8_epi32(d1);
tmp1 = _mm256_sub_epi32(off, tmp1);
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
ctr += __builtin_popcount((good >> 16) & 0xFF);

d0 = _mm_bsrli_si128(d0, 8);
rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]);
d1 = _mm_shuffle_epi8(d0, rid);
tmp1 = _mm256_cvtepu8_epi32(d1);
tmp1 = _mm256_sub_epi32(off, tmp1);
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
ctr += __builtin_popcount((good >> 24) & 0xFF);
}

while (ctr < len && pos < buflen) {
vec[0] = buf[pos] & 0x0F;
vec[1] = buf[pos++] >> 4;

if (vec[0] <= 2 * ETA) {
r[ctr++] = Q + ETA - vec[0];
}
if (vec[1] <= 2 * ETA && ctr < len) {
r[ctr++] = Q + ETA - vec[1];
if (t < Q) {
r[ctr++] = t;
}
}

return ctr;
}

uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1(
uint32_t *r,
size_t len,
const uint8_t *buf,
size_t buflen) {
uint32_t ctr, pos;
uint32_t vec[8];
__m256i d, tmp;
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) {
unsigned int ctr, pos;
uint32_t good;
const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1);
const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1);
__m256i f0, f1, f2;
__m128i g0, g1;
const __m256i mask = _mm256_set1_epi8(15);
const __m256i eta = _mm256_set1_epi8(ETA);
const __m256i bound = mask;
const __m256i v = _mm256_set1_epi32(-6560);
const __m256i p = _mm256_set1_epi32(5);

ctr = pos = 0;
while (ctr + 8 <= len && pos + 20 <= buflen) {
for (size_t i = 0; i < 4; i++) {
vec[2 * i + 0] = buf[pos + 0];
vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8;
vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16;
vec[2 * i + 0] &= 0xFFFFF;
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos]));
f1 = _mm256_slli_epi16(f0, 4);
f0 = _mm256_or_si256(f0, f1);
f0 = _mm256_and_si256(f0, mask);

vec[2 * i + 1] = buf[pos + 2] >> 4;
vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4;
vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12;
f1 = _mm256_sub_epi8(f0, bound);
f0 = _mm256_sub_epi8(eta, f0);
good = _mm256_movemask_epi8(f1);

pos += 5;
g0 = _mm256_castsi256_si128(f0);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

if (ctr > N - 8) {
break;
}
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

d = _mm256_loadu_si256((__m256i_u *)vec);
tmp = _mm256_cmpgt_epi32(bound, d);
good = _mm256_movemask_ps((__m256)tmp);
d = _mm256_sub_epi32(off, d);
if (ctr > N - 8) {
break;
}
g0 = _mm256_extracti128_si256(f0, 1);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

__m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]);
tmp = _mm256_cvtepu8_epi32(rid);
d = _mm256_permutevar8x32_epi32(d, tmp);
_mm256_storeu_si256((__m256i_u *)&r[ctr], d);
ctr += __builtin_popcount(good);
if (ctr > N - 8) {
break;
}
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good);
pos += 4;
}

while (ctr < len && pos + 5 <= buflen) {
vec[0] = buf[pos + 0];
vec[0] |= (uint32_t)buf[pos + 1] << 8;
vec[0] |= (uint32_t)buf[pos + 2] << 16;
vec[0] &= 0xFFFFF;

vec[1] = buf[pos + 2] >> 4;
vec[1] |= (uint32_t)buf[pos + 3] << 4;
vec[1] |= (uint32_t)buf[pos + 4] << 12;

pos += 5;
uint32_t t0, t1;
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) {
t0 = buf[pos] & 0x0F;
t1 = buf[pos++] >> 4;

if (vec[0] <= 2 * GAMMA1 - 2) {
r[ctr++] = Q + GAMMA1 - 1 - vec[0];
if (t0 < 15) {
t0 = t0 - (205 * t0 >> 10) * 5;
r[ctr++] = 2 - t0;
}
if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) {
r[ctr++] = Q + GAMMA1 - 1 - vec[1];
if (t1 < 15 && ctr < N) {
t1 = t1 - (205 * t1 >> 10) * 5;
r[ctr++] = 2 - t1;
}
}



+ 13
- 19
crypto_sign/dilithium2/avx2/rejsample.h Переглянути файл

@@ -1,25 +1,19 @@
#ifndef REJSAMPLE_H
#define REJSAMPLE_H
#include <stddef.h>
#ifndef PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H
#define PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_uniform(
uint32_t *r,
size_t len,
const uint8_t *buf,
size_t buflen);
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)

#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)

extern const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8];

uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_eta(
uint32_t *r,
size_t len,
const uint8_t *buf,
size_t buflen);
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);

uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1(
uint32_t *r,
size_t len,
const uint8_t *buf,
size_t buflen);
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN]);

#endif

+ 119
- 77
crypto_sign/dilithium2/avx2/rounding.c Переглянути файл

@@ -1,115 +1,157 @@
#include "consts.h"
#include "params.h"
#include "rejsample.h"
#include "rounding.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/*
 * Per-32-bit-lane blend of two __m256i vectors, built from the float blend
 * _mm256_blendv_ps through bit-preserving casts (AVX2 has no integer blendv
 * at 32-bit granularity). For each lane the result is taken from b when the
 * most significant bit of the corresponding lane of mask is set, and from a
 * otherwise; the remaining mask bits are ignored, so any value whose sign
 * bit encodes the choice can serve as the mask.
 */
#define _mm256_blendv_epi32(a,b,mask) \
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
_mm256_castsi256_ps(b), \
_mm256_castsi256_ps(mask)))

/*************************************************
* Name: power2round
*
* Description: For finite field element a, compute a0, a1 such that
* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
* Assumes a to be standard representative.
* Description: For finite field elements a, compute a0, a1 such that
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
* Assumes a to be positive standard representative.
*
* Arguments: - uint32_t a: input element
* - uint32_t *a0: pointer to output element Q + a0
* Arguments: - __m256i *a1: output array of length N/8 with high bits
* - __m256i *a0: output array of length N/8 with low bits a0
* - const __m256i *a: input array of length N/8
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0) {
int32_t t;
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
unsigned int i;
__m256i f, f0, f1;
const __m256i mask = _mm256_set1_epi32(-(1 << D));
const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1);

/* Centralized remainder mod 2^D */
t = a & ((1U << D) - 1);
t -= (1U << (D - 1)) + 1;
t += (t >> 31) & (1U << D);
t -= (1U << (D - 1)) - 1;
*a0 = Q + t;
a = (a - t) >> D;
return a;
for (i = 0; i < N / 8; ++i) {
f = _mm256_load_si256(&a[i]);
f1 = _mm256_add_epi32(f, half);
f0 = _mm256_and_si256(f1, mask);
f1 = _mm256_srli_epi32(f1, D);
f0 = _mm256_sub_epi32(f, f0);
_mm256_store_si256(&a1[i], f1);
_mm256_store_si256(&a0[i], f0);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_decompose
* Name: decompose
*
* Description: For finite field element a, compute high and low bits a0, a1 such
* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
* Description: For finite field element a, compute high and low parts a0, a1 such
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard
* representative.
*
* Arguments: - uint32_t a: input element
* - uint32_t *a0: pointer to output element Q + a0
* Arguments: - __m256i *a1: output array of length N/8 with high parts
* - __m256i *a0: output array of length N/8 with low parts a0
* - const __m256i *a: input array of length N/8
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0) {
int32_t t, u;

/* Centralized remainder mod ALPHA */
t = a & 0x7FFFF;
t += (a >> 19) << 9;
t -= ALPHA / 2 + 1;
t += (t >> 31) & ALPHA;
t -= ALPHA / 2 - 1;
a -= t;
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
unsigned int i;
__m256i f, f0, f1, t;
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2_AVX2_qdata.vec[_8XQ / 8]);
const __m256i hq = _mm256_srli_epi32(q, 1);
const __m256i v = _mm256_set1_epi32(11275);
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2);
const __m256i off = _mm256_set1_epi32(127);
const __m256i shift = _mm256_set1_epi32(128);
const __m256i max = _mm256_set1_epi32(43);
const __m256i zero = _mm256_setzero_si256();

/* Divide by ALPHA (possible to avoid) */
u = a - 1;
u >>= 31;
a = (a >> 19) + 1;
a -= u & 1;

/* Border case */
*a0 = Q + t - (a >> 4);
a &= 0xF;
return a;
for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a[i]);
f1 = _mm256_add_epi32(f, off);
f1 = _mm256_srli_epi32(f1, 7);
f1 = _mm256_mulhi_epu16(f1, v);
f1 = _mm256_mulhrs_epi16(f1, shift);
t = _mm256_sub_epi32(max, f1);
f1 = _mm256_blendv_epi32(f1, zero, t);
f0 = _mm256_mullo_epi32(f1, alpha);
f0 = _mm256_sub_epi32(f, f0);
f = _mm256_cmpgt_epi32(f0, hq);
f = _mm256_and_si256(f, q);
f0 = _mm256_sub_epi32(f0, f);
_mm256_store_si256(&a1[i], f1);
_mm256_store_si256(&a0[i], f0);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_make_hint
* Name: make_hint
*
* Description: Compute hint bit indicating whether the low bits of the
* input element overflow into the high bits. Inputs assumed to be
* standard representatives.
* Description: Compute indices of polynomial coefficients whose low bits
* overflow into the high bits.
*
* Arguments: - uint32_t a0: low bits of input element
* - uint32_t a1: high bits of input element
* Arguments: - uint8_t *hint: hint array
* - const __m256i *a0: low bits of input elements
* - const __m256i *a1: high bits of input elements
*
* Returns 1 if high bits of a and b differ and 0 otherwise.
* Returns number of overflowing low bits
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(const uint32_t a0, const uint32_t a1) {
if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) {
return 0;
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) {
unsigned int i, n = 0;
__m256i f0, f1, g0, g1;
uint32_t bad;
uint64_t idx;
const __m256i low = _mm256_set1_epi32(-GAMMA2);
const __m256i high = _mm256_set1_epi32(GAMMA2);

for (i = 0; i < N / 8; ++i) {
f0 = _mm256_load_si256(&a0[i]);
f1 = _mm256_load_si256(&a1[i]);
g0 = _mm256_abs_epi32(f0);
g0 = _mm256_cmpgt_epi32(g0, high);
g1 = _mm256_cmpeq_epi32(f0, low);
g1 = _mm256_sign_epi32(g1, f1);
g0 = _mm256_or_si256(g0, g1);

bad = _mm256_movemask_ps((__m256)g0);
memcpy(&idx, PQCLEAN_DILITHIUM2_AVX2_idxlut[bad], 8);
idx += (uint64_t)0x0808080808080808 * i;
memcpy(&hint[n], &idx, 8);
n += _mm_popcnt_u32(bad);
}

return 1;
return n;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_use_hint
* Name: use_hint
*
* Description: Correct high bits according to hint.
* Description: Correct high parts according to hint.
*
* Arguments: - uint32_t a: input element
* - unsigned int hint: hint bit
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts
* - const __m256i *a: input array of length N/8
* - const __m256i *hint: input array of length N/8 with hint bits
*
* Returns corrected high bits.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(const uint32_t a, const unsigned int hint) {
uint32_t a0, a1;
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) {
unsigned int i;
__m256i a0[N / 8];
__m256i f, g, h, t;
const __m256i zero = _mm256_setzero_si256();
const __m256i max = _mm256_set1_epi32(43);

a1 = PQCLEAN_DILITHIUM2_AVX2_decompose(a, &a0);
if (hint == 0) {
return a1;
PQCLEAN_DILITHIUM2_AVX2_decompose_avx(b, a0, a);
for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a0[i]);
g = _mm256_load_si256(&b[i]);
h = _mm256_load_si256(&hint[i]);
t = _mm256_blendv_epi32(zero, h, f);
t = _mm256_slli_epi32(t, 1);
h = _mm256_sub_epi32(h, t);
g = _mm256_add_epi32(g, h);
g = _mm256_blendv_epi32(g, max, g);
f = _mm256_cmpgt_epi32(g, max);
g = _mm256_blendv_epi32(g, zero, f);
_mm256_store_si256(&b[i], g);
}
if (a0 > Q) {
return (a1 + 1) & 0xF;
}
return (a1 - 1) & 0xF;

/* If decompose does not divide out ALPHA:
if(hint == 0)
return a1;
else if(a0 > Q)
return (a1 + ALPHA) % (Q - 1);
else
return (a1 - ALPHA) % (Q - 1);
*/
}

+ 7
- 7
crypto_sign/dilithium2/avx2/rounding.h Переглянути файл

@@ -1,12 +1,12 @@
#ifndef ROUNDING_H
#define ROUNDING_H

#ifndef PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H
#define PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H
#include "params.h"
#include <immintrin.h>
#include <stdint.h>

uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0);
uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0);
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(uint32_t a0, uint32_t a1);
uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(uint32_t a, unsigned int hint);
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a);
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a);
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1);
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint);

#endif

+ 54
- 0
crypto_sign/dilithium2/avx2/shuffle.S Переглянути файл

@@ -0,0 +1,54 @@
#include "cdecl.h"
.include "shuffle.inc"

.text
/*
 * nttunpack128_avx: in-place reordering of one 256-byte chunk.
 *
 * Input:  %rdi - address of 8 consecutive 32-byte vectors. They are accessed
 *         with vmovdqa, so the address must be 32-byte aligned.
 * Effect: the 8 vectors are loaded into ymm4-ymm11, run through three rounds
 *         of the shuffle8 / shuffle4 / shuffle2 macros from shuffle.inc, and
 *         written back over the same 256 bytes.
 * Writes: ymm3-ymm11 in this routine. NOTE(review): the shuffle macros may
 *         use further scratch registers - check shuffle.inc.
 *
 * NOTE(review): the macro arguments are read here as (in0, in1, out0, out1),
 * which matches the shuffle2 definition; shuffle8 and shuffle4 are presumably
 * the same - confirm against shuffle.inc. An output register may alias an
 * input, so the order of the macro invocations below must not be changed.
 *
 * The label is local (no .global) and is reached via call from the exported
 * entry point further down in this file.
 */
nttunpack128_avx:
#load
/* vector k of the chunk goes to ymm(4+k) */
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

/* round 1: combine vector k with vector k+4; ymm3 becomes live from here */
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

/* round 2: operates on the results of round 1 */
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

/* round 3: operates on the results of round 2 */
shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

#store
/* results sit in ymm9,8,7,6,5,4,3,11 (in output order) after round 3 */
vmovdqa %ymm9,(%rdi)
vmovdqa %ymm8,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm3,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

/*
 * Exported entry point.
 *
 * Input:  %rdi - start of a 1024-byte buffer (four 256-byte chunks).
 * Effect: calls nttunpack128_avx once per chunk, advancing %rdi by 256 bytes
 *         between calls, so the whole buffer is reordered in place.
 *         %rdi is left pointing at the last chunk on return.
 *
 * The symbol is emitted under both the cdecl() and _cdecl() spellings.
 * NOTE(review): presumably this covers targets with and without a leading
 * underscore on C symbol names - confirm against cdecl.h.
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx):
/* chunk 0 */
call nttunpack128_avx
/* chunk 1 */
add $256,%rdi
call nttunpack128_avx
/* chunk 2 */
add $256,%rdi
call nttunpack128_avx
/* chunk 3 */
add $256,%rdi
call nttunpack128_avx
ret

+ 10
- 8
crypto_sign/dilithium2/avx2/shuffle.inc Переглянути файл

@@ -9,15 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle2 r0,r1,r2,r3
vpsllq $32,%ymm\r1,%ymm12
vpsrlq $32,%ymm\r0,%ymm13
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm12
vpsrld $16,%ymm\r0,%ymm13
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

+ 248
- 266
crypto_sign/dilithium2/avx2/sign.c Переглянути файл

@@ -1,6 +1,4 @@
#include <stdint.h>
#include <string.h>

#include "align.h"
#include "fips202.h"
#include "packing.h"
#include "params.h"
@@ -9,93 +7,28 @@
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"
#include <stdint.h>
#include <string.h>

/*************************************************
* Name: expand_mat
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
* random coefficients a_{i,j} by performing rejection
* sampling on the output stream of SHAKE128(rho|i|j).
*
* Arguments: - polyvecl mat[K]: output matrix
* - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[4], const uint8_t rho[SEEDBYTES]) {
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[0].vec[0],
&mat[0].vec[1],
&mat[0].vec[2],
&mat[1].vec[0],
rho, 0, 1, 2, 256);
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[1].vec[1],
&mat[1].vec[2],
&mat[2].vec[0],
&mat[2].vec[1],
rho, 257, 258, 512, 513);
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[2].vec[2],
&mat[3].vec[0],
&mat[3].vec[1],
&mat[3].vec[2],
rho, 514, 768, 769, 770);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_challenge
*
* Description: Implementation of H. Samples polynomial with 60 nonzero
* coefficients in {-1,1} using the output stream of
* SHAKE256(mu|w1).
*
* Arguments: - poly *c: pointer to output polynomial
* - const uint8_t mu[]: byte array containing mu
* - const polyveck *w1: pointer to vector w1
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c,
const uint8_t mu[CRHBYTES],
const polyveck *w1) {
uint8_t b;
size_t pos;
uint64_t signs;
uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
uint8_t outbuf[SHAKE256_RATE];
shake256ctx state;

for (size_t i = 0; i < CRHBYTES; ++i) {
inbuf[i] = mu[i];
}
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]);
}

shake256_absorb(&state, inbuf, sizeof(inbuf));
shake256_squeezeblocks(outbuf, 1, &state);

signs = 0;
for (size_t i = 0; i < 8; ++i) {
signs |= (uint64_t) outbuf[i] << 8 * i;
}

pos = 8;

for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = 0;
}

for (size_t i = 196; i < 256; ++i) {
do {
if (pos >= SHAKE256_RATE) {
shake256_squeezeblocks(outbuf, 1, &state);
pos = 0;
}

b = outbuf[pos++];
} while (b > i);

c->coeffs[i] = c->coeffs[b];
c->coeffs[b] = 1;
c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1));
signs >>= 1;
static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) {
switch (i) {
case 0:
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(buf, buf + 1, rho);
*row = buf;
break;
case 1:
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(buf + 1, buf, rho);
*row = buf + 1;
break;
case 2:
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(buf, buf + 1, rho);
*row = buf;
break;
case 3:
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(buf + 1, buf, rho);
*row = buf + 1;
break;
}
shake256_ctx_release(&state);
}

/*************************************************
@@ -104,56 +37,69 @@ void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c,
* Description: Generates public and private key.
*
* Arguments: - uint8_t *pk: pointer to output public key (allocated
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (allocated
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes)
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
unsigned int i;
uint8_t seedbuf[3 * SEEDBYTES];
uint8_t tr[CRHBYTES];
const uint8_t *rho, *rhoprime, *key;
uint16_t nonce = 0;
polyvecl mat[K];
polyvecl s1, s1hat;
polyveck s2, t, t1, t0;

/* Expand 32 bytes of randomness into rho, rhoprime and key */
randombytes(seedbuf, 3 * SEEDBYTES);
polyvecl rowbuf[2];
polyvecl s1, *row = rowbuf;
polyveck s2;
poly t1, t0;

/* Get randomness for rho, rhoprime and key */
randombytes(seedbuf, SEEDBYTES);
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
rho = seedbuf;
rhoprime = seedbuf + SEEDBYTES;
key = seedbuf + 2 * SEEDBYTES;

/* Expand matrix */
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho);
/* Store rho, key */
memcpy(pk, rho, SEEDBYTES);
memcpy(sk, rho, SEEDBYTES);
memcpy(sk + SEEDBYTES, key, SEEDBYTES);

/* Sample short vectors s1 and s2 */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s2.vec[0], rhoprime,
nonce, nonce + 1, nonce + 2, nonce + 3);
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[1], &s2.vec[2], &s2.vec[3], &t.vec[0], rhoprime,
nonce + 4, nonce + 5, nonce + 6, 0);

/* Matrix-vector multiplication */
s1hat = s1;
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1hat);
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat);
//PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&t.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&t.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3);
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime, 4, 5, 6, 7);

/* Pack secret vectors */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]);
}
for (i = 0; i < K; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]);
}

/* Transform s1 */
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1);

/* Add error vector s2 */
PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&t, &t, &s2);

/* Extract t1 and write public key */
PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&t);
PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(&t1, &t0, &t);
PQCLEAN_DILITHIUM2_AVX2_pack_pk(pk, rho, &t1);
for (i = 0; i < K; i++) {
/* Expand matrix row */
polyvec_matrix_expand_row(&row, rowbuf, rho, i);

/* Compute CRH(rho, t1) and write secret key */
crh(tr, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);
PQCLEAN_DILITHIUM2_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0);
/* Compute inner-product */
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&t1);

/* Add error polynomial */
PQCLEAN_DILITHIUM2_AVX2_poly_add(&t1, &t1, &s2.vec[i]);

/* Round t and pack t1, t0 */
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&t1);
PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&t1, &t0, &t1);
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1);
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0);
}

/* Compute CRH(rho, t1) and store in secret key */
crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);

return 0;
}
@@ -161,42 +107,40 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature
*
* Description: Compute signed message.
* Description: Computes signature.
*
* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES
* of len)
* - size_t *siglen: pointer to output length of signed message
* (should be PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES)
* - uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - uint8_t *sk: pointer to bit-packed secret key
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES)
* - size_t *siglen: pointer to output length of signature
* - uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk) {
uint32_t n;
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
unsigned int i, n, pos;
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint16_t nonce = 0;
poly c, chat;
polyvecl mat[K], s1, y, yhat, z;
polyveck t0, s2, w, w1, w0;
polyveck h, cs2, ct0;
uint8_t hintbuf[N];
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
uint64_t nonce = 0;
polyvecl mat[K], s1, z;
polyveck t0, s2, w1;
poly c, tmp;
union {
polyvecl y;
polyveck w0;
} tmpv;
shake256incctx state;

rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);

PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);

// use incremental hash API instead of copying around buffers
/* Compute CRH(tr, m) */
shake256incctx state;
/* Compute CRH(tr, msg) */
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
@@ -207,76 +151,88 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho);
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(mat, rho);
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t0);


rej:
/* Sample intermediate vector y */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &yhat.vec[0],
rhoprime, nonce, nonce + 1, nonce + 2, 0);
nonce += 3;

/* Matrix-vector multiplication */
yhat = y;
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&yhat);
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&w.vec[i]);
}
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3],
rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3);
nonce += 4;

/* Matrix-vector product */
tmpv.y = z;
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&tmpv.y);
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y);
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(&w1);

/* Decompose w and call the random oracle */
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w);
PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &w0, &w);
PQCLEAN_DILITHIUM2_AVX2_challenge(&c, mu, &w1);
chat = c;
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat);

/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&cs2.vec[i]);
}
PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&w0, &w0, &cs2);
PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&w0);
if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
goto rej;
}
PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(sig, &w1);

shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(sig, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c);

/* Compute z, reject if it reveals secret */
for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&z.vec[i]);
}
PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(&z, &z, &y);
PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(&z);
if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
goto rej;
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&z.vec[i]);
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) {
goto rej;
}
}

/* Compute hints for w1 */
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&ct0.vec[i]);
}
/* Zero hint vector in signature */
pos = 0;
memset(hint, 0, OMEGA);

for (i = 0; i < K; i++) {
/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmpv.w0.vec[i]);
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) {
goto rej;
}

/* Compute hints */
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmp);
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmp, GAMMA2)) {
goto rej;
}

PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&ct0);
if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&ct0, GAMMA2)) {
goto rej;
PQCLEAN_DILITHIUM2_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
n = PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]);
if (pos + n > OMEGA) {
goto rej;
}

/* Store hints in signature */
memcpy(&hint[pos], hintbuf, n);
hint[OMEGA + i] = pos = pos + n;
}

PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&w0, &w0, &ct0);
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w0);
n = PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(&h, &w0, &w1);
if (n > OMEGA) {
goto rej;
/* Pack z into signature */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]);
}

/* Write signature */
PQCLEAN_DILITHIUM2_AVX2_pack_sig(sig, &z, &h, &c);
*siglen = PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;
return 0;
}
@@ -290,63 +246,55 @@ rej:
* array with PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen bytes),
* can be equal to m
* - size_t *smlen: pointer to output length of signed
* message
* message
* - const uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk) {
int rc;
memmove(sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, m, mlen);
rc = PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, m, mlen, sk);
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
size_t i;
for (i = 0; i < mlen; ++i) {
sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
}
PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, mlen, sk);
*smlen += mlen;
return rc;
return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify
*
* Description: Verify signed message.
* Description: Verifies signature.
*
* Arguments: - uint8_t *sig: signature
* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES)
* - uint8_t *m: pointer to message
* - size_t *mlen: pointer to output length of message
* - uint8_t *pk: pointer to bit-packed public key
* Arguments: - const uint8_t *sig: pointer to input signature
* - size_t siglen: length of signature
* - const uint8_t *m: pointer to message
* - size_t mlen: length of message
* - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
* Returns 0 if signature could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk) {
uint8_t rho[SEEDBYTES];
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) {
unsigned int i, j, pos = 0;
/* PQCLEAN_DILITHIUM2_AVX2_polyw1_pack writes additional 14 bytes */
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf;
uint8_t mu[CRHBYTES];
poly c, chat, cp;
polyvecl mat[K], z;
polyveck t1, w1, h, tmp1, tmp2;

if (siglen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) {
return -1;
}
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
polyvecl rowbuf[2];
polyvecl *row = rowbuf;
polyvecl z;
poly c, w1, h;
shake256incctx state;

PQCLEAN_DILITHIUM2_AVX2_unpack_pk(rho, &t1, pk);
if (PQCLEAN_DILITHIUM2_AVX2_unpack_sig(&z, &h, &c, sig)) {
return -1;
}
if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
if (siglen != PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) {
return -1;
}

/* Compute CRH(CRH(rho, t1), msg) */
crh(mu, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);

shake256incctx state;
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
@@ -354,33 +302,69 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* Matrix-vector multiplication; compute Az - c2^dt1 */
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho);
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&z);
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z);
}
/* Expand challenge */
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c);

chat = c;
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat);
PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(&t1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t1);
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]);
/* Unpack z; shortness follows from unpacking */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&z.vec[i]);
}

PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2);
PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(&tmp1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(&tmp1);

/* Reconstruct w1 */
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&tmp1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(&w1, &tmp1, &h);
for (i = 0; i < K; i++) {
/* Expand matrix row */
polyvec_matrix_expand_row(&row, rowbuf, pk, i);

/* Call random oracle and verify challenge */
PQCLEAN_DILITHIUM2_AVX2_challenge(&cp, mu, &w1);
for (size_t i = 0; i < N; ++i) {
if (c.coeffs[i] != cp.coeffs[i]) {
/* Compute i-th row of Az - c2^Dt1 */
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z);

PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES);
PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&h);
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&h);
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&h, &c, &h);

PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w1, &w1, &h);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w1);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&w1);

/* Get hint polynomial and reconstruct w1 */
memset(h.vec, 0, sizeof(poly));
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) {
return -1;
}

for (j = pos; j < hint[OMEGA + i]; ++j) {
/* Coefficients are ordered for strong unforgeability */
if (j > pos && hint[j] <= hint[j - 1]) {
return -1;
}
h.coeffs[hint[j]] = 1;
}
pos = hint[OMEGA + i];

PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&w1);
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w1, &w1, &h);
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1);
}

/* Extra indices are zero for strong unforgeability */
for (j = pos; j < OMEGA; ++j) {
if (hint[j]) {
return -1;
}
}

/* Call random oracle and verify challenge */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
for (i = 0; i < SEEDBYTES; ++i) {
if (buf.coeffs[i] != sig[i]) {
return -1;
}
}
@@ -394,7 +378,7 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
* Description: Verify signed message.
*
* Arguments: - uint8_t *m: pointer to output message (allocated
* array with smlen bytes), can be equal to sm
* array with smlen bytes), can be equal to sm
* - size_t *mlen: pointer to output length of message
* - const uint8_t *sm: pointer to signed message
* - size_t smlen: length of signed message
@@ -402,30 +386,28 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk) {
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) {
size_t i;

if (smlen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) {
goto badsig;
}
*mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;

if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES,
sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) {
*mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;
if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) {
goto badsig;
} else {
/* All good, copy msg, return 0 */
for (size_t i = 0; i < *mlen; ++i) {
for (i = 0; i < *mlen; ++i) {
m[i] = sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + i];
}
return 0;
}

/* Signature verification failed */
badsig:
*mlen = (size_t) -1;
for (size_t i = 0; i < smlen; ++i) {
/* Signature verification failed */
*mlen = -1;
for (i = 0; i < smlen; ++i) {
m[i] = 0;
}



+ 22
- 8
crypto_sign/dilithium2/avx2/sign.h Переглянути файл

@@ -1,15 +1,29 @@
#ifndef SIGN_H
#define SIGN_H

#include "api.h"
#ifndef PQCLEAN_DILITHIUM2_AVX2_SIGN_H
#define PQCLEAN_DILITHIUM2_AVX2_SIGN_H
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES],
const polyveck *w1);
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

#endif
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

#endif

+ 0
- 26
crypto_sign/dilithium2/avx2/stream.c Переглянути файл

@@ -1,26 +0,0 @@
#include "stream.h"

#include <string.h>

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init
*
* Description: Builds the byte string seed || nonce and hands it to
*              shake128_absorb. The 16-bit nonce is appended as two
*              bytes, low byte first.
*
* Arguments:   - shake128ctx *state: SHAKE128 state passed to
*                                    shake128_absorb
*              - const uint8_t seed[]: seed of SEEDBYTES bytes
*              - uint16_t nonce: 2-byte nonce
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(
    shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    uint8_t input[SEEDBYTES + 2];
    uint8_t *tail = input + SEEDBYTES;

    memcpy(input, seed, SEEDBYTES);

    /* Little-endian encoding of the nonce directly after the seed. */
    tail[0] = (uint8_t)(nonce & 0xFF);
    tail[1] = (uint8_t)(nonce >> 8);

    shake128_absorb(state, input, sizeof(input));
}


/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init
*
* Description: Builds the byte string seed || nonce and hands it to
*              shake256_absorb. The 16-bit nonce is appended as two
*              bytes, low byte first.
*
* Arguments:   - shake256ctx *state: SHAKE256 state passed to
*                                    shake256_absorb
*              - const uint8_t seed[]: seed of CRHBYTES bytes
*              - uint16_t nonce: 2-byte nonce
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(
    shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    uint8_t input[CRHBYTES + 2];
    uint8_t *tail = input + CRHBYTES;

    memcpy(input, seed, CRHBYTES);

    /* Little-endian encoding of the nonce directly after the seed. */
    tail[0] = (uint8_t)(nonce & 0xFF);
    tail[1] = (uint8_t)(nonce >> 8);

    shake256_absorb(state, input, sizeof(input));
}

+ 0
- 15
crypto_sign/dilithium2/avx2/stream.h Переглянути файл

@@ -1,15 +0,0 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_STREAM_H
#define PQCLEAN_DILITHIUM2_AVX2_STREAM_H

#include <stdint.h>

#include "fips202.h"
#include "params.h"

void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce);

#endif

+ 26
- 0
crypto_sign/dilithium2/avx2/symmetric-shake.c Переглянути файл

@@ -0,0 +1,26 @@
#include "fips202.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init
*
* Description: Runs the incremental SHAKE128 sequence init -> absorb(seed)
*              -> absorb(nonce) -> finalize on the given state. The 16-bit
*              nonce is absorbed as two bytes, low byte first.
*
* Arguments:   - shake128incctx *state: incremental SHAKE128 state
*              - const uint8_t seed[]: seed of SEEDBYTES bytes
*              - uint16_t nonce: 2-byte nonce
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    uint8_t nonce_le[2];

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);

    /* Little-endian nonce follows the seed in the absorbed stream. */
    nonce_le[0] = (uint8_t)(nonce & 0xFF);
    nonce_le[1] = (uint8_t)(nonce >> 8);
    shake128_inc_absorb(state, nonce_le, sizeof(nonce_le));

    shake128_inc_finalize(state);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init
*
* Description: Runs the incremental SHAKE256 sequence init -> absorb(seed)
*              -> absorb(nonce) -> finalize on the given state. The 16-bit
*              nonce is absorbed as two bytes, low byte first.
*
* Arguments:   - shake256incctx *state: incremental SHAKE256 state
*              - const uint8_t seed[]: seed of CRHBYTES bytes
*              - uint16_t nonce: 2-byte nonce
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    uint8_t nonce_le[2];

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);

    /* Little-endian nonce follows the seed in the absorbed stream. */
    nonce_le[0] = (uint8_t)(nonce & 0xFF);
    nonce_le[1] = (uint8_t)(nonce >> 8);
    shake256_inc_absorb(state, nonce_le, sizeof(nonce_le));

    shake256_inc_finalize(state);
}

+ 23
- 12
crypto_sign/dilithium2/avx2/symmetric.h Переглянути файл

@@ -1,25 +1,36 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
#include "fips202.h"
#include "params.h"
#include "stream.h"
#include <stdint.h>


#include "fips202.h"

#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define stream128_ctx_release(STATE) shake128_ctx_release(STATE)
#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define stream256_ctx_release(STATE) shake256_ctx_release(STATE)
typedef shake128incctx stream128_state;
typedef shake256incctx stream256_state;

void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);

void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state,
const uint8_t seed[CRHBYTES],
uint16_t nonce);

#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

typedef shake128ctx stream128_state;
typedef shake256ctx stream256_state;
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
#define stream128_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE)
#define stream128_release(STATE) shake128_inc_ctx_release(STATE)
#define stream256_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE)
#define stream256_release(STATE) shake256_inc_ctx_release(STATE)


#endif

+ 3
- 4
crypto_sign/dilithium2/clean/LICENSE Переглянути файл

@@ -1,6 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and the random number generator
we are using public-domain code from sources
and by authors listed in comments on top of
the respective files.
For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 3
- 6
crypto_sign/dilithium2/clean/Makefile Переглянути файл

@@ -1,13 +1,10 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libdilithium2_clean.a
HEADERS=api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h
OBJECTS=ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o

SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c
OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o
HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \
reduce.h rounding.h symmetric.h stream.h

CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)



+ 8
- 3
crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake Переглянути файл

@@ -2,8 +2,13 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libdilithium2_clean.lib
OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX
OBJECTS=ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as
# long as there has been a C standard, and we need it for constant-time
# computations. Thus, we disable that spurious warning.
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146

all: $(LIBRARY)

@@ -11,7 +16,7 @@ all: $(LIBRARY)
$(OBJECTS): *.h

$(LIBRARY): $(OBJECTS)
LIB.EXE /NOLOGO /WX /OUT:$@ $**
LIB.EXE /NOLOGO /WX /OUT:$@ $**

clean:
-DEL $(OBJECTS)


+ 10
- 17
crypto_sign/dilithium2/clean/api.h Переглянути файл

@@ -4,26 +4,13 @@
#include <stddef.h>
#include <stdint.h>


#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1184U
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2800U
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2044U

#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1312
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2544
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2420
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2"


int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
@@ -33,6 +20,12 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif

+ 61
- 101
crypto_sign/dilithium2/clean/ntt.c Переглянути файл

@@ -1,138 +1,98 @@
#include <stdint.h>

#include "params.h"
#include "ntt.h"
#include "poly.h"
#include "params.h"
#include "reduce.h"
#include <stdint.h>

/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM2_CLEAN_ntt */
static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas[N] = {
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347,
2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464,
1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231,
4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, 2706023,
95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, 4519302,
5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, 6681150,
6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, 811944,
531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, 4450022,
6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, 7122806,
1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, 3412210,
7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, 7709315,
7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, 5037034,
264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, 4656075, 8371839,
1653064, 5130689, 2389356, 8169440, 759969, 7063561, 189548, 4827145,
3159746, 6529015, 5971092, 8202977, 1315589, 1341330, 1285669, 6795489,
7567685, 6940675, 5361315, 4499357, 4751448, 3839961, 2091667, 3407706,
2316500, 3817976, 5037939, 2244091, 5933984, 4817955, 266997, 2434439,
7144689, 3513181, 4860065, 4621053, 7183191, 5187039, 900702, 1859098,
909542, 819034, 495491, 6767243, 8337157, 7857917, 7725090, 5257975,
2031748, 3207046, 4823422, 7855319, 7611795, 4784579, 342297, 286988,
5942594, 4108315, 3437287, 5038140, 1735879, 203044, 2842341, 2691481,
5790267, 1265009, 4055324, 1247620, 2486353, 1595974, 4613401, 1250494,
2635921, 4832145, 5386378, 1869119, 1903435, 7329447, 7047359, 1237275,
5062207, 6950192, 7929317, 1312455, 3306115, 6417775, 7100756, 1917081,
5834105, 7005614, 1500165, 777191, 2235880, 3406031, 7838005, 5548557,
6709241, 6533464, 5796124, 4656147, 594136, 4603424, 6366809, 2432395,
2454455, 8215696, 1957272, 3369112, 185531, 7173032, 5196991, 162844,
1616392, 3014001, 810149, 1652634, 4686184, 6581310, 5341501, 3523897,
3866901, 269760, 2213111, 7404533, 1717735, 472078, 7953734, 1723600,
6577327, 1910376, 6712985, 7276084, 8119771, 4546524, 5441381, 6144432,
7959518, 6094090, 183443, 7403526, 1612842, 4834730, 7826001, 3919660,
8332111, 7018208, 3937738, 1400424, 7534263, 1976782
};

/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM2_CLEAN_ntt */
static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[N] = {
6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416,
3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036,
3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683,
7908339, 6662682, 975884, 6167306, 8110657, 4513516, 4856520, 3038916,
1799107, 3694233, 6727783, 7570268, 5366416, 6764025, 8217573, 3183426,
1207385, 8194886, 5011305, 6423145, 164721, 5925962, 5948022, 2013608,
3776993, 7786281, 3724270, 2584293, 1846953, 1671176, 2831860, 542412,
4974386, 6144537, 7603226, 6880252, 1374803, 2546312, 6463336, 1279661,
1962642, 5074302, 7067962, 451100, 1430225, 3318210, 7143142, 1333058,
1050970, 6476982, 6511298, 2994039, 3548272, 5744496, 7129923, 3767016,
6784443, 5894064, 7132797, 4325093, 7115408, 2590150, 5688936, 5538076,
8177373, 6644538, 3342277, 4943130, 4272102, 2437823, 8093429, 8038120,
3595838, 768622, 525098, 3556995, 5173371, 6348669, 3122442, 655327,
522500, 43260, 1613174, 7884926, 7561383, 7470875, 6521319, 7479715,
3193378, 1197226, 3759364, 3520352, 4867236, 1235728, 5945978, 8113420,
3562462, 2446433, 6136326, 3342478, 4562441, 6063917, 4972711, 6288750,
4540456, 3628969, 3881060, 3019102, 1439742, 812732, 1584928, 7094748,
7039087, 7064828, 177440, 2409325, 1851402, 5220671, 3553272, 8190869,
1316856, 7620448, 210977, 5991061, 3249728, 6727353, 8578, 3724342,
4421799, 7475901, 1100098, 8336129, 5282425, 7871466, 8115473, 3343383,
1430430, 6527646, 7031341, 381987, 1308169, 22981, 1228525, 671102,
2477047, 411027, 3693493, 2967645, 5665122, 6232521, 983419, 4968207,
8253495, 3632928, 3157330, 3190144, 1000202, 4083598, 6441103, 1257611,
1585221, 6203962, 4904467, 1452451, 3041255, 3677745, 1528703, 3930395,
2797779, 6308525, 2556880, 4479693, 4499374, 7426187, 7849063, 7568473,
4680821, 1600420, 2140649, 4873154, 3821735, 4874723, 1643818, 1699267,
539299, 6031717, 300467, 4840449, 2867647, 4805995, 3043716, 3861115,
4464978, 2537516, 3592148, 1661693, 4849980, 5303092, 8284641, 5674394,
8100412, 4369920, 19422, 6623180, 3277672, 1399561, 3859737, 2118186,
2108549, 5760665, 1119584, 549488, 4794489, 1079900, 7356305, 5654953,
5700314, 5268920, 2884855, 5260684, 2091905, 359251, 6026966, 6554070,
7913949, 876248, 777960, 8143293, 518909, 2608894, 8354570
static const int32_t zetas[N] = {
0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103,
2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549,
-2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005,
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
-3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221,
-1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922,
3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047,
-671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430,
-3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618,
-3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856,
189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330,
1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961,
2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462,
266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378,
900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500,
-655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838,
342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044,
2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974,
-3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970,
-1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642,
-1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031,
-542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993,
-2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385,
-3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107,
-3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078,
-426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893,
-2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687,
-554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782
};

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_ntt
*
* Description: Forward NTT, in-place. No modular reduction is performed after
* additions or subtractions. Hence output coefficients can be up
* to 16*Q larger than the coefficients of the input polynomial.
* Output vector is in bitreversed order.
* additions or subtractions. Output vector is in bitreversed order.
*
* Arguments: - int32_t a[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) {
size_t k, j;
uint32_t zeta, t;
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]) {
unsigned int len, start, j, k;
int32_t zeta, t;

k = 1;
for (size_t len = 128; len > 0; len >>= 1) {
for (size_t start = 0; start < N; start = j + len) {
zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas[k++];
k = 0;
for (len = 128; len > 0; len >>= 1) {
for (start = 0; start < N; start = j + len) {
zeta = zetas[++k];
for (j = start; j < start + len; ++j) {
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]);
p[j + len] = p[j] + 2 * Q - t;
p[j] = p[j] + t;
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]);
a[j + len] = a[j] - t;
a[j] = a[j] + t;
}
}
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont
* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont
*
* Description: Inverse NTT and multiplication by Montgomery factor 2^32.
* In-place. No modular reductions after additions or
* subtractions. Input coefficient need to be smaller than 2*Q.
* Output coefficient are smaller than 2*Q.
* subtractions; input coefficients need to be smaller than
* Q in absolute value. Output coefficients are smaller than Q in
* absolute value.
*
* Arguments: - int32_t a[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]) {
size_t start, len, j, k;
uint32_t t, zeta;
const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q;
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]) {
unsigned int start, len, j, k;
int32_t t, zeta;
const int32_t f = 41978; // mont^2/256

k = 0;
k = 256;
for (len = 1; len < N; len <<= 1) {
for (start = 0; start < N; start = j + len) {
zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[k++];
zeta = -zetas[--k];
for (j = start; j < start + len; ++j) {
t = p[j];
p[j] = t + p[j + len];
p[j + len] = t + 256 * Q - p[j + len];
p[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]);
t = a[j];
a[j] = t + a[j + len];
a[j + len] = t - a[j + len];
a[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]);
}
}
}

for (j = 0; j < N; ++j) {
p[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) f * p[j]);
a[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)f * a[j]);
}
}

+ 3
- 4
crypto_sign/dilithium2/clean/ntt.h Переглянути файл

@@ -1,11 +1,10 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_NTT_H
#define PQCLEAN_DILITHIUM2_CLEAN_NTT_H
#include "params.h"
#include <stdint.h>

#include "params.h"
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]);

void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]);
void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]);
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]);

#endif

+ 108
- 144
crypto_sign/dilithium2/clean/packing.c Переглянути файл

@@ -3,6 +3,7 @@
#include "poly.h"
#include "polyvec.h"


/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_pk
*
@@ -12,17 +13,18 @@
* - const uint8_t rho[]: byte array containing rho
* - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(
uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES],
const uint8_t rho[SEEDBYTES],
const polyveck *t1) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES],
const uint8_t rho[SEEDBYTES],
const polyveck *t1) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
pk[i] = rho[i];
}
pk += SEEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]);
}
}

@@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(
* - const polyveck *t1: pointer to output vector t1
* - uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(
uint8_t rho[SEEDBYTES],
polyveck *t1,
const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES],
polyveck *t1,
const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
rho[i] = pk[i];
}
pk += SEEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sk
*
* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
* Description: Bit-pack secret key. Byte layout: sk = (rho, key, tr, s1, s2, t0).
*
* Arguments: - uint8_t sk[]: output byte array
* - const uint8_t rho[]: byte array containing rho
* - const uint8_t key[]: byte array containing key
* - const uint8_t tr[]: byte array containing tr
* - const uint8_t key[]: byte array containing key
* - const polyveck *t0: pointer to vector t0
* - const polyvecl *s1: pointer to vector s1
* - const polyveck *s2: pointer to vector s2
* - const polyveck *t0: pointer to vector t0
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(
uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t key[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
sk[i] = rho[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < SEEDBYTES; ++i) {
for (i = 0; i < SEEDBYTES; ++i) {
sk[i] = key[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < CRHBYTES; ++i) {
for (i = 0; i < CRHBYTES; ++i) {
sk[i] = tr[i];
}
sk += CRHBYTES;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]);
for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]);
}
sk += L * POLETA_SIZE_PACKED;
sk += L * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]);
}
sk += K * POLETA_SIZE_PACKED;
sk += K * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sk
*
* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0).
* Description: Unpack secret key. Byte layout: sk = (rho, key, tr, s1, s2, t0).
*
* Arguments: - const uint8_t rho[]: output byte array for rho
* - const uint8_t key[]: output byte array for key
* - const uint8_t tr[]: output byte array for tr
* - const uint8_t key[]: output byte array for key
* - const polyveck *t0: pointer to output vector t0
* - const polyvecl *s1: pointer to output vector s1
* - const polyveck *s2: pointer to output vector s2
* - const polyveck *r0: pointer to output vector t0
* - uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(
uint8_t rho[SEEDBYTES],
uint8_t key[SEEDBYTES],
uint8_t tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) {
for (size_t i = 0; i < SEEDBYTES; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
rho[i] = sk[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < SEEDBYTES; ++i) {
for (i = 0; i < SEEDBYTES; ++i) {
key[i] = sk[i];
}
sk += SEEDBYTES;

for (size_t i = 0; i < CRHBYTES; ++i) {
for (i = 0; i < CRHBYTES; ++i) {
tr[i] = sk[i];
}
sk += CRHBYTES;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED);
for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES);
}
sk += L * POLETA_SIZE_PACKED;
sk += L * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES);
}
sk += K * POLETA_SIZE_PACKED;
sk += K * POLYETA_PACKEDBYTES;

for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sig
*
* Description: Bit-pack signature sig = (z, h, c).
* Description: Bit-pack signature sig = (c, z, h).
*
* Arguments: - uint8_t sig[]: output byte array
* - const uint8_t *c: pointer to challenge hash of length SEEDBYTES
* - const polyvecl *z: pointer to vector z
* - const polyveck *h: pointer to hint vector h
* - const poly *c: pointer to challenge polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(
uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
const polyvecl *z,
const polyveck *h,
const poly *c) {
size_t k;
uint64_t signs, mask;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]);
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES],
const uint8_t c[SEEDBYTES],
const polyvecl *z,
const polyveck *h) {
unsigned int i, j, k;

for (i = 0; i < SEEDBYTES; ++i) {
sig[i] = c[i];
}
sig += L * POLZ_SIZE_PACKED;
sig += SEEDBYTES;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]);
}
sig += L * POLYZ_PACKEDBYTES;

/* Encode h */
for (i = 0; i < OMEGA + K; ++i) {
sig[i] = 0;
}

k = 0;
for (size_t i = 0; i < K; ++i) {
for (size_t j = 0; j < N; ++j) {
for (i = 0; i < K; ++i) {
for (j = 0; j < N; ++j) {
if (h->vec[i].coeffs[j] != 0) {
sig[k++] = (uint8_t)j;
sig[k++] = (uint8_t) j;
}
}

sig[OMEGA + i] = (uint8_t)k;
}
while (k < OMEGA) {
sig[k++] = 0;
}
sig += OMEGA + K;

/* Encode c */
signs = 0;
mask = 1;
for (size_t i = 0; i < N / 8; ++i) {
sig[i] = 0;
for (size_t j = 0; j < 8; ++j) {
if (c->coeffs[8 * i + j] != 0) {
sig[i] |= (uint8_t)(1u << j);
if (c->coeffs[8 * i + j] == (Q - 1)) {
signs |= mask;
}
mask <<= 1;
}
}
}
sig += N / 8;
for (size_t i = 0; i < 8; ++i) {
sig[i] = (uint8_t)(signs >> 8u * i);
sig[OMEGA + i] = (uint8_t) k;
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sig
*
* Description: Unpack signature sig = (z, h, c).
* Description: Unpack signature sig = (c, z, h).
*
* Arguments: - polyvecl *z: pointer to output vector z
* Arguments: - uint8_t *c: pointer to output challenge hash (SEEDBYTES bytes)
* - polyvecl *z: pointer to output vector z
* - polyveck *h: pointer to output hint vector h
* - poly *c: pointer to output challenge polynomial
* - const uint8_t sig[]: byte array containing
* bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(
polyvecl *z,
polyveck *h,
poly *c,
const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) {
size_t k;
uint64_t signs;

for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED);
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES],
polyvecl *z,
polyveck *h,
const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) {
unsigned int i, j, k;

for (i = 0; i < SEEDBYTES; ++i) {
c[i] = sig[i];
}
sig += L * POLZ_SIZE_PACKED;
sig += SEEDBYTES;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES);
}
sig += L * POLYZ_PACKEDBYTES;

/* Decode h */
k = 0;
for (size_t i = 0; i < K; ++i) {
for (size_t j = 0; j < N; ++j) {
for (i = 0; i < K; ++i) {
for (j = 0; j < N; ++j) {
h->vec[i].coeffs[j] = 0;
}

@@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(
return 1;
}

for (size_t j = k; j < sig[OMEGA + i]; ++j) {
for (j = k; j < sig[OMEGA + i]; ++j) {
/* Coefficients are ordered for strong unforgeability */
if (j > k && sig[j] <= sig[j - 1]) {
return 1;
@@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(
}

/* Extra indices are zero for strong unforgeability */
for (size_t j = k; j < OMEGA; ++j) {
for (j = k; j < OMEGA; ++j) {
if (sig[j]) {
return 1;
}
}

sig += OMEGA + K;

/* Decode c */
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = 0;
}

signs = 0;
for (size_t i = 0; i < 8; ++i) {
signs |= (uint64_t)sig[N / 8 + i] << 8 * i;
}

/* Extra sign bits are zero for strong unforgeability */
if (signs >> 60) {
return 1;
}

for (size_t i = 0; i < N / 8; ++i) {
for (size_t j = 0; j < 8; ++j) {
if ((sig[i] >> j) & 0x01) {
c->coeffs[8 * i + j] = 1;
c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1));
signs >>= 1;
}
}
}

return 0;
}

+ 23
- 34
crypto_sign/dilithium2/clean/packing.h Переглянути файл

@@ -1,42 +1,31 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PACKING_H
#define PQCLEAN_DILITHIUM2_CLEAN_PACKING_H

#include "api.h"
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);

void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t tr[CRHBYTES],
const uint8_t key[SEEDBYTES],
const polyveck *t0,
const polyvecl *s1,
const polyveck *s2);

void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);

void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]);

void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(
uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES],
const uint8_t rho[SEEDBYTES],
const polyveck *t1);
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(
uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
const uint8_t rho[SEEDBYTES],
const uint8_t key[SEEDBYTES],
const uint8_t tr[SEEDBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(
uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES],
const polyvecl *z, const polyveck *h, const poly *c);
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
uint8_t tr[CRHBYTES],
uint8_t key[SEEDBYTES],
polyveck *t0,
polyvecl *s1,
polyveck *s2,
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]);

void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(
uint8_t rho[SEEDBYTES],
polyveck *t1,
const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]);
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(
uint8_t rho[SEEDBYTES],
uint8_t key[SEEDBYTES],
uint8_t tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk);
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(
polyvecl *z,
polyveck *h,
poly *c,
const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]);
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]);

#endif

+ 26
- 14
crypto_sign/dilithium2/clean/params.h Переглянути файл

@@ -2,28 +2,40 @@
#define PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H



#define SEEDBYTES 32
#define CRHBYTES 48
#define N 256
#define Q 8380417
#define QBITS 23
#define D 14
#define GAMMA1 ((Q - 1)/16)
#define GAMMA2 (GAMMA1/2)
#define ALPHA (2*GAMMA2)
#define D 13
#define ROOT_OF_UNITY 1753

#define K 4
#define L 3
#define ETA 6
#define SETABITS 4
#define BETA 325
#define L 4
#define ETA 2
#define TAU 39
#define BETA 78
#define GAMMA1 (1 << 17)
#define GAMMA2 ((Q-1)/88)
#define OMEGA 80
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2"


#define POLYT1_PACKEDBYTES 320
#define POLYT0_PACKEDBYTES 416
#define POLYVECH_PACKEDBYTES (OMEGA + K)

#define POLYZ_PACKEDBYTES 576

#define POLYW1_PACKEDBYTES 192

#define POLYETA_PACKEDBYTES 96

#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8)
#define POLT0_SIZE_PACKED ((N*D)/8)
#define POLETA_SIZE_PACKED ((N*SETABITS)/8)
#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8)
#define POLW1_SIZE_PACKED ((N*4)/8)
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+ L*POLYETA_PACKEDBYTES \
+ K*POLYETA_PACKEDBYTES \
+ K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif

+ 456
- 293
crypto_sign/dilithium2/clean/poly.c
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 24
- 37
crypto_sign/dilithium2/clean/poly.h Переглянути файл

@@ -1,53 +1,40 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLY_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLY_H

#include <stddef.h>
#include <stdint.h>

#include "params.h"
#include <stdint.h>

typedef struct {
uint32_t coeffs[N];
int32_t coeffs[N];
} poly;

void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a);

void PQCLEAN_DILITHIUM2_CLEAN_poly_add(
poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(
poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a);

void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(
poly *c, const poly *a, const poly *b);

void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(
poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(
poly *a1, poly *a0, const poly *a);
uint32_t PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(
poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(
poly *a, const poly *b, const poly *h);

int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(
const poly *a, uint32_t B);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(
poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(
poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(
poly *a,
const uint8_t seed[CRHBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h);

int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a,
const uint8_t seed[CRHBYTES],
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);


+ 191
- 79
crypto_sign/dilithium2/clean/polyvec.c Переглянути файл

@@ -1,14 +1,65 @@
#include <stddef.h>
#include <stdint.h>

#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand
*
* Description: Implementation of ExpandA. Fills every entry a_{i,j} of the
*              matrix A by calling poly_uniform on the seed rho with the
*              16-bit nonce (i << 8) + j, i.e. the row index in the high
*              byte and the column index in the low byte. The sampling
*              itself (rejection sampling on the SHAKE128(rho|j|i) or
*              AES256CTR(rho,j|i) output stream) is done by poly_uniform.
*
* Arguments:   - polyvecl mat[K]: output matrix
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
    unsigned int row, col;

    for (row = 0; row < K; ++row) {
        polyvecl *cur_row = &mat[row];

        for (col = 0; col < L; ++col) {
            /* nonce = row index in the high byte, column index in the low byte */
            const uint16_t nonce = (uint16_t) ((row << 8) + col);

            PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&cur_row->vec[col], rho, nonce);
        }
    }
}

/*
 * Matrix-vector product, row by row: for each of the K rows of mat, store
 * the result of polyvecl_pointwise_acc_montgomery(row, v) in t->vec[row].
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    unsigned int row = 0;

    while (row < K) {
        PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[row], &mat[row], v);
        ++row;
    }
}

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*
 * Fill all L polynomials of v via poly_uniform_eta, using the same seed and
 * consecutive nonces nonce, nonce + 1, ..., nonce + L - 1 (reduced to 16 bits).
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx;

    for (idx = 0; idx < L; ++idx) {
        /* same 16-bit wrap-around as incrementing a uint16_t counter */
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[idx], seed, (uint16_t) (nonce + idx));
    }
}

/*
 * Fill all L polynomials of v via poly_uniform_gamma1, using the same seed.
 * Polynomial number idx is sampled with the 16-bit nonce L * nonce + idx.
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    unsigned int idx = 0;

    while (idx < L) {
        const uint16_t poly_nonce = (uint16_t) (L * nonce + idx);

        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(&v->vec[idx], seed, poly_nonce);
        ++idx;
    }
}

/*
 * Apply poly_reduce in place to each of the L polynomials of v.
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&v->vec[idx]);
        ++idx;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze
*
@@ -18,7 +69,9 @@
* Arguments: - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) {
for (size_t i = 0; i < L; ++i) {
unsigned int i;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]);
}
}
@@ -33,9 +86,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) {
* - const polyvecl *u: pointer to first summand
* - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(
polyvecl *w, const polyvecl *u, const polyvecl *v) {
for (size_t i = 0; i < L; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
unsigned int i;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
}
}
@@ -49,32 +103,49 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(
* Arguments: - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v) {
for (size_t i = 0; i < L; ++i) {
unsigned int i;

for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]);
}
}

/*
 * Apply poly_invntt_tomont in place to each of the L polynomials of v.
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(&v->vec[idx]);
        ++idx;
    }
}

/*
 * Multiply every polynomial of the length-L vector v by the single
 * polynomial a using poly_pointwise_montgomery; results go to r.
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&r->vec[idx], a, &v->vec[idx]);
        ++idx;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
* resulting vector by 2^{-32} and add (accumulate) polynomials
* in it. Input/output vectors are in NTT domain representation.
* Input coefficients are assumed to be less than 22*Q. Output
* coeffcient are less than 2*L*Q.
*
* Arguments: - poly *w: output polynomial
* - const polyvecl *u: pointer to first input vector
* - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(
poly *w, const polyvecl *u, const polyvecl *v) {
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v) {
unsigned int i;
poly t;

PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(w, &u->vec[0], &v->vec[0]);

for (size_t i = 1; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&t, &u->vec[i], &v->vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]);
for (i = 1; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_add(w, w, &t);
}
}
@@ -83,17 +154,19 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
* Assumes input coefficients to be standard representatives.
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce().
*
* Arguments: - const polyvecl *v: pointer to vector
* - uint32_t B: norm bound
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) {
for (size_t i = 0; i < L; ++i) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) {
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
unsigned int i;

for (i = 0; i < L; ++i) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
}
@@ -105,32 +178,43 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) {
/************ Vectors of polynomials of length K **************/
/**************************************************************/

/*
 * Fill all K polynomials of v via poly_uniform_eta, using the same seed and
 * consecutive nonces nonce, nonce + 1, ..., nonce + K - 1 (reduced to 16 bits).
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int idx;

    for (idx = 0; idx < K; ++idx) {
        /* same 16-bit wrap-around as incrementing a uint16_t counter */
        PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[idx], seed, (uint16_t) (nonce + idx));
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
* to representatives in [0,2*Q[.
* to representatives in [-6283009,6283007].
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&v->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq
*
* Description: For all coefficients of polynomials in vector of length K
* subtract Q if coefficient is bigger than Q.
* add Q if coefficient is negative.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(&v->vec[i]);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(&v->vec[i]);
}
}

@@ -143,7 +227,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) {
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]);
}
}
@@ -158,9 +244,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) {
* - const polyveck *u: pointer to first summand
* - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(
polyveck *w, const polyveck *u, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
}
}
@@ -169,17 +256,17 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub
*
* Description: Subtract vectors of polynomials of length K.
* Assumes coefficients of polynomials in second input vector
* to be less than 2*Q. No modular reduction is performed.
* No modular reduction is performed.
*
* Arguments: - polyveck *w: pointer to output vector
* - const polyveck *u: pointer to first input vector
* - const polyveck *v: pointer to second input vector to be
* subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(
polyveck *w, const polyveck *u, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]);
}
}
@@ -188,12 +275,14 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl
*
* Description: Multiply vector of polynomials of Length K by 2^D without modular
* reduction. Assumes input coefficients to be less than 2^{32-D}.
* reduction. Assumes input coefficients to be less than 2^{31-D}.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(&v->vec[i]);
}
}
@@ -207,13 +296,15 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) {
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
* in vector of length K. Input coefficients need to be less
@@ -221,27 +312,40 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) {
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v) {
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&v->vec[i]);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(&v->vec[i]);
}
}

/*
 * Multiply every polynomial of the length-K vector v by the single
 * polynomial a using poly_pointwise_montgomery; results go to r.
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&r->vec[idx], a, &v->vec[idx]);
        ++idx;
    }
}


/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
* Assumes input coefficients to be standard representatives.
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce().
*
* Arguments: - const polyveck *v: pointer to vector
* - uint32_t B: norm bound
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials are strictly smaller than B and 1
* otherwise.
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) {
for (size_t i = 0; i < K; ++i) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) {
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) {
unsigned int i;

for (i = 0; i < K; ++i) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
}
@@ -253,19 +357,20 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) {
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute a0, a1 such that a mod Q = a1*2^D + a0
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
* standard representatives.
*
* Arguments: - polyveck *v1: pointer to output vector of polynomials with
* coefficients a1
* - polyveck *v0: pointer to output vector of polynomials with
* coefficients Q + a0
* coefficients a0
* - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(
polyveck *v1, polyveck *v0, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
}
}
@@ -274,7 +379,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0
* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
* Assumes coefficients to be standard representatives.
@@ -282,12 +387,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(
* Arguments: - polyveck *v1: pointer to output vector of polynomials with
* coefficients a1
* - polyveck *v0: pointer to output vector of polynomials with
* coefficients Q + a0
* coefficients a0
* - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(
polyveck *v1, polyveck *v0, const polyveck *v) {
for (size_t i = 0; i < K; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]);
}
}
@@ -303,15 +409,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(
*
* Returns number of 1 bits.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(
polyveck *h,
const polyveck *v0,
const polyveck *v1) {
uint32_t s = 0;

for (size_t i = 0; i < K; ++i) {
s += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(
&h->vec[i], &v0->vec[i], &v1->vec[i]);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1) {
unsigned int i, s = 0;

for (i = 0; i < K; ++i) {
s += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]);
}

return s;
@@ -324,13 +428,21 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(
*
* Arguments: - polyveck *w: pointer to output vector of polynomials with
* corrected high bits
* - const polyveck *v: pointer to input vector
* - const polyveck *u: pointer to input vector
* - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(
polyveck *w, const polyveck *v, const polyveck *h) {
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(
&w->vec[i], &v->vec[i], &h->vec[i]);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]);
}
}

/*
 * Pack the K polynomials of w1 back to back into r with polyw1_pack;
 * polynomial idx occupies the POLYW1_PACKEDBYTES bytes starting at
 * offset idx * POLYW1_PACKEDBYTES.
 */
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) {
    unsigned int idx;
    uint8_t *out = r;

    for (idx = 0; idx < K; ++idx) {
        PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(out, &w1->vec[idx]);
        out += POLYW1_PACKEDBYTES;
    }
}

+ 36
- 26
crypto_sign/dilithium2/clean/polyvec.h Переглянути файл

@@ -1,25 +1,33 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H

#include <stdint.h>

#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L: a fixed-size array of L poly
 * elements wrapped in a struct. */
typedef struct {
poly vec[L];
} polyvecl;

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(
poly *w, const polyvecl *u, const polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v);

int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B);

int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B);



@@ -28,31 +36,33 @@ typedef struct {
poly vec[K];
} polyveck;

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(
polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(
polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v);

int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(
const polyveck *v, uint32_t B);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(
polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(
polyveck *v1, polyveck *v0, const polyveck *v);
uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(
polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(
polyveck *w, const polyveck *v, const polyveck *h);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif

+ 25
- 31
crypto_sign/dilithium2/clean/reduce.c Переглянути файл

@@ -1,60 +1,54 @@
#include <stdint.h>

#include "params.h"
#include "reduce.h"
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce
*
* Description: For finite field element a with 0 <= a <= Q*2^32,
* compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q.
* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31,
* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
*
* Arguments: - uint64_t: finite field element a
* Arguments: - int64_t: finite field element a
*
* Returns r.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a) {
uint64_t t;
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a) {
int32_t t;

t = a * QINV;
t &= (1ULL << 32) - 1;
t *= Q;
t = a + t;
t >>= 32;
return (uint32_t)t;
t = (int32_t)((uint64_t)a * (uint64_t)QINV);
t = (a - (int64_t)t * Q) >> 32;
return t;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_reduce32
*
* Description: For finite field element a, compute r \equiv a (mod Q)
* such that 0 <= r < 2*Q.
* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1,
* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007.
*
* Arguments: - uint32_t: finite field element a
* Arguments: - int32_t: finite field element a
*
* Returns r.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a) {
uint32_t t;
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a) {
int32_t t;

t = a & 0x7FFFFF;
a >>= 23;
t += (a << 13) - a;
t = (a + (1 << 22)) >> 23;
t = a - t * Q;
return t;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_csubq
* Name: PQCLEAN_DILITHIUM2_CLEAN_caddq
*
* Description: Subtract Q if input coefficient is bigger than Q.
* Description: Add Q if input coefficient is negative.
*
* Arguments: - uint32_t: finite field element a
* Arguments: - int32_t: finite field element a
*
* Returns r.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) {
a -= Q;
a += ((int32_t)a >> 31) & Q;
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a) {
a += (a >> 31) & Q;
return a;
}

@@ -62,14 +56,14 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) {
* Name: PQCLEAN_DILITHIUM2_CLEAN_freeze
*
* Description: For finite field element a, compute standard
* representative r = a mod Q.
* representative r = a mod^+ Q.
*
* Arguments: - uint32_t: finite field element a
* Arguments: - int32_t: finite field element a
*
* Returns r.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(uint32_t a) {
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a) {
a = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a);
a = PQCLEAN_DILITHIUM2_CLEAN_csubq(a);
a = PQCLEAN_DILITHIUM2_CLEAN_caddq(a);
return a;
}

+ 7
- 11
crypto_sign/dilithium2/clean/reduce.h Переглянути файл

@@ -1,21 +1,17 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H
#define PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H
#include "params.h"
#include <stdint.h>

#define MONT 4193792U // 2^32 % Q
#define QINV 4236238847U // -q^(-1) mod 2^32
#define MONT (-4186625) // 2^32 % Q
#define QINV 58728449 // q^(-1) mod 2^32

/* a <= Q*2^32 => r < 2*Q */
uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a);
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a);

/* r < 2*Q */
uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a);
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a);

/* a < 2*Q => r < Q */
uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a);
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a);

/* r < Q */
uint32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(uint32_t a);
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a);

#endif

+ 43
- 62
crypto_sign/dilithium2/clean/rounding.c Переглянути файл

@@ -1,86 +1,70 @@
#include "params.h"
#include "rounding.h"
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_power2round
*
* Description: For finite field element a, compute a0, a1 such that
* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
* Assumes a to be standard representative.
*
* Arguments: - uint32_t a: input element
* - uint32_t *a0: pointer to output element Q + a0
* Arguments: - int32_t a: input element
* - int32_t *a0: pointer to output element a0
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0) {
uint32_t t;
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a) {
int32_t a1;

/* Centralized remainder mod 2^D */
t = a & ((1U << D) - 1);
t -= (1U << (D - 1)) + 1;
t += ((uint32_t)((int32_t)t >> 31) & (1 << D));
t -= (1U << (D - 1)) - 1;
*a0 = Q + t;
a = (a - t) >> D;
return a;
a1 = (a + (1 << (D - 1)) - 1) >> D;
*a0 = a - (a1 << D);
return a1;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_decompose
*
* Description: For finite field element a, compute high and low bits a0, a1 such
* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard
* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
* representative.
*
* Arguments: - uint32_t a: input element
* - uint32_t *a0: pointer to output element Q + a0
* Arguments: - int32_t a: input element
* - int32_t *a0: pointer to output element a0
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0) {
int32_t t, u;
int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a) {
int32_t a1;

/* Centralized remainder mod ALPHA */
t = a & 0x7FFFFu;
t += (int32_t)((a >> 19u) << 9u);
t -= ALPHA / 2 + 1;
t += (t >> 31) & ALPHA;
t -= ALPHA / 2 - 1;
a -= (uint32_t)t;
a1 = (a + 127) >> 7;
a1 = (a1 * 11275 + (1 << 23)) >> 24;
a1 ^= ((43 - a1) >> 31) & a1;

/* Divide by ALPHA (possible to avoid) */
u = (int32_t)(a - 1);
u >>= 31;
a = (a >> 19) + 1;
a -= u & 1;

/* Border case */
*a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u));
a &= 0xFu;
return a;
*a0 = a - a1 * 2 * GAMMA2;
*a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q;
return a1;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_make_hint
*
* Description: Compute hint bit indicating whether the low bits of the
* input element overflow into the high bits. Inputs assumed to be
* standard representatives.
* input element overflow into the high bits.
*
* Arguments: - uint32_t a0: low bits of input element
* - uint32_t a1: high bits of input element
* Arguments: - int32_t a0: low bits of input element
* - int32_t a1: high bits of input element
*
* Returns 1 if high bits of a and b differ and 0 otherwise.
* Returns 1 if overflow.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) {
if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) {
return 0;
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1) {
if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) {
return 1;
}

return 1;
return 0;
}

/*************************************************
@@ -88,30 +72,27 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) {
*
* Description: Correct high bits according to hint.
*
* Arguments: - uint32_t a: input element
* Arguments: - int32_t a: input element
* - unsigned int hint: hint bit
*
* Returns corrected high bits.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint) {
uint32_t a0, a1;
int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint) {
int32_t a0, a1;

a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(a, &a0);
a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(&a0, a);
if (hint == 0) {
return a1;
}
if (a0 > Q) {
return (a1 + 1) & 0xF;
}

return (a1 - 1) & 0xF;

/* If PQCLEAN_DILITHIUM2_CLEAN_decompose does not divide out ALPHA:
if(hint == 0)
return a1;
else if(a0 > Q)
return (a1 + ALPHA) % (Q - 1);
else
return (a1 - ALPHA) % (Q - 1);
*/
if (a0 > 0) {
if (a1 == 43) {
return 0;
}
return a1 + 1;
}
if (a1 == 0) {
return 43;
}
return a1 - 1;
}

+ 8
- 5
crypto_sign/dilithium2/clean/rounding.h Переглянути файл

@@ -1,11 +1,14 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H
#define PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H
#include "params.h"
#include <stdint.h>

uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0);
uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0);
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1);
uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint);
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a);

int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a);

unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1);

int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint);

#endif

+ 138
- 222
crypto_sign/dilithium2/clean/sign.c Переглянути файл

@@ -1,6 +1,3 @@
#include <stdint.h>
#include <string.h>

#include "fips202.h"
#include "packing.h"
#include "params.h"
@@ -9,84 +6,7 @@
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_expand_mat
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
* random coefficients a_{i,j} by performing rejection
* sampling on the output stream of SHAKE128(rho|i|j).
*
* Arguments: - polyvecl mat[K]: output matrix
* - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
for (size_t i = 0; i < K; ++i) {
for (size_t j = 0; j < L; ++j) {
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t)((i << 8) + j));
}
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_challenge
*
* Description: Implementation of H. Samples polynomial with 60 nonzero
* coefficients in {-1,1} using the output stream of
* SHAKE256(mu|w1). A coefficient of -1 is stored as Q - 1.
*
* Arguments: - poly *c: pointer to output polynomial
* - const uint8_t mu[]: byte array containing mu
* - const polyveck *w1: pointer to vector w1
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c,
const uint8_t mu[CRHBYTES],
const polyveck *w1) {
uint64_t signs;
uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
uint8_t outbuf[SHAKE256_RATE];
shake256ctx state;
uint8_t b;
size_t pos;

/* Build the hash input: mu followed by the K packed polynomials of w1 */
for (size_t i = 0; i < CRHBYTES; ++i) {
inbuf[i] = mu[i];
}
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]);
}

/* Absorb the input and squeeze the first output block */
shake256_absorb(&state, inbuf, sizeof(inbuf));
shake256_squeezeblocks(outbuf, 1, &state);

/* The first 8 output bytes, read little-endian, supply the sign bits */
signs = 0;
for (size_t i = 0; i < 8; ++i) {
signs |= (uint64_t)outbuf[i] << 8 * i;
}

/* Position sampling continues after the 8 sign bytes */
pos = 8;

for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = 0;
}

/* 60 iterations (i = 196..255), each placing one nonzero coefficient */
for (size_t i = 196; i < 256; ++i) {
/* Rejection-sample a byte b <= i, squeezing a new block when the
 * current one is used up */
do {
if (pos >= SHAKE256_RATE) {
shake256_squeezeblocks(outbuf, 1, &state);
pos = 0;
}

b = outbuf[pos++];
} while (b > i);

/* Move whatever is at position b to position i, then put the new
 * coefficient at b */
c->coeffs[i] = c->coeffs[b];
c->coeffs[b] = 1;
/* If the current sign bit is set, XOR with 1 ^ (Q - 1) turns the 1
 * into Q - 1; otherwise the mask is 0 and the value stays 1 */
c->coeffs[b] ^= -((int32_t)signs & 1) & (1 ^ (Q - 1));
signs >>= 1;
}
shake256_ctx_release(&state);
}
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair
@@ -94,9 +14,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c,
* Description: Generates public and private key.
*
* Arguments: - uint8_t *pk: pointer to output public key (allocated
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes)
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (allocated
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes)
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
@@ -104,48 +24,42 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
uint8_t seedbuf[3 * SEEDBYTES];
uint8_t tr[CRHBYTES];
const uint8_t *rho, *rhoprime, *key;
uint16_t nonce = 0;
polyvecl mat[K];
polyvecl s1, s1hat;
polyveck s2, t, t1, t0;
polyveck s2, t1, t0;

/* Expand 32 bytes of randomness into rho, rhoprime and key */
randombytes(seedbuf, 3 * SEEDBYTES);
/* Get randomness for rho, rhoprime and key */
randombytes(seedbuf, SEEDBYTES);
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
rho = seedbuf;
rhoprime = seedbuf + SEEDBYTES;
key = seedbuf + 2 * SEEDBYTES;

/* Expand matrix */
PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho);

/* Sample short vectors s1 and s2 */
for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&s1.vec[i], rhoprime, nonce++);
}
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&s2.vec[i], rhoprime, nonce++);
}
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L);

/* Matrix-vector multiplication */
s1hat = s1;
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1hat);
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat);
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&t.vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&t.vec[i]);
}
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&t1);

/* Add error vector s2 */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t, &t, &s2);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t1, &t1, &s2);

/* Extract t1 and write public key */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(&t);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t1);
PQCLEAN_DILITHIUM2_CLEAN_pack_pk(pk, rho, &t1);

/* Compute CRH(rho, t1) and write secret key */
crh(tr, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES);
PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0);
PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2);

return 0;
}
@@ -153,44 +67,41 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature
*
* Description: Compute signed message.
* Description: Computes signature.
*
* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES
* of len)
* - size_t *smlen: pointer to output length of signed message
* (should be PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES)
* - uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - uint8_t *sk: pointer to bit-packed secret key
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES)
* - size_t *siglen: pointer to output length of signature
* - uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *msg, size_t mlen,
const uint8_t *sk) {
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig,
size_t *siglen,
const uint8_t *m,
size_t mlen,
const uint8_t *sk) {
unsigned int n;
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint32_t n;
uint16_t nonce = 0;
poly c, chat;
polyvecl mat[K], s1, y, yhat, z;
polyveck t0, s2, w, w1, w0;
polyveck h, cs2, ct0;
polyvecl mat[K], s1, y, z;
polyveck t0, s2, w1, w0, h;
poly cp;
shake256incctx state;

rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);
PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);

// use incremental hash API instead of copying around buffers
/* Compute CRH(tr, msg) */
shake256incctx state;
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, msg, mlen);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);
@@ -198,76 +109,71 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t0);

rej:
/* Sample intermediate vector y */
for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(&y.vec[i], rhoprime, nonce++);
}
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++);

/* Matrix-vector multiplication */
yhat = y;
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&yhat);
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat);
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&w.vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&w.vec[i]);
}
z = y;
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1);

/* Decompose w and call the random oracle */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&w);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w);
PQCLEAN_DILITHIUM2_CLEAN_challenge(&c, mu, &w1);
chat = c;
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&chat);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(sig, &w1);

/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&cs2.vec[i]);
}
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &cs2);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(&w0);
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
goto rej;
}
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(sig, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, sig);
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp);

/* Compute z, reject if it reveals secret */
for (size_t i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&z.vec[i]);
}
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(&z);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(&z, &z, &y);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(&z);
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(&z);
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
goto rej;
}

/* Compute hints for w1 */
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&ct0.vec[i]);
/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w0);
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
goto rej;
}

PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&ct0);
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&ct0, GAMMA2)) {
/* Compute hints for w1 */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&h);
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&h, GAMMA2)) {
goto rej;
}

PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &ct0);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&w0);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &h);
n = PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(&h, &w0, &w1);
if (n > OMEGA) {
goto rej;
}

/* Write signature */
PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, &z, &h, &c);
PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, sig, &z, &h);
*siglen = PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;
return 0;
}
@@ -281,53 +187,63 @@ rej:
* array with PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen bytes),
* can be equal to m
* - size_t *smlen: pointer to output length of signed
* message
* message
* - const uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk) {
int rc;
memmove(sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, m, mlen);
rc = PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk);
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm,
size_t *smlen,
const uint8_t *m,
size_t mlen,
const uint8_t *sk) {
size_t i;

for (i = 0; i < mlen; ++i) {
sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
}
PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, mlen, sk);
*smlen += mlen;
return rc;
return 0;
}


/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify
*
* Description: Verify signed message.
* Description: Verifies signature.
*
* Arguments: - uint8_t *sig: signature
* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES)
* - uint8_t *m: pointer to message
* - size_t *mlen: pointer to output length of message
* - uint8_t *pk: pointer to bit-packed public key
* Arguments: - const uint8_t *sig: pointer to input signature
* - size_t siglen: length of signature
* - const uint8_t *m: pointer to message
* - size_t mlen: length of message
* - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
* Returns 0 if signature could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk) {
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig,
size_t siglen,
const uint8_t *m,
size_t mlen,
const uint8_t *pk) {
unsigned int i;
uint8_t buf[K * POLYW1_PACKEDBYTES];
uint8_t rho[SEEDBYTES];
uint8_t mu[CRHBYTES];
poly c, chat, cp;
uint8_t c[SEEDBYTES];
uint8_t c2[SEEDBYTES];
poly cp;
polyvecl mat[K], z;
polyveck t1, w1, h, tmp1, tmp2;
polyveck t1, w1, h;
shake256incctx state;

if (siglen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) {
if (siglen != PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) {
return -1;
}

PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(rho, &t1, pk);
if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(&z, &h, &c, sig)) {
if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(c, &z, &h, sig)) {
return -1;
}
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
@@ -336,8 +252,6 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(

/* Compute CRH(CRH(rho, t1), msg) */
crh(mu, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES);

shake256incctx state;
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
@@ -346,38 +260,39 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
shake256_inc_ctx_release(&state);

/* Matrix-vector multiplication; compute Az - c2^dt1 */
PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho);
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, c);
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho);

PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z);
for (size_t i = 0; i < K ; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z);
}
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);

chat = c;
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&chat);
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(&t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t1);
for (size_t i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]);
}
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);

PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&tmp1, &tmp1, &tmp2);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&tmp1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(&tmp1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w1, &w1, &t1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1);

/* Reconstruct w1 */
PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&tmp1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &tmp1, &h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &w1, &h);
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(buf, &w1);

/* Call random oracle and verify challenge */
PQCLEAN_DILITHIUM2_CLEAN_challenge(&cp, mu, &w1);
for (size_t i = 0; i < N; ++i) {
if (c.coeffs[i] != cp.coeffs[i]) {
/* Call random oracle and verify challenge */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(c2, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
for (i = 0; i < SEEDBYTES; ++i) {
if (c[i] != c2[i]) {
return -1;
}
}

// All good
return 0;
}

@@ -387,7 +302,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
* Description: Verify signed message.
*
* Arguments: - uint8_t *m: pointer to output message (allocated
* array with smlen bytes), can be equal to sm
* array with smlen bytes), can be equal to sm
* - size_t *mlen: pointer to output length of message
* - const uint8_t *sm: pointer to signed message
* - size_t smlen: length of signed message
@@ -395,33 +310,34 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk) {
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m,
size_t *mlen,
const uint8_t *sm,
size_t smlen,
const uint8_t *pk) {
size_t i;

if (smlen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) {
goto badsig;
}
*mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;

if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES,
sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) {
*mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;
if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) {
goto badsig;
} else {
/* All good, copy msg, return 0 */
for (size_t i = 0; i < *mlen; ++i) {
for (i = 0; i < *mlen; ++i) {
m[i] = sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + i];
}
return 0;
}

/* Signature verification failed */
badsig:
/* Signature verification failed */
*mlen = (size_t) -1;
for (size_t i = 0; i < smlen; ++i) {
for (i = 0; i < smlen; ++i) {
m[i] = 0;
}

return -1;
}


+ 22
- 5
crypto_sign/dilithium2/clean/sign.h Переглянути файл

@@ -1,12 +1,29 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SIGN_H
#define PQCLEAN_DILITHIUM2_CLEAN_SIGN_H

#include "api.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES],
const polyveck *w1);
#endif

+ 0
- 26
crypto_sign/dilithium2/clean/stream.c Переглянути файл

@@ -1,26 +0,0 @@
#include "stream.h"

#include <string.h>

/*
 * Start a SHAKE128 stream: absorbs seed || nonce, where the 16-bit nonce is
 * appended as two bytes, low byte first.
 *
 * state: SHAKE128 context that receives the absorbed input
 * seed:  SEEDBYTES bytes of seed
 * nonce: 16-bit domain separator appended to the seed
 */
void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(
    shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    uint8_t input[SEEDBYTES + 2];
    uint8_t *nonce_bytes = input + SEEDBYTES;

    memcpy(input, seed, SEEDBYTES);
    nonce_bytes[0] = (uint8_t)(nonce & 0xFF);
    nonce_bytes[1] = (uint8_t)(nonce >> 8);

    shake128_absorb(state, input, sizeof input);
}


/*
 * Start a SHAKE256 stream: absorbs seed || nonce, where the 16-bit nonce is
 * appended as two bytes, low byte first.
 *
 * state: SHAKE256 context that receives the absorbed input
 * seed:  CRHBYTES bytes of seed
 * nonce: 16-bit domain separator appended to the seed
 */
void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(
    shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    uint8_t input[CRHBYTES + 2];
    uint8_t *nonce_bytes = input + CRHBYTES;

    memcpy(input, seed, CRHBYTES);
    nonce_bytes[0] = (uint8_t)(nonce & 0xFF);
    nonce_bytes[1] = (uint8_t)(nonce >> 8);

    shake256_absorb(state, input, sizeof input);
}

+ 0
- 15
crypto_sign/dilithium2/clean/stream.h Переглянути файл

@@ -1,15 +0,0 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_STREAM_H
#define PQCLEAN_DILITHIUM2_CLEAN_STREAM_H

#include <stdint.h>

#include "fips202.h"
#include "params.h"

void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce);

#endif

+ 26
- 0
crypto_sign/dilithium2/clean/symmetric-shake.c Переглянути файл

@@ -0,0 +1,26 @@
#include "fips202.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init
*
* Description: Sets up an incremental SHAKE128 state over seed || nonce:
*              initialises the state, absorbs the seed, absorbs the nonce
*              as two bytes (low byte first) and finalises the absorb phase.
*
* Arguments:   - shake128incctx *state: incremental SHAKE128 state
*              - const uint8_t seed[]: seed of SEEDBYTES bytes
*              - uint16_t nonce: 16-bit nonce appended to the seed
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = {
        (uint8_t)(nonce & 0xFF),
        (uint8_t)(nonce >> 8)
    };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake128_inc_finalize(state);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init
*
* Description: Sets up an incremental SHAKE256 state over seed || nonce:
*              initialises the state, absorbs the seed, absorbs the nonce
*              as two bytes (low byte first) and finalises the absorb phase.
*
* Arguments:   - shake256incctx *state: incremental SHAKE256 state
*              - const uint8_t seed[]: seed of CRHBYTES bytes
*              - uint16_t nonce: 16-bit nonce appended to the seed
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = {
        (uint8_t)(nonce & 0xFF),
        (uint8_t)(nonce >> 8)
    };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, sizeof nonce_le);
    shake256_inc_finalize(state);
}

+ 23
- 12
crypto_sign/dilithium2/clean/symmetric.h Переглянути файл

@@ -1,25 +1,36 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H
#include "fips202.h"
#include "params.h"
#include "stream.h"
#include <stdint.h>


#include "fips202.h"

#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define stream128_ctx_release(STATE) shake128_ctx_release(STATE)
#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define stream256_ctx_release(STATE) shake256_ctx_release(STATE)
typedef shake128incctx stream128_state;
typedef shake256incctx stream256_state;

void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state,
const uint8_t seed[SEEDBYTES],
uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state,
const uint8_t seed[CRHBYTES],
uint16_t nonce);

#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

typedef shake128ctx stream128_state;
typedef shake256ctx stream256_state;
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
#define stream128_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE)
#define stream128_release(STATE) shake128_inc_ctx_release(STATE)
#define stream256_init(STATE, SEED, NONCE) \
PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE)
#define stream256_release(STATE) shake256_inc_ctx_release(STATE)


#endif

+ 31
- 0
crypto_sign/dilithium2aes/META.yml Переглянути файл

@@ -0,0 +1,31 @@
name: Dilithium2-AES
type: signature
claimed-nist-level: 2
length-public-key: 1312
length-secret-key: 2544
length-signature: 2420
nistkat-sha256: 23972a0a5f1f32781aa11fa57d9994ddd53c1bbcc732967f61d9d9aaef01c492
testvectors-sha256: 22e68fe8bf781dee949a4297f9ba44d1c350a1d88bae03117cfb2ca494c6e604
principal-submitters:
- Vadim Lyubashevsky
auxiliary-submitters:
- Léo Ducas
- Eike Kiltz
- Tancrède Lepoint
- Peter Schwabe
- Gregor Seiler
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
- name: avx2
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- aes
- avx2
- popcnt

+ 5
- 0
crypto_sign/dilithium2aes/avx2/LICENSE Переглянути файл

@@ -0,0 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 23
- 0
crypto_sign/dilithium2aes/avx2/Makefile Переглянути файл

@@ -0,0 +1,23 @@
# This Makefile can be used with GNU Make or BSD Make

# Static library produced by this directory.
LIB=libdilithium2aes_avx2.a
# Every object is rebuilt when any of these headers changes.
HEADERS=aes256ctr.h align.h api.h cdecl.h consts.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc
# Objects built from the C sources and the assembly (.S) sources.
OBJECTS=aes256ctr.o consts.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o invntt.o ntt.o pointwise.o shuffle.o
# -mavx2/-maes/-mpopcnt enable the instruction sets the sources use;
# EXTRAFLAGS lets the caller append further flags.
CFLAGS=-mavx2 -maes -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \
       -Wmissing-prototypes -Wredundant-decls -std=c99 \
       -I../../../common $(EXTRAFLAGS)

# all and clean name actions, not files: declare them phony so that a stray
# file called "all" or "clean" cannot make them look up to date.
.PHONY: all clean

all: $(LIB)

%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.S $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
	$(AR) -r $@ $(OBJECTS)

# Remove only what this Makefile builds.
clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)

+ 142
- 0
crypto_sign/dilithium2aes/avx2/aes256ctr.c Переглянути файл

@@ -0,0 +1,142 @@
#include "aes256ctr.h"
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
/* Based heavily on public-domain code by Romain Dolbeau
* Different handling of nonce+counter than original version using
* separated 64-bit nonce and internal 64-bit counter, starting from zero
* Public Domain */


/*
 * Produce four consecutive 16-byte AES-256 counter-mode blocks (64 bytes)
 * and advance the counter by 4.
 *
 * out:   receives the 64 output bytes (written with unaligned stores)
 * n:     128-bit nonce/counter word; read and written with aligned
 *        load/store, its upper 64-bit lane is increased by 4
 * rkeys: round keys; entries 0..14 are read
 */
static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) {
__m128i f, f0, f1, f2, f3;
/* Byte shuffle mask: bytes 0..7 stay in place, bytes 8..15 are reversed,
 * so the upper 64-bit lane (the counter) ends up byte-swapped */
const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);

/* Load current counter value */
f = _mm_load_si128(n);

/* Increase counter in 4 consecutive blocks */
/* (_mm_set_epi64x(k, 0) adds k to the upper 64-bit lane only) */
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx);
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx);
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx);
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx);

/* Write counter for next iteration, increased by 4 */
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0)));

/* Actual AES encryption, 4x interleaved */
/* Initial round: XOR with round key 0 */
f = _mm_load_si128(&rkeys[0]);
f0 = _mm_xor_si128(f0, f);
f1 = _mm_xor_si128(f1, f);
f2 = _mm_xor_si128(f2, f);
f3 = _mm_xor_si128(f3, f);

/* Rounds 1..13 with round keys 1..13 */
for (int i = 1; i < 14; i++) {
f = _mm_load_si128(&rkeys[i]);
f0 = _mm_aesenc_si128(f0, f);
f1 = _mm_aesenc_si128(f1, f);
f2 = _mm_aesenc_si128(f2, f);
f3 = _mm_aesenc_si128(f3, f);
}

/* Final round with round key 14 */
f = _mm_load_si128(&rkeys[14]);
f0 = _mm_aesenclast_si128(f0, f);
f1 = _mm_aesenclast_si128(f1, f);
f2 = _mm_aesenclast_si128(f2, f);
f3 = _mm_aesenclast_si128(f3, f);

/* Write results */
_mm_storeu_si128((__m128i *)(out + 0), f0);
_mm_storeu_si128((__m128i *)(out + 16), f1);
_mm_storeu_si128((__m128i *)(out + 32), f2);
_mm_storeu_si128((__m128i *)(out + 48), f3);
}

/*
 * Initialise an AES-256 counter-mode state.
 *
 * Expands the 32-byte key into 15 round keys (state->rkeys[0..14]) with
 * AESKEYGENASSIST and places the nonce in the low 64 bits of state->n; the
 * upper 64 bits of state->n are zero on return.
 *
 * state: context to initialise
 * key:   32-byte AES-256 key (read with unaligned loads)
 * nonce: 64-bit nonce
 */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) {
    __m128i key0, key1, temp0, temp1, temp2, temp4;
    int idx = 0;

    key0 = _mm_loadu_si128((const __m128i *)(key + 0));
    key1 = _mm_loadu_si128((const __m128i *)(key + 16));
    /* Loads the nonce into the low 64 bits and zeroes the upper 64 bits */
    state->n = _mm_loadl_epi64((const __m128i *)&nonce);

    /* The first two round keys are the two key halves themselves; key1 is
     * stored by the first BLOCK1 below */
    state->rkeys[idx++] = key0;
    temp0 = key0;
    temp2 = key1;
    temp4 = _mm_setzero_si128();

    /* _mm_shuffle_ps works on float vectors. The _mm_cast* intrinsics
     * reinterpret the bits without conversion and, unlike C-style casts
     * between __m128 and __m128i, are accepted by every compiler that
     * provides <immintrin.h>. */
#define BLOCK1(IMM)                                                                                     \
    temp1 = _mm_aeskeygenassist_si128(temp2, IMM);                                                      \
    state->rkeys[idx++] = temp2;                                                                        \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp0), 0x10));   \
    temp0 = _mm_xor_si128(temp0, temp4);                                                                \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp0), 0x8c));   \
    temp0 = _mm_xor_si128(temp0, temp4);                                                                \
    temp1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp1), _mm_castsi128_ps(temp1), 0xff));   \
    temp0 = _mm_xor_si128(temp0, temp1)

#define BLOCK2(IMM)                                                                                     \
    temp1 = _mm_aeskeygenassist_si128(temp0, IMM);                                                      \
    state->rkeys[idx++] = temp0;                                                                        \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp2), 0x10));   \
    temp2 = _mm_xor_si128(temp2, temp4);                                                                \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp2), 0x8c));   \
    temp2 = _mm_xor_si128(temp2, temp4);                                                                \
    temp1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp1), _mm_castsi128_ps(temp1), 0xaa));   \
    temp2 = _mm_xor_si128(temp2, temp1)

    /* Each BLOCK1/BLOCK2 stores one round key and derives the next one;
     * the immediate is the round constant passed to AESKEYGENASSIST */
    BLOCK1(0x01);
    BLOCK2(0x01);

    BLOCK1(0x02);
    BLOCK2(0x02);

    BLOCK1(0x04);
    BLOCK2(0x04);

    BLOCK1(0x08);
    BLOCK2(0x08);

    BLOCK1(0x10);
    BLOCK2(0x10);

    BLOCK1(0x20);
    BLOCK2(0x20);

    BLOCK1(0x40);
    /* Last round key (index 14) */
    state->rkeys[idx++] = temp0;

    /* The helper macros are only meaningful inside this function */
#undef BLOCK1
#undef BLOCK2
}

/*
 * Write nblocks blocks of 64 bytes each to out, advancing the counter held
 * in state after every block.
 *
 * out:     output buffer of at least 64 * nblocks bytes
 * nblocks: number of 64-byte blocks to produce
 * state:   initialised AES-256-CTR context
 */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out,
        size_t nblocks,
        aes256ctr_ctx *state) {
    uint8_t *dst = out;
    size_t remaining;

    for (remaining = nblocks; remaining != 0; --remaining) {
        aesni_encrypt4(dst, &state->n, state->rkeys);
        dst += 64;
    }
}

/*
 * One-shot PRF: fill out with outlen bytes of AES-256 counter-mode output
 * for the given seed (used as key) and nonce.
 *
 * out:    output buffer of outlen bytes
 * outlen: number of bytes requested; need not be a multiple of 64
 * seed:   32-byte key
 * nonce:  64-bit nonce
 */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_prf(uint8_t *out,
        size_t outlen,
        const uint8_t seed[32],
        uint64_t nonce) {
    aes256ctr_ctx ctx;
    uint8_t last[64];
    size_t whole_blocks, tail, k;

    PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&ctx, seed, nonce);

    /* Full 64-byte blocks go straight into the caller's buffer */
    for (whole_blocks = outlen / 64; whole_blocks != 0; --whole_blocks) {
        aesni_encrypt4(out, &ctx.n, ctx.rkeys);
        out += 64;
    }

    /* A trailing partial block is generated into scratch space and only the
     * requested prefix is copied out */
    tail = outlen % 64;
    if (tail != 0) {
        aesni_encrypt4(last, &ctx.n, ctx.rkeys);
        for (k = 0; k < tail; k++) {
            out[k] = last[k];
        }
    }
}

+ 29
- 0
crypto_sign/dilithium2aes/avx2/aes256ctr.h Переглянути файл

@@ -0,0 +1,29 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_AES256CTR_H
#define PQCLEAN_DILITHIUM2AES_AVX2_AES256CTR_H

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>


/* Number of bytes in one squeezeblocks block (four 16-byte AES blocks) */
#define AES256CTR_BLOCKBYTES 64

/*
 * AES-256 counter-mode state.
 *
 * rkeys: round keys; aes256ctr_init fills entries 0..14, entry 15 is unused
 * n:     nonce in the low 64 bits, block counter in the upper 64 bits
 */
typedef struct {
__m128i rkeys[16];
__m128i n;
} aes256ctr_ctx;

/* Expand the 32-byte key into state->rkeys and start the stream for the
 * given 64-bit nonce with the counter at zero. */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(aes256ctr_ctx *state,
const uint8_t key[32],
uint64_t nonce);

/* Write nblocks blocks of AES256CTR_BLOCKBYTES bytes to out and advance the
 * counter in state accordingly. */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out,
size_t nblocks,
aes256ctr_ctx *state);

/* One-shot variant: initialise a temporary state from seed and nonce and
 * write exactly outlen bytes of output to out. */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_prf(uint8_t *out,
size_t outlen,
const uint8_t seed[32],
uint64_t nonce);

#endif

+ 19
- 0
crypto_sign/dilithium2aes/avx2/align.h Переглянути файл

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_ALIGN_H
#define PQCLEAN_DILITHIUM2AES_AVX2_ALIGN_H

#include <immintrin.h>
#include <stdint.h>

/*
 * Anonymous union type holding N bytes, accessible either as uint8_t
 * coeffs[N] or as ceil(N/32) __m256i vectors. The __m256i member gives the
 * whole union the alignment of __m256i.
 */
#define ALIGNED_UINT8(N) \
union { \
uint8_t coeffs[N]; \
__m256i vec[((N)+31)/32]; \
}

/*
 * Anonymous union type holding N 32-bit integers, accessible either as
 * int32_t coeffs[N] or as ceil(N/8) __m256i vectors. The __m256i member
 * gives the whole union the alignment of __m256i.
 */
#define ALIGNED_INT32(N) \
union { \
int32_t coeffs[N]; \
__m256i vec[((N)+7)/8]; \
}

#endif

+ 31
- 0
crypto_sign/dilithium2aes/avx2/api.h Переглянути файл

@@ -0,0 +1,31 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_API_H
#define PQCLEAN_DILITHIUM2AES_AVX2_API_H

#include <stddef.h>
#include <stdint.h>

/* Sizes in bytes of public key, secret key and signature; these match the
 * length-public-key / length-secret-key / length-signature fields of the
 * scheme's META.yml. */
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES 1312
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES 2544
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES 2420
/* Human-readable algorithm name */
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_ALGNAME "Dilithium2-AES"


/* Key generation: writes the public key to pk and the secret key to sk.
 * NOTE(review): buffers are presumably CRYPTO_PUBLICKEYBYTES and
 * CRYPTO_SECRETKEYBYTES long -- confirm against sign.c. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Detached signature over m[0..mlen) with secret key sk; the signature goes
 * to sig and its length to *siglen. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Verification of a detached signature sig[0..siglen) over m[0..mlen) with
 * public key pk.
 * NOTE(review): presumably returns 0 on success and -1 otherwise, like the
 * CLEAN implementation -- confirm against sign.c. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

/* Signed-message form: sm receives signature and message, *smlen the total
 * length. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Opens a signed message sm[0..smlen) with public key pk; the recovered
 * message goes to m and its length to *mlen. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif

+ 24
- 0
crypto_sign/dilithium2aes/avx2/cdecl.h Переглянути файл

@@ -0,0 +1,24 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_CDECL_H
#define PQCLEAN_DILITHIUM2AES_AVX2_CDECL_H



/* Offsets, counted in array elements, of the individual sections of the
 * qdata constant table defined in consts.c (each 8-entry section advances
 * the offset by 8). */
#define _8XQ 0
#define _8XQINV 8
#define _8XDIV_QINV 16
#define _8XDIV 24
#define _ZETAS_QINV 32
#define _ZETAS 328

/* The C ABI on MacOS exports all symbols with a leading
 * underscore. This means that any symbols we refer to from
 * C files (functions) can't be found, and all symbols we
 * refer to from ASM also can't be found (consts.c).
 *
 * These defines help us get around this:
 * _cdecl(s) prepends an underscore to s, cdecl(s) leaves s unchanged.
 */

#define _cdecl(s) _##s
#define cdecl(s) s

#endif

+ 101
- 0
crypto_sign/dilithium2aes/avx2/consts.c Переглянути файл

@@ -0,0 +1,101 @@
#include "consts.h"
#include "params.h"
#include <stdint.h>

#define QINV 58728449 // q^(-1) mod 2^32
#define MONT (-4186625) // 2^32 mod q
#define DIV 41978 // mont^2/256
#define DIV_QINV (-8395782) // DIV * QINV mod 2^32, as a signed 32-bit value

const qdata_t PQCLEAN_DILITHIUM2AES_AVX2_qdata = {{
//#define _8XQ 0
Q, Q, Q, Q, Q, Q, Q, Q,

//#define _8XQINV 8
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

//#define _8XDIV_QINV 16
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV,

//#define _8XDIV 24
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV,

//#define _ZETAS_QINV 32
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244,
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077,
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561,
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417,
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735,
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904,
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771,
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600,
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139,
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433,
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547,
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852,
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995,
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424,
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315,
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951,
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031,
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878,
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606,
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568,
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583,
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093,
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172,
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187,
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462,
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722,
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279,
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342,
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272,
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682,
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363,
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473,
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426,
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762,
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494,
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853,
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238,

//#define _ZETAS 328
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468,
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451,
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905,
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855,
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103,
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928,
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549,
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672,
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005,
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439,
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299,
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779,
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928,
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771,
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969,
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969,
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922,
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430,
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856,
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961,
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995,
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100,
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149,
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738,
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098,
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455,
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634,
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424,
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622,
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115,
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233,
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154,
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838,
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642,
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107,
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782,
}
};

+ 10
- 0
crypto_sign/dilithium2aes/avx2/consts.h Переглянути файл

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_CONSTS_H
#define PQCLEAN_DILITHIUM2AES_AVX2_CONSTS_H
#include "align.h"
#include "cdecl.h"


/* Constant table shared by the C code and the assembly routines.
 * 624 32-bit words = 32 words of replicated scalars (_8XQ .. _8XDIV)
 * followed by two 296-word twiddle tables (_ZETAS_QINV, _ZETAS); the word
 * offsets of each section are the macros in cdecl.h.
 * NOTE(review): ALIGNED_INT32 comes from align.h; presumably it yields an
 * aligned union exposing the data both as int32_t coeffs[] and __m256i vec[]
 * -- confirm there. */
typedef ALIGNED_INT32(624) qdata_t;
extern const qdata_t PQCLEAN_DILITHIUM2AES_AVX2_qdata;

#endif

+ 240
- 0
crypto_sign/dilithium2aes/avx2/invntt.S Переглянути файл

@@ -0,0 +1,240 @@
#include "cdecl.h"
.include "shuffle.inc"

/*
 * Inverse-NTT butterfly on two vectors of eight 32-bit coefficients.
 *
 *   l <- l + h
 *   h <- Montgomery product of (h - l) and the twiddle factor
 *
 * Arguments are ymm register numbers:
 *   l, h      coefficient registers (both are overwritten)
 *   zl0, zl1  twiddle-times-QINV values for the even / odd 32-bit lanes
 *   zh0, zh1  twiddle values for the even / odd 32-bit lanes
 * With the defaults, ymm1 and ymm2 serve both lane parities.
 * Expects ymm0 = 8 x Q. Clobbers ymm12, ymm13, ymm14.
 *
 * vpmuldq only multiplies the even 32-bit lanes (signed, 64-bit result),
 * so the odd lanes are first copied down to even positions with vmovshdup
 * and processed separately.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* ymm12 = h - l (difference), l = l + h (sum) */
vpsubd %ymm\l,%ymm\h,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* low-half products with twiddle*QINV: even lanes -> ymm13, odd -> ymm14 */
vpmuldq %ymm\zl0,%ymm12,%ymm13
vmovshdup %ymm12,%ymm\h
vpmuldq %ymm\zl1,%ymm\h,%ymm14

/* full 64-bit products with the twiddle: even lanes -> ymm12, odd -> h */
vpmuldq %ymm\zh0,%ymm12,%ymm12
vpmuldq %ymm\zh1,%ymm\h,%ymm\h

/* multiply the low 32 bits of the QINV products by Q */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* subtract; the reduced result is the high 32 bits of each 64-bit lane */
vpsubd %ymm13,%ymm12,%ymm12
vpsubd %ymm14,%ymm\h,%ymm\h

/* bring even-lane results down, then merge with the odd-lane results */
vmovshdup %ymm12,%ymm12
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h
.endm

/*
 * Inverse-NTT levels 0 to 5 on one 256-byte block (64 coefficients) located
 * at byte offset 256*off from %rdi. %rsi points at the qdata table and
 * ymm0 must hold 8 x Q.
 *
 * Twiddles are read from the _ZETAS_QINV / _ZETAS tables at descending
 * indices; vpermq $0x1B reverses the order of the four 64-bit quadwords of
 * the loaded vector. For levels 0-2 the even and odd 32-bit lanes use
 * different twiddles, so vmovshdup builds the second set (ymm1 / ymm2 from
 * ymm3 / ymm15) and all four are passed to the butterfly explicitly.
 *
 * NOTE(review): shuffle2 / shuffle4 / shuffle8 are defined in shuffle.inc,
 * which is not visible here; presumably "shuffleN a,b,c,d" reads registers
 * a,b and writes the re-interleaved result to c,d -- confirm there.
 */
.macro levels0t5 off
/* load the eight coefficient vectors of this block */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

/* level 0 */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,5,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 6,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,9,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 10,11,1,3,2,15

/* level 1 */
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,6,1,3,2,15
butterfly 5,7,1,3,2,15

vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,10,1,3,2,15
butterfly 9,11,1,3,2,15

/* level 2 */
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,8,1,3,2,15
butterfly 5,9,1,3,2,15
butterfly 6,10,1,3,2,15
butterfly 7,11,1,3,2,15

/* level 3 */
shuffle2 4,5,3,5
shuffle2 6,7,4,7
shuffle2 8,9,6,9
shuffle2 10,11,8,11

/* from here on one twiddle vector (ymm1 / ymm2) serves both lane parities */
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2
butterfly 3,5
butterfly 4,7
butterfly 6,9
butterfly 8,11

/* level 4 */
shuffle4 3,4,10,4
shuffle4 6,8,3,8
shuffle4 5,7,6,7
shuffle4 9,11,5,11

vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2
butterfly 10,4
butterfly 3,8
butterfly 6,7
butterfly 5,11

/* level 5 */
shuffle8 10,3,9,3
shuffle8 6,5,10,5
shuffle8 4,8,6,8
shuffle8 7,11,4,11

/* a single twiddle per block, broadcast to all eight lanes */
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2
butterfly 9,3
butterfly 10,5
butterfly 6,8
butterfly 4,11

/* write the block back; register order follows the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm10,256*\off+ 32(%rdi)
vmovdqa %ymm6,256*\off+ 64(%rdi)
vmovdqa %ymm4,256*\off+ 96(%rdi)
vmovdqa %ymm3,256*\off+128(%rdi)
vmovdqa %ymm5,256*\off+160(%rdi)
vmovdqa %ymm8,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

/*
 * Inverse-NTT levels 6 and 7. Works on eight vectors taken 128 bytes apart,
 * starting at byte offset 32*off from %rdi, i.e. one 32-byte column out of
 * each of the eight 128-byte rows of the 1024-byte polynomial.
 * %rsi points at the qdata table and ymm0 must hold 8 x Q.
 *
 * After level 7 the four "sum" vectors (ymm4..ymm7) are additionally
 * Montgomery-multiplied by DIV (using _8XDIV_QINV / _8XDIV). The four
 * "difference" vectors (ymm8..ymm11) are stored as they leave the
 * butterfly.
 * NOTE(review): presumably the same scaling is folded into the level-7
 * twiddle at table index 0 for those vectors -- confirm against the table
 * generator.
 */
.macro levels6t7 off
vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* level 6 */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

/* level 7 */
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* upper half is final; storing it frees ymm8/ymm9 as scratch below */
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)

/* Montgomery-multiply ymm4, ymm5 by DIV: ymm1 = DIV*QINV, ymm2 = DIV */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1
vmovdqa (_8XDIV)*4(%rsi),%ymm2
vpmuldq %ymm1,%ymm4,%ymm12
vpmuldq %ymm1,%ymm5,%ymm13
vmovshdup %ymm4,%ymm8
vmovshdup %ymm5,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm4,%ymm4
vpmuldq %ymm2,%ymm5,%ymm5
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm4,%ymm4
vpsubd %ymm13,%ymm5,%ymm5
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm4,%ymm4
vmovshdup %ymm5,%ymm5
vpblendd $0xAA,%ymm8,%ymm4,%ymm4
vpblendd $0xAA,%ymm9,%ymm5,%ymm5

/* same Montgomery multiplication by DIV for ymm6, ymm7 */
vpmuldq %ymm1,%ymm6,%ymm12
vpmuldq %ymm1,%ymm7,%ymm13
vmovshdup %ymm6,%ymm8
vmovshdup %ymm7,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm6,%ymm6
vpmuldq %ymm2,%ymm7,%ymm7
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm6,%ymm6
vpsubd %ymm13,%ymm7,%ymm7
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm6,%ymm6
vmovshdup %ymm7,%ymm7
vpblendd $0xAA,%ymm8,%ymm6,%ymm6
vpblendd $0xAA,%ymm9,%ymm7,%ymm7

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
.endm

/*
 * In-place inverse NTT of one polynomial (256 32-bit coefficients).
 *
 *   %rdi  coefficient array; accessed with vmovdqa, so it must be
 *         32-byte aligned
 *   %rsi  qdata constant table
 * (argument order as in the prototype in ntt.h).
 *
 * The entry point is emitted under both the plain and the
 * underscore-prefixed symbol name; see cdecl.h.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx)
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx):
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx):
/* ymm0 = 8 x Q, kept live for every butterfly */
vmovdqa _8XQ*4(%rsi),%ymm0

/* levels 0-5: four independent 256-byte blocks */
levels0t5 0
levels0t5 1
levels0t5 2
levels0t5 3

/* levels 6-7 plus final scaling: four 32-byte columns */
levels6t7 0
levels6t7 1
levels6t7 2
levels6t7 3

ret

+ 199
- 0
crypto_sign/dilithium2aes/avx2/ntt.S Переглянути файл

@@ -0,0 +1,199 @@
#include "cdecl.h"
.include "shuffle.inc"

/*
 * Forward-NTT butterfly on two vectors of eight 32-bit coefficients.
 *
 *   t <- Montgomery product of h and the twiddle factor
 *   l <- l + t
 *   h <- l - t      (using the old value of l)
 *
 * Arguments are ymm register numbers:
 *   l, h      coefficient registers (both are overwritten)
 *   zl0, zl1  twiddle-times-QINV values for the even / odd 32-bit lanes
 *   zh0, zh1  twiddle values for the even / odd 32-bit lanes
 * With the defaults, ymm1 and ymm2 serve both lane parities.
 * Expects ymm0 = 8 x Q. Clobbers ymm12, ymm13, ymm14.
 *
 * The Montgomery correction term (ymm13) is applied after the add/sub
 * rather than before it: h gets +correction, l gets -correction.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* low-half products with twiddle*QINV: even lanes -> ymm13, odd -> ymm14 */
vpmuldq %ymm\zl0,%ymm\h,%ymm13
vmovshdup %ymm\h,%ymm12
vpmuldq %ymm\zl1,%ymm12,%ymm14

/* full 64-bit products with the twiddle: even lanes -> h, odd -> ymm12 */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h
vpmuldq %ymm\zh1,%ymm12,%ymm12

/* multiply the low 32 bits of the QINV products by Q */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* h = high 32 bits of the twiddle products, merged over all eight lanes */
vmovshdup %ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h

/* ymm12 = l - h, l = l + h */
vpsubd %ymm\h,%ymm\l,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* ymm13 = high 32 bits of the Q products, merged over all eight lanes */
vmovshdup %ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13

/* apply the correction term with opposite signs */
vpaddd %ymm13,%ymm12,%ymm\h
vpsubd %ymm13,%ymm\l,%ymm\l
.endm

/*
 * Forward-NTT levels 0 and 1. Works on eight vectors taken 128 bytes apart,
 * starting at byte offset 32*off from %rdi, i.e. one 32-byte column out of
 * each of the eight 128-byte rows of the 1024-byte polynomial.
 * %rsi points at the qdata table and ymm0 must hold 8 x Q.
 * Each twiddle is a single table entry broadcast to all lanes.
 */
.macro levels0t1 off
/* level 0 */
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2

vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* lower half (ymm4..7) against upper half (ymm8..11) */
butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* level 1 */
/* table entry 2 for the lower half, entry 3 for the upper half */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7

vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)
.endm

/*
 * Forward-NTT levels 2 to 7 on one 256-byte block (64 coefficients) located
 * at byte offset 256*off from %rdi. %rsi points at the qdata table and
 * ymm0 must hold 8 x Q.
 *
 * Twiddle handling per level:
 *   level 2     one table entry (index 4+off) broadcast to all lanes
 *   levels 3-4  one full twiddle vector shared by even and odd lanes
 *   levels 5-7  even and odd 32-bit lanes use different twiddles; the
 *               odd-lane values are moved to even positions with
 *               vpsrlq $32 (into ymm10) and vmovshdup (into ymm15) and
 *               all four registers are passed to the butterfly explicitly
 *
 * ymm10 is reused as a twiddle register from level 5 on; by then the last
 * shuffle2 has moved the coefficients it held into other registers.
 *
 * NOTE(review): shuffle8 / shuffle4 / shuffle2 are defined in shuffle.inc,
 * which is not visible here; presumably "shuffleN a,b,c,d" reads registers
 * a,b and writes the re-interleaved result to c,d -- confirm there.
 */
.macro levels2t7 off
/* level 2 */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

/* level 3 */
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2

butterfly 3,5
butterfly 8,10
butterfly 4,6
butterfly 9,11

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

/* level 4 */
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2

butterfly 7,8
butterfly 5,6
butterfly 3,4
butterfly 10,11

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

/* level 5 */
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15

butterfly 9,5,1,10,2,15
butterfly 8,4,1,10,2,15
butterfly 7,3,1,10,2,15
butterfly 6,11,1,10,2,15

/* level 6 */
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,7,1,10,2,15
butterfly 8,6,1,10,2,15

vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,3,1,10,2,15
butterfly 4,11,1,10,2,15

/* level 7 */
/* a separate twiddle vector for each of the four register pairs */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,8,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 7,6,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,4,1,10,2,15

vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 3,11,1,10,2,15

/* write the block back; register order follows the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm8,256*\off+ 32(%rdi)
vmovdqa %ymm7,256*\off+ 64(%rdi)
vmovdqa %ymm6,256*\off+ 96(%rdi)
vmovdqa %ymm5,256*\off+128(%rdi)
vmovdqa %ymm4,256*\off+160(%rdi)
vmovdqa %ymm3,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm

/*
 * In-place forward NTT of one polynomial (256 32-bit coefficients).
 *
 *   %rdi  coefficient array; accessed with vmovdqa, so it must be
 *         32-byte aligned
 *   %rsi  qdata constant table
 * (argument order as in the prototype in ntt.h).
 *
 * The entry point is emitted under both the plain and the
 * underscore-prefixed symbol name; see cdecl.h.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx)
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx):
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx):
/* ymm0 = 8 x Q, kept live for every butterfly */
vmovdqa _8XQ*4(%rsi),%ymm0

/* levels 0-1: four 32-byte columns spanning the whole polynomial */
levels0t1 0
levels0t1 1
levels0t1 2
levels0t1 3

/* levels 2-7: four independent 256-byte blocks */
levels2t7 0
levels2t7 1
levels2t7 2
levels2t7 3

ret


+ 14
- 0
crypto_sign/dilithium2aes/avx2/ntt.h Переглянути файл

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_NTT_H
#define PQCLEAN_DILITHIUM2AES_AVX2_NTT_H

#include <immintrin.h>

/*
 * Prototypes of the AVX2 assembly routines (ntt.S, invntt.S, pointwise.S).
 * All polynomial arguments are accessed with aligned 256-bit loads/stores
 * in the assembly, so they must be 32-byte aligned. The last argument of
 * the arithmetic routines is the constant table from consts.c.
 */

/* In-place forward / inverse number-theoretic transform of a. */
void PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx(
    __m256i *a,
    const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata);
void PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx(
    __m256i *a,
    const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata);

/* NOTE(review): defined outside the files shown here; presumably reorders
 * coefficients into the layout the NTT routines use -- confirm. */
void PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx(__m256i *a);

/* c = a * b coefficient-wise, with Montgomery reduction. */
void PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx(
    __m256i *c,
    const __m256i *a,
    const __m256i *b,
    const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata);

/* c = sum over four consecutive polynomials of a[i] * b[i], reduced once. */
void PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx(
    __m256i *c,
    const __m256i *a,
    const __m256i *b,
    const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata);

#endif

+ 261
- 0
crypto_sign/dilithium2aes/avx2/packing.c Переглянути файл

@@ -0,0 +1,261 @@
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"


/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_pack_pk
*
* Description: Serialize the public key as rho followed by the K
*              bit-packed polynomials of t1.
*
* Arguments:   - uint8_t pk[]: output byte array
*              - const uint8_t rho[]: seed rho (SEEDBYTES bytes)
*              - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES],
                                        const uint8_t rho[SEEDBYTES],
                                        const polyveck *t1) {
    uint8_t *out = pk;
    unsigned int idx;

    /* rho occupies the first SEEDBYTES bytes */
    for (idx = 0; idx != SEEDBYTES; idx++) {
        *out++ = rho[idx];
    }

    /* then the packed t1 polynomials, back to back */
    for (idx = 0; idx != K; idx++, out += POLYT1_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(out, &t1->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk
*
* Description: Parse a serialized public key into rho and t1
*              (inverse of PQCLEAN_DILITHIUM2AES_AVX2_pack_pk).
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - polyveck *t1: pointer to output vector t1
*              - const uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk(uint8_t rho[SEEDBYTES],
        polyveck *t1,
        const uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES]) {
    const uint8_t *in = pk;
    unsigned int idx;

    /* rho occupies the first SEEDBYTES bytes */
    for (idx = 0; idx != SEEDBYTES; idx++) {
        rho[idx] = *in++;
    }

    /* then the packed t1 polynomials, back to back */
    for (idx = 0; idx != K; idx++, in += POLYT1_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(&t1->vec[idx], in);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_pack_sk
*
* Description: Serialize the secret key. The fields are written in the
*              order rho, key, tr, s1, s2, t0.
*
* Arguments:   - uint8_t sk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const uint8_t tr[]: byte array containing tr
*              - const uint8_t key[]: byte array containing key
*              - const polyveck *t0: pointer to vector t0
*              - const polyvecl *s1: pointer to vector s1
*              - const polyveck *s2: pointer to vector s2
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES],
                                        const uint8_t rho[SEEDBYTES],
                                        const uint8_t tr[CRHBYTES],
                                        const uint8_t key[SEEDBYTES],
                                        const polyveck *t0,
                                        const polyvecl *s1,
                                        const polyveck *s2) {
    uint8_t *out = sk;
    unsigned int idx;

    /* three raw byte strings: rho, key, tr */
    for (idx = 0; idx != SEEDBYTES; idx++) {
        *out++ = rho[idx];
    }
    for (idx = 0; idx != SEEDBYTES; idx++) {
        *out++ = key[idx];
    }
    for (idx = 0; idx != CRHBYTES; idx++) {
        *out++ = tr[idx];
    }

    /* s1 (L polynomials) and s2 (K polynomials) share the eta packing */
    for (idx = 0; idx != L; idx++, out += POLYETA_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(out, &s1->vec[idx]);
    }
    for (idx = 0; idx != K; idx++, out += POLYETA_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(out, &s2->vec[idx]);
    }

    /* t0 comes last */
    for (idx = 0; idx != K; idx++, out += POLYT0_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(out, &t0->vec[idx]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk
*
* Description: Parse a serialized secret key. The fields are read in the
*              order rho, key, tr, s1, s2, t0 (inverse of
*              PQCLEAN_DILITHIUM2AES_AVX2_pack_sk).
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - uint8_t tr[]: output byte array for tr
*              - uint8_t key[]: output byte array for key
*              - polyveck *t0: pointer to output vector t0
*              - polyvecl *s1: pointer to output vector s1
*              - polyveck *s2: pointer to output vector s2
*              - const uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(uint8_t rho[SEEDBYTES],
        uint8_t tr[CRHBYTES],
        uint8_t key[SEEDBYTES],
        polyveck *t0,
        polyvecl *s1,
        polyveck *s2,
        const uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES]) {
    const uint8_t *in = sk;
    unsigned int idx;

    /* three raw byte strings: rho, key, tr */
    for (idx = 0; idx != SEEDBYTES; idx++) {
        rho[idx] = *in++;
    }
    for (idx = 0; idx != SEEDBYTES; idx++) {
        key[idx] = *in++;
    }
    for (idx = 0; idx != CRHBYTES; idx++) {
        tr[idx] = *in++;
    }

    /* s1 (L polynomials) and s2 (K polynomials) share the eta packing */
    for (idx = 0; idx != L; idx++, in += POLYETA_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(&s1->vec[idx], in);
    }
    for (idx = 0; idx != K; idx++, in += POLYETA_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(&s2->vec[idx], in);
    }

    /* t0 comes last */
    for (idx = 0; idx != K; idx++, in += POLYT0_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(&t0->vec[idx], in);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_pack_sig
*
* Description: Serialize a signature as the challenge hash c, the L
*              bit-packed polynomials of z, and the hint vector h.
*              The hint section is OMEGA + K bytes long: the first OMEGA
*              bytes list the indices of the nonzero coefficients of
*              h, polynomial after polynomial, and byte OMEGA + i holds
*              the running count of indices after polynomial i.
*              The function does not check that h has at most OMEGA
*              nonzero coefficients.
*
* Arguments:   - uint8_t sig[]: output byte array
*              - const uint8_t *c: pointer to challenge hash of
*                                  length SEEDBYTES
*              - const polyvecl *z: pointer to vector z
*              - const polyveck *h: pointer to hint vector h
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES],
        const uint8_t c[SEEDBYTES],
        const polyvecl *z,
        const polyveck *h) {
    uint8_t *out = sig;
    unsigned int row, col, used;

    /* challenge hash */
    for (col = 0; col != SEEDBYTES; col++) {
        *out++ = c[col];
    }

    /* response vector z */
    for (row = 0; row != L; row++, out += POLYZ_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(out, &z->vec[row]);
    }

    /* hint section: start from all-zero so unused index slots stay 0 */
    for (col = 0; col != OMEGA + K; col++) {
        out[col] = 0;
    }

    used = 0;
    for (row = 0; row != K; row++) {
        for (col = 0; col != N; col++) {
            if (h->vec[row].coeffs[col] == 0) {
                continue;
            }
            out[used] = (uint8_t) col;
            used++;
        }

        /* cumulative number of indices written so far */
        out[OMEGA + row] = (uint8_t) used;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig
*
* Description: Parse a serialized signature into c, z and h. The hint
*              section is validated while decoding: the per-polynomial
*              counts must be non-decreasing and at most OMEGA, the
*              indices inside one polynomial must be strictly
*              increasing, and all unused index slots must be zero.
*              On a malformed signature the outputs may have been
*              partially written.
*
* Arguments:   - uint8_t *c: pointer to output challenge hash
*              - polyvecl *z: pointer to output vector z
*              - polyveck *h: pointer to output hint vector h
*              - const uint8_t sig[]: byte array containing
*                bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig(uint8_t c[SEEDBYTES],
        polyvecl *z,
        polyveck *h,
        const uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES]) {
    const uint8_t *in = sig;
    unsigned int row, pos, first, last;

    /* challenge hash */
    for (pos = 0; pos != SEEDBYTES; pos++) {
        c[pos] = *in++;
    }

    /* response vector z */
    for (row = 0; row != L; row++, in += POLYZ_PACKEDBYTES) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(&z->vec[row], in);
    }

    /* hint section: "in" now points at its first byte */
    first = 0;
    for (row = 0; row != K; row++) {
        for (pos = 0; pos != N; pos++) {
            h->vec[row].coeffs[pos] = 0;
        }

        /* indices of this polynomial live in slots [first, last) */
        last = in[OMEGA + row];
        if (last < first || last > OMEGA) {
            return 1;
        }

        for (pos = first; pos < last; pos++) {
            /* Coefficients are ordered for strong unforgeability */
            if (pos > first && in[pos] <= in[pos - 1]) {
                return 1;
            }
            h->vec[row].coeffs[in[pos]] = 1;
        }

        first = last;
    }

    /* Extra indices are zero for strong unforgeability */
    for (pos = first; pos < OMEGA; pos++) {
        if (in[pos] != 0) {
            return 1;
        }
    }

    return 0;
}

+ 31
- 0
crypto_sign/dilithium2aes/avx2/packing.h Переглянути файл

@@ -0,0 +1,31 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM2AES_AVX2_PACKING_H
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/*
 * Serialization of keys and signatures (implemented in packing.c).
 * The pack_* functions write a byte string, the unpack_* functions parse
 * one. Only unpack_sig validates its input and reports a result.
 */

/* Public key: rho followed by t1. */
void PQCLEAN_DILITHIUM2AES_AVX2_pack_pk(
    uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES],
    const uint8_t rho[SEEDBYTES],
    const polyveck *t1);

void PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk(
    uint8_t rho[SEEDBYTES],
    polyveck *t1,
    const uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES]);

/* Secret key: serialized in the order rho, key, tr, s1, s2, t0. */
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sk(
    uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES],
    const uint8_t rho[SEEDBYTES],
    const uint8_t tr[CRHBYTES],
    const uint8_t key[SEEDBYTES],
    const polyveck *t0,
    const polyvecl *s1,
    const polyveck *s2);

void PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(
    uint8_t rho[SEEDBYTES],
    uint8_t tr[CRHBYTES],
    uint8_t key[SEEDBYTES],
    polyveck *t0,
    polyvecl *s1,
    polyveck *s2,
    const uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES]);

/* Signature: c followed by z and the encoded hint vector h. */
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sig(
    uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES],
    const uint8_t c[SEEDBYTES],
    const polyvecl *z,
    const polyveck *h);

/* Returns 1 if the hint encoding is malformed, 0 otherwise. */
int PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig(
    uint8_t c[SEEDBYTES],
    polyvecl *z,
    polyveck *h,
    const uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES]);

#endif

+ 41
- 0
crypto_sign/dilithium2aes/avx2/params.h Переглянути файл

@@ -0,0 +1,41 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM2AES_AVX2_PARAMS_H

/* Parameter set of this implementation. This header is also included from
 * assembly (pointwise.S), so it must contain preprocessor directives and
 * C-style comments only. */


/* Byte lengths of the seeds (rho, key, challenge hash) and of tr;
 * N is the number of coefficients per polynomial, Q the modulus. */
#define SEEDBYTES 32
#define CRHBYTES 48
#define N 256
#define Q 8380417
#define D 13
#define ROOT_OF_UNITY 1753

/* K and L are the lengths of the polyveck and polyvecl vectors; OMEGA is
 * the number of index slots in the serialized hint (see packing.c). */
#define K 4
#define L 4
#define ETA 2
#define TAU 39
#define BETA 78
#define GAMMA1 (1 << 17)
#define GAMMA2 ((Q-1)/88)
#define OMEGA 80
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_ALGNAME "Dilithium2-AES"


/* Sizes in bytes of one bit-packed polynomial of each kind
 * (320 = N*10/8, 416 = N*13/8, 576 = N*18/8, 192 = N*6/8, 96 = N*3/8)
 * and of the serialized hint vector. */
#define POLYT1_PACKEDBYTES 320
#define POLYT0_PACKEDBYTES 416
#define POLYVECH_PACKEDBYTES (OMEGA + K)

#define POLYZ_PACKEDBYTES 576

#define POLYW1_PACKEDBYTES 192

#define POLYETA_PACKEDBYTES 96

/* Serialized object sizes; they evaluate to 1312, 2544 and 2420 bytes.
 * Secret key = rho and key (2*SEEDBYTES), tr (CRHBYTES), s1, s2, t0. */
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+ L*POLYETA_PACKEDBYTES \
+ K*POLYETA_PACKEDBYTES \
+ K*POLYT0_PACKEDBYTES)
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif

+ 199
- 0
crypto_sign/dilithium2aes/avx2/pointwise.S Переглянути файл

@@ -0,0 +1,199 @@
#include "params.h"
#include "cdecl.h"

/*
 * Coefficient-wise Montgomery multiplication of two polynomials:
 * c[i] = a[i] * b[i] * 2^-32 mod Q for all 256 coefficients.
 *
 *   %rdi  c (output)      %rsi  a      %rdx  b      %rcx  qdata table
 * (argument order as in the prototype in ntt.h). All three polynomials are
 * accessed with vmovdqa and must be 32-byte aligned. The pointers in
 * %rdi, %rsi, %rdx are advanced; %eax is the loop counter.
 *
 * The loop handles 96 bytes (24 coefficients) per iteration and runs 10
 * times; the remaining 64 bytes (16 coefficients) are handled by the
 * unrolled tail after the loop, for 1024 bytes in total.
 *
 * vpmuldq multiplies only the even 32-bit lanes, so every input vector is
 * split into its even lanes (used in place) and its odd lanes (moved to
 * even positions with vpsrlq $32 or vmovshdup; both work because vpmuldq
 * ignores the upper half of each 64-bit lane).
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx)
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx):
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx):
#consts
/* ymm0 = 8 x QINV, ymm1 = 8 x Q */
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop1:
#load
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa 64(%rsi),%ymm6
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vmovshdup %ymm14,%ymm15

#mul
/* 64-bit products: even lanes in ymm2/4/6, odd lanes in ymm3/5/7 */
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5
vpmuldq %ymm6,%ymm14,%ymm6
vpmuldq %ymm7,%ymm15,%ymm7

#reduce
/* Montgomery reduction: m = low32(product) * QINV, then
   product - low32(m) * Q; the result is the high 32 bits of each lane */
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm0,%ymm6,%ymm14
vpmuldq %ymm0,%ymm7,%ymm15
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpmuldq %ymm1,%ymm14,%ymm14
vpmuldq %ymm1,%ymm15,%ymm15
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsubq %ymm14,%ymm6,%ymm6
vpsubq %ymm15,%ymm7,%ymm7
/* even-lane results: move the high half down to the even position;
   odd-lane results already sit in the odd position */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vmovshdup %ymm6,%ymm6

#store
/* 0xAA takes the odd 32-bit lanes from the first operand listed */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4
vpblendd $0xAA,%ymm7,%ymm6,%ymm6
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm6,64(%rdi)

add $96,%rdi
add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1

/* tail: the last 64 bytes, same computation on two vectors */
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
/* 0x55 with swapped operands selects the same lanes as 0xAA above:
   even lanes from ymm2/ymm4, odd lanes from ymm3/ymm5 */
vpblendd $0x55,%ymm2,%ymm3,%ymm2
vpblendd $0x55,%ymm4,%ymm5,%ymm4
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

ret

/*
 * Load 16 coefficients of a (from %rsi) and of b (from %rdx) at byte
 * offset "off" and form their unreduced 64-bit products:
 *   ymm6, ymm8  products of the even 32-bit lanes
 *   ymm7, ymm9  products of the odd 32-bit lanes
 * Clobbers ymm10..ymm13.
 */
.macro pointwise off
#load
vmovdqa \off(%rsi),%ymm6
vmovdqa \off+32(%rsi),%ymm8
vmovdqa \off(%rdx),%ymm10
vmovdqa \off+32(%rdx),%ymm12
/* move the odd lanes down to even positions for vpmuldq */
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm6,%ymm10,%ymm6
vpmuldq %ymm7,%ymm11,%ymm7
vpmuldq %ymm8,%ymm12,%ymm8
vpmuldq %ymm9,%ymm13,%ymm9
.endm

/*
 * Add the 64-bit products left in ymm6..ymm9 by the pointwise macro to the
 * running 64-bit sums in ymm2..ymm5.
 */
.macro acc
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm7,%ymm3,%ymm3
vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5
.endm

/*
 * Coefficient-wise multiply-accumulate over four polynomial pairs:
 * c[i] = (a0[i]*b0[i] + a1[i]*b1[i] + a2[i]*b2[i] + a3[i]*b3[i]) * 2^-32
 * mod Q, where the four polynomials of a (and of b) lie 1024 bytes apart.
 *
 *   %rdi  c (output)      %rsi  a      %rdx  b      %rcx  qdata table
 * (argument order as in the prototype in ntt.h). All buffers are accessed
 * with vmovdqa and must be 32-byte aligned. The pointers in %rdi, %rsi,
 * %rdx are advanced; %eax is the loop counter.
 *
 * The four products are summed as 64-bit values and Montgomery-reduced
 * once. Each iteration produces 64 bytes (16 coefficients); 16 iterations
 * cover the 1024-byte output polynomial.
 */
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx)
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx):
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx):
#consts
/* ymm0 = 8 x QINV, ymm1 = 8 x Q */
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop2:
pointwise 0

#mov
/* the first product initializes the accumulators ymm2..ymm5 */
vmovdqa %ymm6,%ymm2
vmovdqa %ymm7,%ymm3
vmovdqa %ymm8,%ymm4
vmovdqa %ymm9,%ymm5

pointwise 1024
acc

pointwise 2048
acc

pointwise 3072
acc




#reduce
/* Montgomery reduction of the 64-bit sums: m = low32(sum) * QINV, then
   sum - low32(m) * Q; the result is the high 32 bits of each lane */
vpmuldq %ymm0,%ymm2,%ymm6
vpmuldq %ymm0,%ymm3,%ymm7
vpmuldq %ymm0,%ymm4,%ymm8
vpmuldq %ymm0,%ymm5,%ymm9
vpmuldq %ymm1,%ymm6,%ymm6
vpmuldq %ymm1,%ymm7,%ymm7
vpmuldq %ymm1,%ymm8,%ymm8
vpmuldq %ymm1,%ymm9,%ymm9
vpsubq %ymm6,%ymm2,%ymm2
vpsubq %ymm7,%ymm3,%ymm3
vpsubq %ymm8,%ymm4,%ymm4
vpsubq %ymm9,%ymm5,%ymm5
/* even-lane results: move the high half down to the even position */
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
/* merge: even lanes from ymm2/ymm4, odd lanes from ymm3/ymm5 */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4

vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

add $64,%rsi
add $64,%rdx
add $64,%rdi
add $1,%eax
cmp $16,%eax
jb _looptop2

ret

+ 891
- 0
crypto_sign/dilithium2aes/avx2/poly.c Переглянути файл

@@ -0,0 +1,891 @@
#include "align.h"
#include "consts.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "rejsample.h"
#include "rounding.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* Benchmarking hooks; both expand to nothing here, so an argument such as
 * *tred in DBENCH_STOP(*tred) is never evaluated. */
#define DBENCH_START()
#define DBENCH_STOP(t)

/* Per-32-bit-lane select built on the float blend: each lane of the result
 * is taken from b where the sign bit (bit 31) of the matching lane of mask
 * is set, and from a otherwise. The casts only reinterpret the bits. */
#define _mm256_blendv_epi32(a,b,mask) \
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
_mm256_castsi256_ps(b), \
_mm256_castsi256_ps(mask)))

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce
*
* Description: Inplace partial reduction of all coefficients modulo Q.
*              Each coefficient f is replaced by f - t*Q, where
*              t = (f + 2^22) >> 23 (arithmetic shift). For inputs of
*              absolute value at most 2^31 - 2^22 - 1 (so that
*              f + 2^22 cannot overflow) the result has absolute
*              value at most 6283008.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(poly *a) {
    const __m256i modulus = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]);
    const __m256i half = _mm256_set1_epi32(1 << 22);
    __m256i coeff, quot;
    unsigned int blk = 0;
    DBENCH_START();

    /* eight coefficients per 256-bit vector */
    while (blk < N / 8) {
        coeff = _mm256_load_si256(&a->vec[blk]);

        /* quot = rounded quotient of coeff by 2^23 */
        quot = _mm256_srai_epi32(_mm256_add_epi32(coeff, half), 23);

        /* coeff -= quot * Q */
        coeff = _mm256_sub_epi32(coeff, _mm256_mullo_epi32(quot, modulus));

        _mm256_store_si256(&a->vec[blk], coeff);
        blk++;
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq
*
* Description: Conditional add of q, in place: every coefficient whose sign
*              bit is set gets the vector stored at index _8XQ of the
*              constants table added to it; non-negative coefficients are
*              left unchanged.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(poly *a) {
    unsigned int i;
    const __m256i nothing = _mm256_setzero_si256();
    const __m256i modulus = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]);
    DBENCH_START();

    for (i = 0; i < N / 8; i++) {
        const __m256i coeffs = _mm256_load_si256(&a->vec[i]);
        /* The coefficient itself is the blend mask: its sign bit selects q. */
        const __m256i addend = _mm256_blendv_epi32(nothing, modulus, coeffs);
        _mm256_store_si256(&a->vec[i], _mm256_add_epi32(coeffs, addend));
    }

    DBENCH_STOP(*tred);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze
*
* Description: Inplace reduction of all coefficients of polynomial to
* positive standard representatives. Implemented as
* PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce followed by
* PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq. Assumes input
* coefficients to be at most 2^31 - 2^22 + 1 in
* absolute value.
* NOTE(review): poly_reduce documents its bound as
* 2^31 - 2^22 - 1; confirm which of the two is intended.
*
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(poly *a) {
DBENCH_START();

/* Bring coefficients into the small signed range first ... */
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(a);
/* ... then lift the negative ones by adding q. */
PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(a);

DBENCH_STOP(*tred);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_add
*
* Description: Coefficient-wise 32-bit addition c = a + b, eight lanes per
*              step. No modular reduction is performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first summand
*              - const poly *b: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N / 8; i++) {
        const __m256i lhs = _mm256_load_si256(&a->vec[i]);
        const __m256i rhs = _mm256_load_si256(&b->vec[i]);
        _mm256_store_si256(&c->vec[i], _mm256_add_epi32(lhs, rhs));
    }

    DBENCH_STOP(*tadd);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_sub
*
* Description: Coefficient-wise 32-bit subtraction c = a - b, eight lanes
*              per step. No modular reduction is performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial, which
*                               is subtracted from the first
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N / 8; i++) {
        const __m256i minuend = _mm256_load_si256(&a->vec[i]);
        const __m256i subtrahend = _mm256_load_si256(&b->vec[i]);
        _mm256_store_si256(&c->vec[i], _mm256_sub_epi32(minuend, subtrahend));
    }

    DBENCH_STOP(*tadd);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl
*
* Description: In-place left shift of every coefficient by D bits, i.e.
*              multiplication by 2^D without modular reduction. Assumes
*              input coefficients to be less than 2^{31-D} in absolute
*              value.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(poly *a) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N / 8; i++) {
        const __m256i coeffs = _mm256_load_si256(&a->vec[i]);
        _mm256_store_si256(&a->vec[i], _mm256_slli_epi32(coeffs, D));
    }

    DBENCH_STOP(*tmul);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt
*
* Description: Inplace forward NTT. Coefficients can grow by up to
* 8*Q in absolute value. Thin wrapper that hands the
* coefficient vectors and the shared constants table to
* PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx.
*
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(poly *a) {
DBENCH_START();

PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec);

DBENCH_STOP(*tmul);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont
*
* Description: Inplace inverse NTT and multiplication by 2^{32}.
* Input coefficients need to be less than Q in absolute
* value and output coefficients are again bounded by Q.
* Thin wrapper that hands the coefficient vectors and the
* shared constants table to
* PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx.
*
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(poly *a) {
DBENCH_START();

PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec);

DBENCH_STOP(*tmul);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack
*
* Description: Inplace call of PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx on
* the coefficient vectors. Presumably this rearranges the
* coefficients into the order used by the vectorized NTT
* routines - TODO confirm against the assembly.
*
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(poly *a) {
DBENCH_START();

PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx(a->vec);

DBENCH_STOP(*tmul);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery
*
* Description: Pointwise multiplication of polynomials in NTT domain
* representation and multiplication of resulting polynomial
* by 2^{-32}. Thin wrapper around
* PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx, which also receives
* the shared constants table.
*
* Arguments: - poly *c: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
DBENCH_START();

PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec);

DBENCH_STOP(*tmul);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round
*
* Description: For all coefficients c of the input polynomial,
* compute c0, c1 such that c mod^+ Q = c1*2^D + c0
* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
* positive standard representatives. The work is done by
* PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx.
*
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1
* - poly *a0: pointer to output polynomial with coefficients c0
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) {
DBENCH_START();

PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(a1->vec, a0->vec, a->vec);

DBENCH_STOP(*tround);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose
*
* Description: For all coefficients c of the input polynomial,
* compute high and low bits c0, c1 such that
* c mod^+ Q = c1*ALPHA + c0 with -ALPHA/2 < c0 <= ALPHA/2,
* except if c1 = (Q-1)/ALPHA, where we set c1 = 0 and
* -ALPHA/2 <= c0 = c mod Q - Q < 0.
* Assumes coefficients to be positive standard representatives.
* The work is done by PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx.
*
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1
* - poly *a0: pointer to output polynomial with coefficients c0
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) {
DBENCH_START();

PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(a1->vec, a0->vec, a->vec);

DBENCH_STOP(*tround);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint
*
* Description: Compute hint array. The coefficients of which are the
* indices of the coefficients of the input polynomial
* whose low bits overflow into the high bits. The work is
* done by PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx.
*
* Arguments: - uint8_t hint[N]: pointer to output hint array (preallocated of length N)
* - const poly *a0: pointer to low part of input polynomial
* - const poly *a1: pointer to high part of input polynomial
*
* Returns number of hints, i.e. length of hint array.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) {
unsigned int r;
DBENCH_START();

/* The helper fills hint[] and reports how many entries it wrote. */
r = PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(hint, a0->vec, a1->vec);

DBENCH_STOP(*tround);
return r;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint
*
* Description: Use hint polynomial to correct the high bits of a polynomial.
* The work is done by PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx.
*
* Arguments: - poly *b: pointer to output polynomial with corrected high bits
* - const poly *a: pointer to input polynomial
* - const poly *h: pointer to input hint polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) {
DBENCH_START();

PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(b->vec, a->vec, h->vec);

DBENCH_STOP(*tround);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm
*
* Description: Check the infinity norm of a polynomial against a bound.
*              Assumes the input polynomial to be reduced by
*              PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce().
*
* Arguments:   - const poly *a: pointer to polynomial
*              - int32_t B: norm bound
*
* Returns 0 if every coefficient is strictly smaller than B in absolute
* value and B <= (Q-1)/8; returns 1 otherwise (including whenever
* B > (Q-1)/8, without looking at the coefficients).
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(const poly *a, int32_t B) {
    unsigned int i;
    __m256i violations;
    const __m256i bound = _mm256_set1_epi32(B - 1);
    DBENCH_START();

    if (B > (Q - 1) / 8) {
        return 1;
    }

    /* OR together the per-lane "|coeff| > B - 1" masks of all vectors. */
    violations = _mm256_setzero_si256();
    for (i = 0; i < N / 8; i++) {
        const __m256i magnitude = _mm256_abs_epi32(_mm256_load_si256(&a->vec[i]));
        violations = _mm256_or_si256(violations, _mm256_cmpgt_epi32(magnitude, bound));
    }

    DBENCH_STOP(*tsample);
    /* testz yields 1 when no mask bit is set, i.e. when nothing exceeded. */
    return 1 - _mm256_testz_si256(violations, violations);
}

/*************************************************
* Name:        rej_uniform
*
* Description: Scalar rejection sampler. Consumes the byte array three
*              bytes at a time, forms a little-endian 23-bit candidate
*              from each triple and keeps it when it is below Q, so the
*              accepted values lie in [0, Q-1]. A trailing group of fewer
*              than three bytes is ignored.
*
* Arguments:   - int32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const uint8_t *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not
* enough random bytes were given.
**************************************************/
static unsigned int rej_uniform(int32_t *a,
                                unsigned int len,
                                const uint8_t *buf,
                                unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int offset;
    uint32_t candidate;
    DBENCH_START();

    for (offset = 0; accepted < len && offset + 3 <= buflen; offset += 3) {
        candidate = buf[offset]
                    | ((uint32_t)buf[offset + 1] << 8)
                    | ((uint32_t)buf[offset + 2] << 16);
        /* Drop the top bit so the candidate is 23 bits wide. */
        candidate &= 0x7FFFFF;

        if (candidate < Q) {
            a[accepted++] = candidate;
        }
    }

    DBENCH_STOP(*tsample);
    return accepted;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit
*
* Description: Sample polynomial with uniformly random coefficients
* in [0,Q-1] by performing rejection sampling on the
* output of an already initialized stream128 state.
* The first REJ_UNIFORM_NBLOCKS blocks go through the
* vectorized sampler; if that yields fewer than N
* coefficients, further blocks are squeezed one at a time and
* fed to the scalar rej_uniform until N are available.
*
* Arguments: - poly *a: pointer to output polynomial
* - stream128_state *state: pointer to initialized stream state;
* it is advanced by the squeezes
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) {
unsigned int ctr;
/* PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx reads up to 8 additional bytes */
ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf;

stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state);
ctr = PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs);

while (ctr < N) {
/* Original author's note: length of buf is always divisible by 3; hence, no bytes left.
 * NOTE(review): rej_uniform silently drops a trailing group of fewer than 3 bytes, so this
 * only holds if STREAM128_BLOCKBYTES is a multiple of 3 - confirm for the AES-based stream. */
stream128_squeezeblocks(buf.coeffs, 1, state);
ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform
*
* Description: Sample polynomial with uniformly random coefficients
* in [0,Q-1]: initializes a stream128 state from seed and
* nonce, runs PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit
* on it and releases the state again.
*
* Arguments: - poly *a: pointer to output polynomial
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES
* - uint16_t nonce: 2-byte nonce
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
stream128_state state;
stream128_init(&state, seed, nonce);
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(a, &state);
stream128_release(&state);
}


/*************************************************
* Name:        rej_eta
*
* Description: Scalar rejection sampler for small coefficients. Every
*              input byte is split into its low and high nibble; a nibble
*              equal to 15 is rejected, any other nibble t is mapped to
*              2 - (t mod 5), i.e. a value in [-2, 2]. (The constant 2 is
*              hard-coded here; presumably it equals ETA for this
*              parameter set.)
*
* Arguments:   - int32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const uint8_t *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not
* enough random bytes were given.
**************************************************/
static unsigned int rej_eta(int32_t *a,
                            unsigned int len,
                            const uint8_t *buf,
                            unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int offset;
    uint32_t lo, hi;
    DBENCH_START();

    for (offset = 0; accepted < len && offset < buflen; offset++) {
        lo = buf[offset] & 0x0F;
        hi = buf[offset] >> 4;

        if (lo < 15) {
            /* (205 * t) >> 10 equals t / 5 for t < 15, so this is t mod 5. */
            lo -= ((205 * lo) >> 10) * 5;
            a[accepted++] = 2 - lo;
        }
        /* The low nibble may just have filled the last free slot. */
        if (hi < 15 && accepted < len) {
            hi -= ((205 * hi) >> 10) * 5;
            a[accepted++] = 2 - hi;
        }
    }

    DBENCH_STOP(*tsample);
    return accepted;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit
*
* Description: Sample polynomial with uniformly random coefficients
* in [-ETA,ETA] by performing rejection sampling using the
* output of an already initialized stream128 state.
* The first REJ_UNIFORM_ETA_NBLOCKS blocks go through the
* vectorized sampler; if that yields fewer than N
* coefficients, further blocks are squeezed one at a time and
* fed to the scalar rej_eta until N are available.
*
* Arguments: - poly *a: pointer to output polynomial
* - stream128_state *state: pointer to initialized stream state;
* it is advanced by the squeezes
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) {
unsigned int ctr;
/* NOTE(review): the size is REJ_UNIFORM_BUFLEN multiplied by a block length, while only
 * REJ_UNIFORM_ETA_NBLOCKS blocks are squeezed into it below. This looks far larger than
 * needed; confirm how many bytes PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx reads before shrinking. */
ALIGNED_UINT8(REJ_UNIFORM_BUFLEN * STREAM128_BLOCKBYTES) buf;

stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state);
ctr = PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(a->coeffs, buf.coeffs);

/* Scalar fallback: one extra block per pass until all N coefficients exist. */
while (ctr < N) {
stream128_squeezeblocks(buf.coeffs, 1, state);
ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES);
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta
*
* Description: Sample polynomial with uniformly random coefficients
* in [-ETA,ETA]: initializes a stream128 state from seed and
* nonce, runs
* PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit on it
* and releases the state again.
*
* Arguments: - poly *a: pointer to output polynomial
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES
* - uint16_t nonce: 2-byte nonce
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
stream128_state state;
stream128_init(&state, seed, nonce);
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(a, &state);
stream128_release(&state);
}


/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit
*
* Description: Sample polynomial with uniformly random coefficients
* in [-(GAMMA1 - 1), GAMMA1] by unpacking the output of an
* already initialized stream256 state. Squeezes
* POLY_UNIFORM_GAMMA1_NBLOCKS blocks and passes them to
* PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack; no rejection takes
* place.
*
* Arguments: - poly *a: pointer to output polynomial
* - stream256_state *state: pointer to initialized stream state;
* it is advanced by the squeeze
**************************************************/
/* Number of stream256 blocks needed to cover POLYZ_PACKEDBYTES bytes (rounded up). */
#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES)
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) {
/* PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack reads 14 additional bytes */
ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf;
stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state);
PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(a, buf.coeffs);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1
*
* Description: Sample polynomial with uniformly random coefficients
* in [-(GAMMA1 - 1), GAMMA1]: initializes a stream256 state
* from seed and nonce, runs
* PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit on it
* and releases the state again.
*
* Arguments: - poly *a: pointer to output polynomial
* - const uint8_t seed[]: byte array with seed of length CRHBYTES
* - uint16_t nonce: 16-bit nonce
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) {
stream256_state state;
stream256_init(&state, seed, nonce);
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(a, &state);
stream256_release(&state);
}


/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge
*
* Description: Implementation of H. Samples polynomial with TAU nonzero
*              coefficients in {-1,1} using the output stream of
*              SHAKE256(seed). The first 8 output bytes supply the sign
*              bits; the following bytes drive an in-place shuffle that
*              picks, for each i in [N - TAU, N), a position b <= i by
*              rejection sampling single bytes.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const uint8_t seed[]: byte array containing seed of
*                                      length SEEDBYTES
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) {
    unsigned int i, b, pos;
    uint64_t signs;
    ALIGNED_UINT8(SHAKE256_RATE) buf;
    shake256incctx state;

    shake256_inc_init(&state);
    shake256_inc_absorb(&state, seed, SEEDBYTES);
    shake256_inc_finalize(&state);
    shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state);

    /* The first 8 bytes of the stream are the pool of sign bits. */
    memcpy(&signs, buf.coeffs, 8);
    pos = 8;

    memset(c->vec, 0, sizeof(poly));
    for (i = N - TAU; i < N; ++i) {
        /* Draw bytes until one is a valid index in [0, i]; refill the
         * buffer from the stream whenever it runs dry. */
        do {
            if (pos >= SHAKE256_RATE) {
                shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state);
                pos = 0;
            }

            b = buf.coeffs[pos++];
        } while (b > i);

        /* Move whatever sits at b to slot i, then place +1 or -1 at b. */
        c->coeffs[i] = c->coeffs[b];
        /* Narrow the sign bit to a signed type before the arithmetic so
         * that -1 is computed directly instead of relying on the
         * implementation-defined conversion of 2^64 - 1 to int32_t. */
        c->coeffs[b] = 1 - 2 * (int32_t)(signs & 1);
        signs >>= 1;
    }
    shake256_inc_ctx_release(&state);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack
*
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
*              Every coefficient x is stored as ETA - x in 3 bits, so
*              each group of 8 coefficients fills 3 output bytes.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYETA_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) {
    unsigned int i, j;
    uint8_t t[8];
    DBENCH_START();

    for (i = 0; i < N / 8; ++i) {
        uint8_t *out = &r[3 * i];

        /* Map the 8 coefficients of this group to their non-negative form. */
        for (j = 0; j < 8; ++j) {
            t[j] = ETA - a->coeffs[8 * i + j];
        }

        out[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
        out[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
        out[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack
*
* Description: Unpack polynomial with coefficients in [-ETA,ETA].
*              Inverse of the 3-bit packing: each group of 3 input bytes
*              yields 8 fields v, and the coefficient is ETA - v.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N / 8; ++i) {
        const uint8_t *in = &a[3 * i];
        int32_t *out = &r->coeffs[8 * i];

        out[0] = ETA - ((in[0] >> 0) & 7);
        out[1] = ETA - ((in[0] >> 3) & 7);
        /* Fields 2 and 5 straddle a byte boundary. */
        out[2] = ETA - (((in[0] >> 6) | (in[1] << 2)) & 7);
        out[3] = ETA - ((in[1] >> 1) & 7);
        out[4] = ETA - ((in[1] >> 4) & 7);
        out[5] = ETA - (((in[1] >> 7) | (in[2] << 1)) & 7);
        out[6] = ETA - ((in[2] >> 2) & 7);
        out[7] = ETA - ((in[2] >> 5) & 7);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack
*
* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits:
*              each group of 4 coefficients fills 5 output bytes.
*              Input coefficients are assumed to be positive standard
*              representatives.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYT1_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N / 4; ++i) {
        const int32_t *in = &a->coeffs[4 * i];
        uint8_t *out = &r[5 * i];

        out[0] = (in[0] >> 0);
        out[1] = (in[0] >> 8) | (in[1] << 2);
        out[2] = (in[1] >> 6) | (in[2] << 4);
        out[3] = (in[2] >> 4) | (in[3] << 6);
        out[4] = (in[3] >> 2);
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack
*
* Description: Unpack polynomial t1 with 10-bit coefficients: each group
*              of 5 input bytes yields 4 coefficients.
*              Output coefficients are positive standard representatives.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) {
    unsigned int i;
    DBENCH_START();

    for (i = 0; i < N / 4; ++i) {
        const uint8_t *in = &a[5 * i];
        int32_t *out = &r->coeffs[4 * i];

        /* Each coefficient spans two neighbouring bytes; 0x3FF keeps 10 bits. */
        out[0] = ((in[0] >> 0) | ((uint32_t)in[1] << 8)) & 0x3FF;
        out[1] = ((in[1] >> 2) | ((uint32_t)in[2] << 6)) & 0x3FF;
        out[2] = ((in[2] >> 4) | ((uint32_t)in[3] << 4)) & 0x3FF;
        out[3] = ((in[3] >> 6) | ((uint32_t)in[4] << 2)) & 0x3FF;
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack
*
* Description: Bit-pack polynomial t0 with coefficients in
*              ]-2^{D-1}, 2^{D-1}]. Every coefficient x is stored as
*              2^{D-1} - x in 13 bits, so each group of 8 coefficients
*              fills 13 output bytes.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYT0_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) {
    unsigned int i, j;
    uint32_t t[8];
    DBENCH_START();

    for (i = 0; i < N / 8; ++i) {
        uint8_t *out = &r[13 * i];

        /* Map the 8 coefficients of this group to their non-negative form. */
        for (j = 0; j < 8; ++j) {
            t[j] = (1 << (D - 1)) - a->coeffs[8 * i + j];
        }

        /* Bytes shared by two fields combine the tail of one with the head
         * of the next; the store keeps only the low 8 bits. */
        out[0]  = t[0];
        out[1]  = (t[0] >> 8) | (t[1] << 5);
        out[2]  = t[1] >> 3;
        out[3]  = (t[1] >> 11) | (t[2] << 2);
        out[4]  = (t[2] >> 6) | (t[3] << 7);
        out[5]  = t[3] >> 1;
        out[6]  = (t[3] >> 9) | (t[4] << 4);
        out[7]  = t[4] >> 4;
        out[8]  = (t[4] >> 12) | (t[5] << 1);
        out[9]  = (t[5] >> 7) | (t[6] << 6);
        out[10] = t[6] >> 2;
        out[11] = (t[6] >> 10) | (t[7] << 3);
        out[12] = t[7] >> 5;
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack
*
* Description: Unpack polynomial t0 with coefficients in
*              ]-2^{D-1}, 2^{D-1}]. Inverse of the 13-bit packing: each
*              group of 13 input bytes yields 8 fields v, and the
*              coefficient is 2^{D-1} - v.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) {
    unsigned int i, j;
    uint32_t t[8];
    DBENCH_START();

    for (i = 0; i < N / 8; ++i) {
        const uint8_t *in = &a[13 * i];

        /* Gather the raw bits of each field; bits above bit 12 are junk
         * from the neighbouring field and are masked off below. */
        t[0] = in[0] | ((uint32_t)in[1] << 8);
        t[1] = (in[1] >> 5) | ((uint32_t)in[2] << 3) | ((uint32_t)in[3] << 11);
        t[2] = (in[3] >> 2) | ((uint32_t)in[4] << 6);
        t[3] = (in[4] >> 7) | ((uint32_t)in[5] << 1) | ((uint32_t)in[6] << 9);
        t[4] = (in[6] >> 4) | ((uint32_t)in[7] << 4) | ((uint32_t)in[8] << 12);
        t[5] = (in[8] >> 1) | ((uint32_t)in[9] << 7);
        t[6] = (in[9] >> 6) | ((uint32_t)in[10] << 2) | ((uint32_t)in[11] << 10);
        t[7] = (in[11] >> 3) | ((uint32_t)in[12] << 5);

        for (j = 0; j < 8; ++j) {
            r->coeffs[8 * i + j] = (1 << (D - 1)) - (int32_t)(t[j] & 0x1FFF);
        }
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack
*
* Description: Bit-pack polynomial with coefficients
*              in [-(GAMMA1 - 1), GAMMA1]. Every coefficient x is stored
*              as GAMMA1 - x in 18 bits, so each group of 4 coefficients
*              fills 9 output bytes.
*
* Arguments:   - uint8_t *r: pointer to output byte array with at least
*                            POLYZ_PACKEDBYTES bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) {
    unsigned int i, j;
    uint32_t t[4];
    DBENCH_START();

    for (i = 0; i < N / 4; ++i) {
        uint8_t *out = &r[9 * i];

        /* Map the 4 coefficients of this group to their non-negative form. */
        for (j = 0; j < 4; ++j) {
            t[j] = GAMMA1 - a->coeffs[4 * i + j];
        }

        /* Bytes shared by two fields combine the tail of one with the head
         * of the next; the store keeps only the low 8 bits. */
        out[0] = t[0];
        out[1] = t[0] >> 8;
        out[2] = (t[0] >> 16) | (t[1] << 2);
        out[3] = t[1] >> 6;
        out[4] = (t[1] >> 14) | (t[2] << 4);
        out[5] = t[2] >> 4;
        out[6] = (t[2] >> 12) | (t[3] << 6);
        out[7] = t[3] >> 2;
        out[8] = t[3] >> 10;
    }

    DBENCH_STOP(*tpack);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack
*
* Description: Unpack polynomial z with coefficients
* in [-(GAMMA1 - 1), GAMMA1]. Vectorized inverse of the 18-bit
* packing: every pass turns 18 input bytes into 8
* coefficients. Each pass loads a full 32 bytes starting at
* a[18*i], which is why the input array is declared with 14
* bytes of slack.
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 14]) {
unsigned int i;
__m256i f;
/* Per 128-bit half: gathers the 3 bytes that hold each 18-bit field into the low bytes of a
 * 32-bit lane (index -1 writes a zero byte). The upper half uses indices shifted by one
 * because its data starts at byte 8 of the load while field 4 starts at byte 9. */
const __m256i shufbidx = _mm256_set_epi8(-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1,
-1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0);
/* Bit offset of each field inside its gathered 3 bytes. */
const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0);
/* Keeps the low 18 bits of each lane. */
const __m256i mask = _mm256_set1_epi32(0x3FFFF);
const __m256i gamma1 = _mm256_set1_epi32(GAMMA1);
DBENCH_START();

for (i = 0; i < N / 8; i++) {
/* Unaligned load: the 18-byte stride is not a multiple of 32. */
f = _mm256_loadu_si256((__m256i *)&a[18 * i]);
/* 0x94 selects 64-bit words (0,1,1,2): bytes 0-15 in the low half, bytes 8-23 in the high half. */
f = _mm256_permute4x64_epi64(f, 0x94);
f = _mm256_shuffle_epi8(f, shufbidx);
f = _mm256_srlv_epi32(f, srlvdidx);
f = _mm256_and_si256(f, mask);
/* Stored value v maps back to the coefficient GAMMA1 - v. */
f = _mm256_sub_epi32(gamma1, f);
_mm256_store_si256(&r->vec[i], f);
}

DBENCH_STOP(*tpack);
}


/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack
*
* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
* Input coefficients are assumed to be positive standard representatives.
* This implementation packs 6 bits per coefficient: every pass
* turns 32 coefficients into 24 bytes. Each pass stores a full
* 32 bytes starting at r[24*i], which is why the output array
* is declared with 8 bytes of slack.
*
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLYW1_PACKEDBYTES + 8 bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *restrict a) {
unsigned int i;
__m256i f0, f1, f2, f3;
/* Byte-pair weights (1, 64): merges two 6-bit values into one 12-bit value per 16-bit lane. */
const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1);
/* Word-pair weights (1, 4096): merges two 12-bit values into one 24-bit value per 32-bit lane. */
const __m256i shift2 = _mm256_set1_epi32((4096 << 16) + 1);
/* Interleaves the 32-bit lanes of the two halves; presumably this restores coefficient order
 * after the lane-wise pack instructions - TODO confirm. */
const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
/* Moves the three used 32-bit lanes of each half next to each other at the bottom. */
const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0);
/* Per 128-bit half: keeps the low 3 bytes of every 32-bit lane and closes the gaps. */
const __m256i shufbidx = _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0,
-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
DBENCH_START();

for (i = 0; i < N / 32; i++) {
f0 = _mm256_load_si256(&a->vec[4 * i + 0]);
f1 = _mm256_load_si256(&a->vec[4 * i + 1]);
f2 = _mm256_load_si256(&a->vec[4 * i + 2]);
f3 = _mm256_load_si256(&a->vec[4 * i + 3]);
/* Narrow 32 coefficients from 32-bit lanes to bytes (two saturating pack steps). */
f0 = _mm256_packus_epi32(f0, f1);
f1 = _mm256_packus_epi32(f2, f3);
f0 = _mm256_packus_epi16(f0, f1);
/* Merge 4 neighbouring bytes into one 24-bit group per 32-bit lane. */
f0 = _mm256_maddubs_epi16(f0, shift1);
f0 = _mm256_madd_epi16(f0, shift2);
/* Reorder lanes, drop the unused top byte of each lane, and compact to 24 bytes. */
f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1);
f0 = _mm256_shuffle_epi8(f0, shufbidx);
f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2);
/* Unaligned 32-byte store; the top 8 bytes are overwritten by the next pass (or land in the slack). */
_mm256_storeu_si256((__m256i *)&r[24 * i], f0);
}

DBENCH_STOP(*tpack);
}

+ 52
- 0
crypto_sign/dilithium2aes/avx2/poly.h Переглянути файл

@@ -0,0 +1,52 @@
/* Interface of the polynomial layer of the Dilithium2-AES AVX2 implementation. */
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_POLY_H
#define PQCLEAN_DILITHIUM2AES_AVX2_POLY_H
#include "align.h"
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* A polynomial: N coefficients in the aligned container declared by ALIGNED_INT32. */
typedef ALIGNED_INT32(N) poly;

/* Coefficient reduction. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(poly *a);

/* Coefficient-wise arithmetic without modular reduction. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(poly *a);

/* NTT-domain operations. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);

/* Rounding and hints. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h);

/* Norm check and sampling. The *_preinit variants take an already initialized stream state;
 * the others build the state from seed and nonce themselves. */
int PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(const poly *a, int32_t B);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce);
void PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);


/* Bit-packing of polynomials with coefficients in [-ETA,ETA]. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]);

/* Bit-packing of t1. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]);

/* Bit-packing of t0. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]);

/* Bit-packing of z. The unpack input carries 14 bytes of slack for the vectorized loads. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a);
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]);

/* Bit-packing of w1. The output carries 8 bytes of slack for the vectorized stores. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a);

#endif

+ 449
- 0
crypto_sign/dilithium2aes/avx2/polyvec.c Переглянути файл

@@ -0,0 +1,449 @@
#include "aes256ctr.h"
#include "consts.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

#define UNUSED(x) (void)x

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
* random coefficients a_{i,j} by performing rejection
* sampling on the output stream of an AES256-CTR context that
* is initialized once from rho; for every entry the context's
* nonce field is overwritten with (i << 8) + j before
* sampling. Each sampled polynomial is then passed through
* PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack.
*
* Arguments: - polyvecl mat[K]: output matrix
* - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
unsigned int i, j;
uint64_t nonce;
aes256ctr_ctx state;

/* Initialize once with nonce 0; presumably this is where the key derived from rho is set up,
 * so it is not repeated per matrix entry - TODO confirm against aes256ctr.c. */
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&state, rho, 0);

for (i = 0; i < K; i++) {
for (j = 0; j < L; j++) {
/* Row index in bits 8 and up, column index in the low byte. */
nonce = (i << 8) + j;
/* Loads the 64-bit nonce into the low half of the field and zeroes the upper half. */
state.n = _mm_loadl_epi64((__m128i *)&nonce);
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&mat[i].vec[j], &state);
PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&mat[i].vec[j]);
}
}
}


/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery
*
* Description: Matrix-vector product: row i of mat is combined with v by
*              polyvecl_pointwise_acc_montgomery() and stored in t->vec[i].
*
* Arguments:   - polyveck *t: output vector (one polynomial per matrix row)
*              - const polyvecl mat[K]: input matrix with K rows
*              - const polyvecl *v: input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    for (unsigned int row = 0; row < K; ++row) {
        poly *out = &t->vec[row];

        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(out, &mat[row], v);
    }
}

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_eta
*
* Description: Fill every polynomial of a length-L vector with
*              poly_uniform_eta(), using consecutive nonces
*              nonce, nonce+1, ..., nonce+L-1 (wrapping as uint16_t).
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: seed passed on to poly_uniform_eta()
*              - uint16_t nonce: nonce used for the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < L; ++k) {
        /* Truncation to 16 bits reproduces the wrap-around of a uint16_t counter. */
        PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(&v->vec[k], seed, (uint16_t)(nonce + k));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1
*
* Description: Fill every polynomial of a length-L vector with
*              poly_uniform_gamma1(). Polynomial i is sampled with the
*              16-bit nonce L*nonce + i.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: seed passed on to poly_uniform_gamma1()
*              - uint16_t nonce: base nonce
*
* NOTE(review): poly_uniform_gamma1() is declared with seed[CRHBYTES] while
*               this wrapper (and its prototype) say seed[SEEDBYTES]; confirm
*               which length callers actually provide.
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < L; ++k) {
        /* Same value the implicit argument conversion would produce. */
        const uint16_t poly_nonce = (uint16_t)(L * nonce + k);

        PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(&v->vec[k], seed, poly_nonce);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce
*
* Description: Apply poly_reduce() to every polynomial in a vector of
*              length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + L;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze
*
* Description: Apply poly_freeze() to every polynomial in a vector of
*              length L, reducing coefficients to standard representatives.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + L;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add
*
* Description: Component-wise addition of two length-L vectors of
*              polynomials via poly_add(). No modular reduction is performed.
*
* Arguments:   - polyvecl *w: pointer to output vector
*              - const polyvecl *u: pointer to first summand
*              - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
    for (unsigned int k = 0; k < L; ++k) {
        const poly *lhs = &u->vec[k];
        const poly *rhs = &v->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&w->vec[k], lhs, rhs);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt
*
* Description: Forward NTT (poly_ntt()) of every polynomial in a vector of
*              length L. Output coefficients can be up to 16*Q larger than
*              input coefficients.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + L;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont
*
* Description: Apply poly_invntt_tomont() to every polynomial in a vector
*              of length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + L;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-L vector by the single
*              polynomial a using poly_pointwise_montgomery().
*
* Arguments:   - polyvecl *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    for (unsigned int k = 0; k < L; ++k) {
        poly *out = &r->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(out, a, &v->vec[k]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
*              resulting vector by 2^{-32} and add (accumulate) polynomials
*              in it. Input/output vectors are in NTT domain representation.
*              The work is delegated to pointwise_acc_avx(), which receives
*              the coefficient storage of the first polynomial of each input
*              vector together with the shared constant table qdata.
*
* Arguments:   - poly *w: output polynomial
*              - const polyvecl *u: pointer to first input vector
*              - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) {
    PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx(w->vec,
            u->vec[0].vec,
            v->vec[0].vec,
            PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce().
*
* Arguments: - const polyvecl *v: pointer to vector
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
unsigned int i;

for (i = 0; i < L; ++i) {
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
}

return 0;
}

/**************************************************************/
/************ Vectors of polynomials of length K **************/
/**************************************************************/

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_uniform_eta
*
* Description: Fill every polynomial of a length-K vector with
*              poly_uniform_eta(), using consecutive nonces
*              nonce, nonce+1, ..., nonce+K-1 (wrapping as uint16_t).
*
* Arguments:   - polyveck *v: pointer to output vector
*              - const uint8_t seed[]: seed passed on to poly_uniform_eta()
*              - uint16_t nonce: nonce used for the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    for (unsigned int k = 0; k < K; ++k) {
        /* Truncation to 16 bits reproduces the wrap-around of a uint16_t counter. */
        PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(&v->vec[k], seed, (uint16_t)(nonce + k));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce
*
* Description: Apply poly_reduce() to every polynomial in a vector of
*              length K, reducing coefficients to representatives in
*              [-6283009,6283007].
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq
*
* Description: Apply poly_caddq() to every polynomial in a vector of
*              length K: add Q to each coefficient that is negative.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze
*
* Description: Apply poly_freeze() to every polynomial in a vector of
*              length K, reducing coefficients to standard representatives.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add
*
* Description: Component-wise addition of two length-K vectors of
*              polynomials via poly_add(). No modular reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first summand
*              - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
    for (unsigned int k = 0; k < K; ++k) {
        const poly *lhs = &u->vec[k];
        const poly *rhs = &v->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&w->vec[k], lhs, rhs);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub
*
* Description: Component-wise subtraction of two length-K vectors of
*              polynomials via poly_sub(). No modular reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first input vector
*              - const polyveck *v: pointer to second input vector to be
*                                   subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
    for (unsigned int k = 0; k < K; ++k) {
        const poly *minuend = &u->vec[k];
        const poly *subtrahend = &v->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&w->vec[k], minuend, subtrahend);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl
*
* Description: Apply poly_shiftl() to every polynomial in a vector of
*              length K: multiply by 2^D without modular reduction.
*              Assumes input coefficients to be less than 2^{31-D}.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt
*
* Description: Forward NTT (poly_ntt()) of every polynomial in a vector of
*              length K. Output coefficients can be up to 16*Q larger than
*              input coefficients.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont
*
* Description: Inverse NTT and multiplication by 2^{32} (poly_invntt_tomont())
*              of every polynomial in a vector of length K. Input
*              coefficients need to be less than 2*Q.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(polyveck *v) {
    poly *cur = v->vec;
    poly *const end = v->vec + K;

    while (cur != end) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-K vector by the single
*              polynomial a using poly_pointwise_montgomery().
*
* Arguments:   - polyveck *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    for (unsigned int k = 0; k < K; ++k) {
        poly *out = &r->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(out, a, &v->vec[k]);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce().
*
* Arguments: - const polyveck *v: pointer to vector
* - int32_t B: norm bound
*
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8
* and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) {
unsigned int i;

for (i = 0; i < K; ++i) {
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&v->vec[i], bound)) {
return 1;
}
}

return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
*              standard representatives. Delegates to poly_power2round()
*              one polynomial at a time.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
    for (unsigned int k = 0; k < K; ++k) {
        poly *high = &v1->vec[k];
        poly *low = &v0->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(high, low, &v->vec[k]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute high and low bits a0, a1 such that
*              a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2,
*              except when a1 = (Q-1)/ALPHA, where a1 is set to 0 and
*              -ALPHA/2 <= a0 = a mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*              Delegates to poly_decompose() one polynomial at a time.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
    for (unsigned int k = 0; k < K; ++k) {
        poly *high = &v1->vec[k];
        poly *low = &v0->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(high, low, &v->vec[k]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint
*
* Description: Compute hint vector. poly_make_hint() is called once per
*              polynomial; each call writes at the current end of the hint
*              array and its return value advances that end.
*
* Arguments:   - uint8_t *hint: pointer to output hint array
*              - const polyveck *v0: pointer to low part of input vector
*              - const polyveck *v1: pointer to high part of input vector
*
* Returns number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) {
    unsigned int total = 0;

    for (unsigned int k = 0; k < K; ++k) {
        uint8_t *dst = hint + total;

        total += PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(dst, &v0->vec[k], &v1->vec[k]);
    }

    return total;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector.
*              Delegates to poly_use_hint() one polynomial at a time.
*
* Arguments:   - polyveck *w: pointer to output vector of polynomials with
*                             corrected high bits
*              - const polyveck *u: pointer to input vector
*              - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
    for (unsigned int k = 0; k < K; ++k) {
        const poly *in = &u->vec[k];
        const poly *hint = &h->vec[k];

        PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(&w->vec[k], in, hint);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1
*
* Description: Pack a length-K vector with polyw1_pack(); polynomial i is
*              written at byte offset i*POLYW1_PACKEDBYTES of r.
*
* Arguments:   - uint8_t r[]: output byte array
*              - const polyveck *w1: pointer to input vector
*
* NOTE(review): polyw1_pack() is declared with an output of
*               POLYW1_PACKEDBYTES + 8 bytes; if it really writes that much,
*               the last call needs 8 bytes of slack beyond
*               K*POLYW1_PACKEDBYTES. Confirm against callers.
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) {
    for (unsigned int k = 0; k < K; ++k) {
        uint8_t *dst = r + k * POLYW1_PACKEDBYTES;

        PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(dst, &w1->vec[k]);
    }
}

+ 64
- 0
crypto_sign/dilithium2aes/avx2/polyvec.h Переглянути файл

@@ -0,0 +1,64 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_POLYVEC_H
#define PQCLEAN_DILITHIUM2AES_AVX2_POLYVEC_H
/* Vectors (length L and length K) of polynomials and the matrix helpers
 * built on them. L and K come from params.h, poly from poly.h. */
#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
poly vec[L];
} polyvecl;

/* Sampling: fill each polynomial from a seed and a 16-bit nonce. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Coefficient reduction, applied to every polynomial in place. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(polyvecl *v);

void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze(polyvecl *v);

/* w = u + v component-wise, without modular reduction. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

/* NTT-domain transforms and multiplications. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont(polyvecl *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
/* Inner product of u and v accumulated into the single polynomial w. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w,
const polyvecl *u,
const polyvecl *v);

/* Returns 0 if every polynomial passes poly_chknorm() against B, 1 otherwise. */
int PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B);

/* Vectors of polynomials of length K */
typedef struct {
poly vec[K];
} polyveck;

/* Sampling: fill each polynomial from a seed and consecutive 16-bit nonces. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Coefficient reduction, applied to every polynomial in place. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(polyveck *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze(polyveck *v);

/* Component-wise arithmetic without modular reduction. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl(polyveck *v);

/* NTT-domain transforms and multiplications. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(polyveck *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);

/* Returns 0 if every polynomial passes poly_chknorm() against B, 1 otherwise. */
int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t B);

/* Rounding and hint handling, applied polynomial by polynomial. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
/* Returns the total number of hint entries written to hint. */
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1);
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);

/* Packs polynomial i of w1 at byte offset i*POLYW1_PACKEDBYTES of r. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1);

/* Expands the K x L matrix from the seed rho. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);


/* t = mat * v, one polyvecl_pointwise_acc_montgomery() call per row. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);

#endif

+ 394
- 0
crypto_sign/dilithium2aes/avx2/rejsample.c Переглянути файл

@@ -0,0 +1,394 @@
#include "params.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>

const uint8_t PQCLEAN_DILITHIUM2AES_AVX2_idxlut[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 1, 2, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 0, 0, 0, 0, 0},
{ 3, 0, 0, 0, 0, 0, 0, 0},
{ 0, 3, 0, 0, 0, 0, 0, 0},
{ 1, 3, 0, 0, 0, 0, 0, 0},
{ 0, 1, 3, 0, 0, 0, 0, 0},
{ 2, 3, 0, 0, 0, 0, 0, 0},
{ 0, 2, 3, 0, 0, 0, 0, 0},
{ 1, 2, 3, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 1, 4, 0, 0, 0, 0, 0, 0},
{ 0, 1, 4, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 1, 2, 4, 0, 0, 0, 0, 0},
{ 0, 1, 2, 4, 0, 0, 0, 0},
{ 3, 4, 0, 0, 0, 0, 0, 0},
{ 0, 3, 4, 0, 0, 0, 0, 0},
{ 1, 3, 4, 0, 0, 0, 0, 0},
{ 0, 1, 3, 4, 0, 0, 0, 0},
{ 2, 3, 4, 0, 0, 0, 0, 0},
{ 0, 2, 3, 4, 0, 0, 0, 0},
{ 1, 2, 3, 4, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 0, 0, 0},
{ 5, 0, 0, 0, 0, 0, 0, 0},
{ 0, 5, 0, 0, 0, 0, 0, 0},
{ 1, 5, 0, 0, 0, 0, 0, 0},
{ 0, 1, 5, 0, 0, 0, 0, 0},
{ 2, 5, 0, 0, 0, 0, 0, 0},
{ 0, 2, 5, 0, 0, 0, 0, 0},
{ 1, 2, 5, 0, 0, 0, 0, 0},
{ 0, 1, 2, 5, 0, 0, 0, 0},
{ 3, 5, 0, 0, 0, 0, 0, 0},
{ 0, 3, 5, 0, 0, 0, 0, 0},
{ 1, 3, 5, 0, 0, 0, 0, 0},
{ 0, 1, 3, 5, 0, 0, 0, 0},
{ 2, 3, 5, 0, 0, 0, 0, 0},
{ 0, 2, 3, 5, 0, 0, 0, 0},
{ 1, 2, 3, 5, 0, 0, 0, 0},
{ 0, 1, 2, 3, 5, 0, 0, 0},
{ 4, 5, 0, 0, 0, 0, 0, 0},
{ 0, 4, 5, 0, 0, 0, 0, 0},
{ 1, 4, 5, 0, 0, 0, 0, 0},
{ 0, 1, 4, 5, 0, 0, 0, 0},
{ 2, 4, 5, 0, 0, 0, 0, 0},
{ 0, 2, 4, 5, 0, 0, 0, 0},
{ 1, 2, 4, 5, 0, 0, 0, 0},
{ 0, 1, 2, 4, 5, 0, 0, 0},
{ 3, 4, 5, 0, 0, 0, 0, 0},
{ 0, 3, 4, 5, 0, 0, 0, 0},
{ 1, 3, 4, 5, 0, 0, 0, 0},
{ 0, 1, 3, 4, 5, 0, 0, 0},
{ 2, 3, 4, 5, 0, 0, 0, 0},
{ 0, 2, 3, 4, 5, 0, 0, 0},
{ 1, 2, 3, 4, 5, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 1, 6, 0, 0, 0, 0, 0, 0},
{ 0, 1, 6, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 1, 2, 6, 0, 0, 0, 0, 0},
{ 0, 1, 2, 6, 0, 0, 0, 0},
{ 3, 6, 0, 0, 0, 0, 0, 0},
{ 0, 3, 6, 0, 0, 0, 0, 0},
{ 1, 3, 6, 0, 0, 0, 0, 0},
{ 0, 1, 3, 6, 0, 0, 0, 0},
{ 2, 3, 6, 0, 0, 0, 0, 0},
{ 0, 2, 3, 6, 0, 0, 0, 0},
{ 1, 2, 3, 6, 0, 0, 0, 0},
{ 0, 1, 2, 3, 6, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 1, 4, 6, 0, 0, 0, 0, 0},
{ 0, 1, 4, 6, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 1, 2, 4, 6, 0, 0, 0, 0},
{ 0, 1, 2, 4, 6, 0, 0, 0},
{ 3, 4, 6, 0, 0, 0, 0, 0},
{ 0, 3, 4, 6, 0, 0, 0, 0},
{ 1, 3, 4, 6, 0, 0, 0, 0},
{ 0, 1, 3, 4, 6, 0, 0, 0},
{ 2, 3, 4, 6, 0, 0, 0, 0},
{ 0, 2, 3, 4, 6, 0, 0, 0},
{ 1, 2, 3, 4, 6, 0, 0, 0},
{ 0, 1, 2, 3, 4, 6, 0, 0},
{ 5, 6, 0, 0, 0, 0, 0, 0},
{ 0, 5, 6, 0, 0, 0, 0, 0},
{ 1, 5, 6, 0, 0, 0, 0, 0},
{ 0, 1, 5, 6, 0, 0, 0, 0},
{ 2, 5, 6, 0, 0, 0, 0, 0},
{ 0, 2, 5, 6, 0, 0, 0, 0},
{ 1, 2, 5, 6, 0, 0, 0, 0},
{ 0, 1, 2, 5, 6, 0, 0, 0},
{ 3, 5, 6, 0, 0, 0, 0, 0},
{ 0, 3, 5, 6, 0, 0, 0, 0},
{ 1, 3, 5, 6, 0, 0, 0, 0},
{ 0, 1, 3, 5, 6, 0, 0, 0},
{ 2, 3, 5, 6, 0, 0, 0, 0},
{ 0, 2, 3, 5, 6, 0, 0, 0},
{ 1, 2, 3, 5, 6, 0, 0, 0},
{ 0, 1, 2, 3, 5, 6, 0, 0},
{ 4, 5, 6, 0, 0, 0, 0, 0},
{ 0, 4, 5, 6, 0, 0, 0, 0},
{ 1, 4, 5, 6, 0, 0, 0, 0},
{ 0, 1, 4, 5, 6, 0, 0, 0},
{ 2, 4, 5, 6, 0, 0, 0, 0},
{ 0, 2, 4, 5, 6, 0, 0, 0},
{ 1, 2, 4, 5, 6, 0, 0, 0},
{ 0, 1, 2, 4, 5, 6, 0, 0},
{ 3, 4, 5, 6, 0, 0, 0, 0},
{ 0, 3, 4, 5, 6, 0, 0, 0},
{ 1, 3, 4, 5, 6, 0, 0, 0},
{ 0, 1, 3, 4, 5, 6, 0, 0},
{ 2, 3, 4, 5, 6, 0, 0, 0},
{ 0, 2, 3, 4, 5, 6, 0, 0},
{ 1, 2, 3, 4, 5, 6, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 0},
{ 7, 0, 0, 0, 0, 0, 0, 0},
{ 0, 7, 0, 0, 0, 0, 0, 0},
{ 1, 7, 0, 0, 0, 0, 0, 0},
{ 0, 1, 7, 0, 0, 0, 0, 0},
{ 2, 7, 0, 0, 0, 0, 0, 0},
{ 0, 2, 7, 0, 0, 0, 0, 0},
{ 1, 2, 7, 0, 0, 0, 0, 0},
{ 0, 1, 2, 7, 0, 0, 0, 0},
{ 3, 7, 0, 0, 0, 0, 0, 0},
{ 0, 3, 7, 0, 0, 0, 0, 0},
{ 1, 3, 7, 0, 0, 0, 0, 0},
{ 0, 1, 3, 7, 0, 0, 0, 0},
{ 2, 3, 7, 0, 0, 0, 0, 0},
{ 0, 2, 3, 7, 0, 0, 0, 0},
{ 1, 2, 3, 7, 0, 0, 0, 0},
{ 0, 1, 2, 3, 7, 0, 0, 0},
{ 4, 7, 0, 0, 0, 0, 0, 0},
{ 0, 4, 7, 0, 0, 0, 0, 0},
{ 1, 4, 7, 0, 0, 0, 0, 0},
{ 0, 1, 4, 7, 0, 0, 0, 0},
{ 2, 4, 7, 0, 0, 0, 0, 0},
{ 0, 2, 4, 7, 0, 0, 0, 0},
{ 1, 2, 4, 7, 0, 0, 0, 0},
{ 0, 1, 2, 4, 7, 0, 0, 0},
{ 3, 4, 7, 0, 0, 0, 0, 0},
{ 0, 3, 4, 7, 0, 0, 0, 0},
{ 1, 3, 4, 7, 0, 0, 0, 0},
{ 0, 1, 3, 4, 7, 0, 0, 0},
{ 2, 3, 4, 7, 0, 0, 0, 0},
{ 0, 2, 3, 4, 7, 0, 0, 0},
{ 1, 2, 3, 4, 7, 0, 0, 0},
{ 0, 1, 2, 3, 4, 7, 0, 0},
{ 5, 7, 0, 0, 0, 0, 0, 0},
{ 0, 5, 7, 0, 0, 0, 0, 0},
{ 1, 5, 7, 0, 0, 0, 0, 0},
{ 0, 1, 5, 7, 0, 0, 0, 0},
{ 2, 5, 7, 0, 0, 0, 0, 0},
{ 0, 2, 5, 7, 0, 0, 0, 0},
{ 1, 2, 5, 7, 0, 0, 0, 0},
{ 0, 1, 2, 5, 7, 0, 0, 0},
{ 3, 5, 7, 0, 0, 0, 0, 0},
{ 0, 3, 5, 7, 0, 0, 0, 0},
{ 1, 3, 5, 7, 0, 0, 0, 0},
{ 0, 1, 3, 5, 7, 0, 0, 0},
{ 2, 3, 5, 7, 0, 0, 0, 0},
{ 0, 2, 3, 5, 7, 0, 0, 0},
{ 1, 2, 3, 5, 7, 0, 0, 0},
{ 0, 1, 2, 3, 5, 7, 0, 0},
{ 4, 5, 7, 0, 0, 0, 0, 0},
{ 0, 4, 5, 7, 0, 0, 0, 0},
{ 1, 4, 5, 7, 0, 0, 0, 0},
{ 0, 1, 4, 5, 7, 0, 0, 0},
{ 2, 4, 5, 7, 0, 0, 0, 0},
{ 0, 2, 4, 5, 7, 0, 0, 0},
{ 1, 2, 4, 5, 7, 0, 0, 0},
{ 0, 1, 2, 4, 5, 7, 0, 0},
{ 3, 4, 5, 7, 0, 0, 0, 0},
{ 0, 3, 4, 5, 7, 0, 0, 0},
{ 1, 3, 4, 5, 7, 0, 0, 0},
{ 0, 1, 3, 4, 5, 7, 0, 0},
{ 2, 3, 4, 5, 7, 0, 0, 0},
{ 0, 2, 3, 4, 5, 7, 0, 0},
{ 1, 2, 3, 4, 5, 7, 0, 0},
{ 0, 1, 2, 3, 4, 5, 7, 0},
{ 6, 7, 0, 0, 0, 0, 0, 0},
{ 0, 6, 7, 0, 0, 0, 0, 0},
{ 1, 6, 7, 0, 0, 0, 0, 0},
{ 0, 1, 6, 7, 0, 0, 0, 0},
{ 2, 6, 7, 0, 0, 0, 0, 0},
{ 0, 2, 6, 7, 0, 0, 0, 0},
{ 1, 2, 6, 7, 0, 0, 0, 0},
{ 0, 1, 2, 6, 7, 0, 0, 0},
{ 3, 6, 7, 0, 0, 0, 0, 0},
{ 0, 3, 6, 7, 0, 0, 0, 0},
{ 1, 3, 6, 7, 0, 0, 0, 0},
{ 0, 1, 3, 6, 7, 0, 0, 0},
{ 2, 3, 6, 7, 0, 0, 0, 0},
{ 0, 2, 3, 6, 7, 0, 0, 0},
{ 1, 2, 3, 6, 7, 0, 0, 0},
{ 0, 1, 2, 3, 6, 7, 0, 0},
{ 4, 6, 7, 0, 0, 0, 0, 0},
{ 0, 4, 6, 7, 0, 0, 0, 0},
{ 1, 4, 6, 7, 0, 0, 0, 0},
{ 0, 1, 4, 6, 7, 0, 0, 0},
{ 2, 4, 6, 7, 0, 0, 0, 0},
{ 0, 2, 4, 6, 7, 0, 0, 0},
{ 1, 2, 4, 6, 7, 0, 0, 0},
{ 0, 1, 2, 4, 6, 7, 0, 0},
{ 3, 4, 6, 7, 0, 0, 0, 0},
{ 0, 3, 4, 6, 7, 0, 0, 0},
{ 1, 3, 4, 6, 7, 0, 0, 0},
{ 0, 1, 3, 4, 6, 7, 0, 0},
{ 2, 3, 4, 6, 7, 0, 0, 0},
{ 0, 2, 3, 4, 6, 7, 0, 0},
{ 1, 2, 3, 4, 6, 7, 0, 0},
{ 0, 1, 2, 3, 4, 6, 7, 0},
{ 5, 6, 7, 0, 0, 0, 0, 0},
{ 0, 5, 6, 7, 0, 0, 0, 0},
{ 1, 5, 6, 7, 0, 0, 0, 0},
{ 0, 1, 5, 6, 7, 0, 0, 0},
{ 2, 5, 6, 7, 0, 0, 0, 0},
{ 0, 2, 5, 6, 7, 0, 0, 0},
{ 1, 2, 5, 6, 7, 0, 0, 0},
{ 0, 1, 2, 5, 6, 7, 0, 0},
{ 3, 5, 6, 7, 0, 0, 0, 0},
{ 0, 3, 5, 6, 7, 0, 0, 0},
{ 1, 3, 5, 6, 7, 0, 0, 0},
{ 0, 1, 3, 5, 6, 7, 0, 0},
{ 2, 3, 5, 6, 7, 0, 0, 0},
{ 0, 2, 3, 5, 6, 7, 0, 0},
{ 1, 2, 3, 5, 6, 7, 0, 0},
{ 0, 1, 2, 3, 5, 6, 7, 0},
{ 4, 5, 6, 7, 0, 0, 0, 0},
{ 0, 4, 5, 6, 7, 0, 0, 0},
{ 1, 4, 5, 6, 7, 0, 0, 0},
{ 0, 1, 4, 5, 6, 7, 0, 0},
{ 2, 4, 5, 6, 7, 0, 0, 0},
{ 0, 2, 4, 5, 6, 7, 0, 0},
{ 1, 2, 4, 5, 6, 7, 0, 0},
{ 0, 1, 2, 4, 5, 6, 7, 0},
{ 3, 4, 5, 6, 7, 0, 0, 0},
{ 0, 3, 4, 5, 6, 7, 0, 0},
{ 1, 3, 4, 5, 6, 7, 0, 0},
{ 0, 1, 3, 4, 5, 6, 7, 0},
{ 2, 3, 4, 5, 6, 7, 0, 0},
{ 0, 2, 3, 4, 5, 6, 7, 0},
{ 1, 2, 3, 4, 5, 6, 7, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7}
};

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx
*
* Description: Vectorised rejection sampling. The buffer is consumed 24
*              bytes at a time; each 3-byte group is expanded to a 32-bit
*              lane, masked to 23 bits and accepted if it is smaller than Q.
*              Accepted values are compacted to the front of the vector and
*              stored consecutively in r.
*
* Arguments:   - int32_t *r: output array of accepted values
*              - const uint8_t buf[]: input bytes. Every iteration loads 32
*                bytes starting at a position <= REJ_UNIFORM_BUFLEN - 24,
*                which is why 8 bytes of padding are required.
*
* Returns number of accepted values written to r.
*
* NOTE(review): the loop does not check ctr against the capacity of r. It
*               runs REJ_UNIFORM_BUFLEN/24 times (integer division), always
*               stores 8 lanes at r[ctr] and advances ctr by at most 8, so r
*               must hold 8*(REJ_UNIFORM_BUFLEN/24) entries. Confirm this
*               matches the callers' buffers.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) {
    unsigned int ctr, pos;
    uint32_t good;
    __m256i d, tmp;
    const __m256i bound = _mm256_set1_epi32(Q);
    const __m256i mask = _mm256_set1_epi32(0x7FFFFF);
    /* Per 128-bit lane: gather four 3-byte groups into four 32-bit lanes;
     * index -1 zeroes the top byte of each lane. */
    const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10,
                                         -1, 9, 8, 7, -1, 6, 5, 4,
                                         -1, 11, 10, 9, -1, 8, 7, 6,
                                         -1, 5, 4, 3, -1, 2, 1, 0);

    ctr = pos = 0;
    while (pos <= REJ_UNIFORM_BUFLEN - 24) {
        d = _mm256_loadu_si256((const __m256i *)&buf[pos]);
        /* 0x94 selects qwords 0,1,1,2: low lane = bytes 0..15, high lane = bytes 8..23. */
        d = _mm256_permute4x64_epi64(d, 0x94);
        d = _mm256_shuffle_epi8(d, idx8);
        d = _mm256_and_si256(d, mask);
        pos += 24;

        /* d - Q is negative exactly when d < Q; the sign bits form the accept mask. */
        tmp = _mm256_sub_epi32(d, bound);
        /* Portable reinterpretation instead of the GNU-only (__m256) vector cast. */
        good = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(tmp));
        /* idxlut[good] lists the accepted lane indices in ascending order. */
        tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good]));
        d = _mm256_permutevar8x32_epi32(d, tmp);

        /* All 8 lanes are stored; only the first popcount(good) are kept. */
        _mm256_storeu_si256((__m256i *)&r[ctr], d);
        ctr += _mm_popcnt_u32(good);
    }

    return ctr;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx
*
* Description: Vectorised rejection sampling of small coefficients from
*              4-bit nibbles. A nibble is accepted if it is smaller than 15.
*              The vector path processes 16 input bytes (32 nibbles) per
*              outer iteration in four groups of 8 nibbles; a scalar loop
*              handles whatever is left.
*
* Arguments:   - int32_t *r: output array (the loop guards are written for
*                            N entries)
*              - const uint8_t buf[]: REJ_UNIFORM_ETA_BUFLEN input bytes
*
* Returns number of coefficients written to r (at most N).
**************************************************/
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) {
unsigned int ctr, pos;
uint32_t good;
__m256i f0, f1, f2;
__m128i g0, g1;
const __m256i mask = _mm256_set1_epi8(15);
const __m256i eta = _mm256_set1_epi8(ETA);
const __m256i bound = mask;
/* v and p drive the reduction below: f1 + 5 * round(f1 * -6560 / 2^15). */
const __m256i v = _mm256_set1_epi32(-6560);
const __m256i p = _mm256_set1_epi32(5);

ctr = pos = 0;
/* Each group stores 8 lanes at r[ctr], hence the ctr <= N - 8 guard. */
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
/* Split 16 bytes into 32 nibbles, one per byte: low nibble in the even
 * byte, high nibble in the odd byte of each 16-bit lane. */
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos]));
f1 = _mm256_slli_epi16(f0, 4);
f0 = _mm256_or_si256(f0, f1);
f0 = _mm256_and_si256(f0, mask);

/* nibble - 15 is negative exactly when nibble < 15: sign bits = accept mask. */
f1 = _mm256_sub_epi8(f0, bound);
/* Candidate value ETA - nibble, still one byte per nibble. */
f0 = _mm256_sub_epi8(eta, f0);
good = _mm256_movemask_epi8(f1);

/* Group 1: nibbles 0..7 (low 8 bytes of the low 128-bit half). */
g0 = _mm256_castsi256_si128(f0);
/* idxlut row = positions of accepted bytes; the shuffle compacts them. */
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
/* Shift each value by a multiple of 5 (see v and p above). */
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
/* 8 nibbles = 4 input bytes consumed. */
pos += 4;

if (ctr > N - 8) {
break;
}
/* Group 2: nibbles 8..15 (high 8 bytes of the low 128-bit half). */
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

if (ctr > N - 8) {
break;
}
/* Group 3: nibbles 16..23 (low 8 bytes of the high 128-bit half). */
g0 = _mm256_extracti128_si256(f0, 1);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;

if (ctr > N - 8) {
break;
}
/* Group 4: nibbles 24..31. After three shifts good holds only the top
 * 8 mask bits, so no "& 0xFF" is needed. */
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good);
pos += 4;
}

/* Scalar tail: one byte (two nibbles) at a time until r is full or the
 * buffer is exhausted. */
uint32_t t0, t1;
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) {
t0 = buf[pos] & 0x0F;
t1 = buf[pos++] >> 4;

if (t0 < 15) {
/* (205 * t) >> 10 equals t / 5 for t < 15, so this is t mod 5. */
t0 = t0 - (205 * t0 >> 10) * 5;
r[ctr++] = 2 - t0;
}
if (t1 < 15 && ctr < N) {
t1 = t1 - (205 * t1 >> 10) * 5;
r[ctr++] = 2 - t1;
}
}

return ctr;
}

+ 19
- 0
crypto_sign/dilithium2aes/avx2/rejsample.h Переглянути файл

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_REJSAMPLE_H
#define PQCLEAN_DILITHIUM2AES_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* Buffer for uniform sampling: at least 768 bytes, rounded up to a whole
 * number of STREAM128 blocks. */
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)

/* Buffer for eta sampling: at least 137 bytes, rounded up to a whole
 * number of STREAM128 blocks. */
#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)

/* Row m lists, in ascending order, the positions of the bits set in m
 * (zero padded); used to compact accepted vector lanes. */
extern const uint8_t PQCLEAN_DILITHIUM2AES_AVX2_idxlut[256][8];

/* buf needs 8 bytes of padding because the vector loop loads 32 bytes at
 * positions up to REJ_UNIFORM_BUFLEN - 24. Returns the number of values
 * written to r. */
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);

/* The bound matches the definition in rejsample.c, which reads at most
 * REJ_UNIFORM_ETA_BUFLEN bytes. Returns the number of values written to r. */
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]);

#endif

+ 157
- 0
crypto_sign/dilithium2aes/avx2/rounding.c Переглянути файл

@@ -0,0 +1,157 @@
#include "consts.h"
#include "params.h"
#include "rejsample.h"
#include "rounding.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* Per-32-bit-lane select built on _mm256_blendv_ps: a lane of the result
 * comes from b where the sign bit of the corresponding mask lane is set,
 * and from a otherwise. The casts only reinterpret the bits. */
#define _mm256_blendv_epi32(a,b,mask) \
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
_mm256_castsi256_ps(b), \
_mm256_castsi256_ps(mask)))

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx
*
* Description: For finite field elements a, compute a0, a1 such that
*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
*              Assumes a to be positive standard representative.
*              Eight coefficients are processed per iteration; all three
*              arrays are accessed with aligned 256-bit loads/stores and
*              must therefore be 32-byte aligned.
*
* Arguments:   - __m256i *a1: output array of length N/8 with high bits
*              - __m256i *a0: output array of length N/8 with low bits a0
*              - const __m256i *a: input array of length N/8
*
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
unsigned int i;
__m256i f, f0, f1;
/* mask clears the low D bits; half = 2^(D-1) - 1 is the rounding offset. */
const __m256i mask = _mm256_set1_epi32(-(1 << D));
const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1);

for (i = 0; i < N / 8; ++i) {
f = _mm256_load_si256(&a[i]);
/* f1 = a + half; its bits above D are a1. */
f1 = _mm256_add_epi32(f, half);
/* f0 = a1 * 2^D (f1 with the low D bits cleared). */
f0 = _mm256_and_si256(f1, mask);
f1 = _mm256_srli_epi32(f1, D);
/* a0 = a - a1 * 2^D. */
f0 = _mm256_sub_epi32(f, f0);
_mm256_store_si256(&a1[i], f1);
_mm256_store_si256(&a0[i], f0);
}
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx
*
* Description: For finite field element a, compute high and low parts a0, a1
*              such that a mod^+ Q = a1*ALPHA + a0 with
*              -ALPHA/2 < a0 <= ALPHA/2, except if a1 = (Q-1)/ALPHA, where
*              a1 is set to 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
*              Assumes a to be positive standard representative.
*              Here ALPHA = 2*GAMMA2. All arrays are accessed with aligned
*              256-bit loads/stores.
*
* Arguments:   - __m256i *a1: output array of length N/8 with high parts
*              - __m256i *a0: output array of length N/8 with low parts a0
*              - const __m256i *a: input array of length N/8
*
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
unsigned int i;
__m256i f, f0, f1, t;
/* presumably eight copies of Q from the shared constant table -- TODO confirm */
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]);
const __m256i hq = _mm256_srli_epi32(q, 1);
const __m256i v = _mm256_set1_epi32(11275);
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2);
const __m256i off = _mm256_set1_epi32(127);
const __m256i shift = _mm256_set1_epi32(128);
const __m256i max = _mm256_set1_epi32(43);
const __m256i zero = _mm256_setzero_si256();

for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a[i]);
/* Division by ALPHA done with multiplies and shifts:
 * f1 = ceil(a / 2^7), then high 16 bits of f1 * 11275, then a rounded
 * shift right by 8 (mulhrs with 128 computes (x*128 + 2^14) >> 15). */
f1 = _mm256_add_epi32(f, off);
f1 = _mm256_srli_epi32(f1, 7);
f1 = _mm256_mulhi_epu16(f1, v);
f1 = _mm256_mulhrs_epi16(f1, shift);
/* 43 - f1 is negative only when f1 > 43; those lanes wrap to 0. */
t = _mm256_sub_epi32(max, f1);
f1 = _mm256_blendv_epi32(f1, zero, t);
/* a0 = a - a1 * ALPHA ... */
f0 = _mm256_mullo_epi32(f1, alpha);
f0 = _mm256_sub_epi32(f, f0);
/* ... minus Q where the result exceeds Q >> 1, centring it around zero. */
f = _mm256_cmpgt_epi32(f0, hq);
f = _mm256_and_si256(f, q);
f0 = _mm256_sub_epi32(f0, f);
_mm256_store_si256(&a1[i], f1);
_mm256_store_si256(&a0[i], f0);
}
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx
*
* Description: Compute indices of polynomial coefficients whose low bits
*              overflow into the high bits. A coefficient is flagged when
*              |a0| > GAMMA2, or when a0 == -GAMMA2 and a1 > 0. The indices
*              of flagged coefficients are written to hint in ascending
*              order, one byte each.
*
* Arguments:   - uint8_t *hint: hint array of N bytes. Every iteration writes
*                               8 bytes at the current end, of which only the
*                               flagged ones are kept; the rest is overwritten
*                               by later iterations or left as trailing junk.
*              - const __m256i *a0: low bits of input elements
*              - const __m256i *a1: high bits of input elements
*
* Returns number of overflowing low bits
**************************************************/
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) {
    unsigned int i, n = 0;
    __m256i f0, f1, g0, g1;
    uint32_t bad;
    uint64_t idx;
    const __m256i low = _mm256_set1_epi32(-GAMMA2);
    const __m256i high = _mm256_set1_epi32(GAMMA2);

    for (i = 0; i < N / 8; ++i) {
        f0 = _mm256_load_si256(&a0[i]);
        f1 = _mm256_load_si256(&a1[i]);
        /* g0: all-ones where |a0| > GAMMA2. */
        g0 = _mm256_abs_epi32(f0);
        g0 = _mm256_cmpgt_epi32(g0, high);
        /* g1: all-ones where a0 == -GAMMA2. sign_epi32 zeroes lanes with
         * a1 == 0 and negates lanes with a1 < 0, so the sign bit survives
         * only where a1 > 0. */
        g1 = _mm256_cmpeq_epi32(f0, low);
        g1 = _mm256_sign_epi32(g1, f1);
        g0 = _mm256_or_si256(g0, g1);

        /* Portable reinterpretation instead of the GNU-only (__m256) vector cast. */
        bad = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(g0));
        /* idxlut row = lane positions of the flagged coefficients; adding
         * 8*i to every byte turns them into coefficient indices. */
        memcpy(&idx, PQCLEAN_DILITHIUM2AES_AVX2_idxlut[bad], 8);
        idx += (uint64_t)0x0808080808080808 * i;
        memcpy(&hint[n], &idx, 8);
        n += _mm_popcnt_u32(bad);
    }

    return n;
}

/*************************************************
* Name: use_hint
*
* Description: Correct high parts according to hint. The input is first
* decomposed into high and low parts; where the hint is set the
* high part is moved by one step, upwards when the low part is
* non-negative and downwards when it is negative, wrapping
* around within the range 0..43.
*
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts
* - const __m256i *a: input array of length N/8
* - const __m256i *hint: input array of length N/8 with hint bits
*
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) {
unsigned int i;
/* Scratch buffer for the low parts produced by decompose. */
__m256i a0[N / 8];
__m256i f, g, h, t;
const __m256i zero = _mm256_setzero_si256();
const __m256i max = _mm256_set1_epi32(43);

/* b receives the uncorrected high parts, a0 the low parts. */
PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(b, a0, a);
for (i = 0; i < N / 8; i++) {
f = _mm256_load_si256(&a0[i]);
g = _mm256_load_si256(&b[i]);
h = _mm256_load_si256(&hint[i]);
/* t = hint where the low part is negative, else 0; h - 2t then equals
 * -hint for negative low parts and +hint otherwise.
 * _mm256_blendv_epi32 is presumably a project macro selecting by the sign
 * bit of its third argument. Confirm. */
t = _mm256_blendv_epi32(zero, h, f);
t = _mm256_slli_epi32(t, 1);
h = _mm256_sub_epi32(h, t);
g = _mm256_add_epi32(g, h);
/* Wrap around: a negative result becomes 43, a result above 43 becomes 0. */
g = _mm256_blendv_epi32(g, max, g);
f = _mm256_cmpgt_epi32(g, max);
g = _mm256_blendv_epi32(g, zero, f);
_mm256_store_si256(&b[i], g);
}
}

+ 12
- 0
crypto_sign/dilithium2aes/avx2/rounding.h Переглянути файл

@@ -0,0 +1,12 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_ROUNDING_H
#define PQCLEAN_DILITHIUM2AES_AVX2_ROUNDING_H
#include "params.h"
#include <immintrin.h>
#include <stdint.h>

/* Vectorized rounding helpers. The __m256i arrays passed to these functions
 * hold N/8 vectors of eight 32-bit coefficients each. */

/* Splits a into parts a1 and a0 (power-of-two rounding; see rounding.c for
 * the exact contract). */
void PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a);
/* Splits a into high parts a1 and low parts a0 with respect to 2*GAMMA2. */
void PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a);
/* Writes the indices of coefficients whose low bits overflow into the high
 * bits to hint and returns how many were written. */
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1);
/* Writes to b the high parts of a, corrected according to hint. */
void PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint);

#endif

+ 54
- 0
crypto_sign/dilithium2aes/avx2/shuffle.S Переглянути файл

@@ -0,0 +1,54 @@
#include "cdecl.h"
.include "shuffle.inc"

.text
/*
 * nttunpack128_avx: file-local helper (no .global directive).
 * Loads eight 256-bit vectors (256 bytes) from the address in %rdi,
 * permutes them with one round each of the shuffle8, shuffle4 and shuffle2
 * macros, and stores the eight result vectors back to the same 256 bytes.
 * The aligned vmovdqa accesses require (%rdi) to be 32-byte aligned.
 * Uses ymm3-ymm11 as working registers; %rdi itself is not modified.
 * NOTE(review): the net effect appears to be a transpose-like reordering of
 * 32-bit coefficients between the layout produced by sampling and the one
 * expected by the NTT code; confirm against ntt.S.
 */
nttunpack128_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

/* Round 1: exchange 128-bit halves between vectors k and k+4. */
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

/* Round 2: interleave 64-bit elements of the round-1 results. */
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

/* Round 3: interleave 32-bit elements of the round-2 results. */
shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

/* The results live in ymm9,8,7,6,5,4,3,11 (in output order). */
#store
vmovdqa %ymm9,(%rdi)
vmovdqa %ymm8,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm3,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

/*
 * PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx: exported entry point.
 * The symbol is emitted under both the cdecl() and _cdecl() spellings
 * (macros from cdecl.h). Applies nttunpack128_avx in place to four
 * consecutive 256-byte chunks starting at the address in %rdi, i.e. to
 * 1024 bytes in total. %rdi is advanced by 768 on return.
 * NOTE(review): %rdi is presumably the first argument under the System V
 * AMD64 calling convention; confirm against the C prototype.
 */
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx)
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx)
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx):
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx):
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
ret

+ 25
- 0
crypto_sign/dilithium2aes/avx2/shuffle.inc Переглянути файл

@@ -0,0 +1,25 @@
# shuffle8: regroup the 128-bit halves of ymm r0 and ymm r1.
# Output r2 = low half of r0 followed by low half of r1.
# Output r3 = high half of r0 followed by high half of r1.
# r2 must differ from r0 and r1 because it is written before the second read;
# r3 may alias an input.
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle4: interleave 64-bit elements of ymm r0 and ymm r1 within each
# 128-bit lane.
# Output r2 = the low quadword of r0 and of r1 from every lane.
# Output r3 = the high quadword of r0 and of r1 from every lane.
# r2 must differ from r0 and r1; r3 may alias an input.
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle2: interleave 32-bit elements of ymm r0 and ymm r1.
# Output r2 = even-indexed dwords of r0 interleaved with even-indexed dwords
# of r1; output r3 = odd-indexed dwords of r0 interleaved with odd-indexed
# dwords of r1.
# Side effect: r0 is overwritten (shifted right by 32 bits per quadword).
# The two commented-out instructions are alternative encodings kept for
# reference.
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle1: interleave 16-bit elements of ymm r0 and ymm r1.
# Output r2 = even-indexed words of r0 interleaved with even-indexed words
# of r1; output r3 = odd-indexed words of r0 interleaved with odd-indexed
# words of r1.
# Side effect: r0 is overwritten (shifted right by 16 bits per dword).
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

+ 425
- 0
crypto_sign/dilithium2aes/avx2/sign.c Переглянути файл

@@ -0,0 +1,425 @@
#include "aes256ctr.h"
#include "align.h"
#include "fips202.h"
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"
#include <stdint.h>
#include <string.h>


/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair
*
* Description: Generates a fresh key pair. The public key holds rho and
*              the packed t1 rows; the secret key holds rho, key, the
*              hash of the public key, and the packed s1, s2 and t0.
*
* Arguments:   - uint8_t *pk: output public key (allocated array of
*                PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
*              - uint8_t *sk: output private key (allocated array of
*                PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
    unsigned int r, c;
    uint8_t seeds[3 * SEEDBYTES];
    /* The expanded seed buffer is split into three SEEDBYTES-sized parts */
    const uint8_t *const rho = seeds;
    const uint8_t *const rhoprime = seeds + SEEDBYTES;
    const uint8_t *const key = seeds + 2 * SEEDBYTES;
    /* Output cursors into the key buffers */
    uint8_t *sk_out = sk + 2 * SEEDBYTES + CRHBYTES;
    uint8_t *pk_out = pk + SEEDBYTES;
    uint64_t ctr;
    aes256ctr_ctx stream;
    polyvecl matrow[1];
    polyvecl *const row = matrow;
    polyvecl s1;
    polyveck s2;
    poly t1, t0;

    /* Draw SEEDBYTES of randomness and stretch it in place to three seeds */
    randombytes(seeds, SEEDBYTES);
    shake256(seeds, 3 * SEEDBYTES, seeds, SEEDBYTES);

    /* rho goes into both keys, key into the secret key only */
    memcpy(pk, rho, SEEDBYTES);
    memcpy(sk, rho, SEEDBYTES);
    memcpy(sk + SEEDBYTES, key, SEEDBYTES);

    /* Short secret vectors: s1 uses nonces 0..L-1, s2 uses L..L+K-1 */
    PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&stream, rhoprime, 0);
    for (c = 0; c < L; ++c) {
        ctr = c;
        stream.n = _mm_loadl_epi64((__m128i *)&ctr);
        PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(&s1.vec[c], &stream);
    }
    for (r = 0; r < K; ++r) {
        ctr = L + r;
        stream.n = _mm_loadl_epi64((__m128i *)&ctr);
        PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(&s2.vec[r], &stream);
    }

    /* Serialize s1 followed by s2 behind the three header fields of sk */
    for (c = 0; c < L; c++) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk_out, &s1.vec[c]);
        sk_out += POLYETA_PACKEDBYTES;
    }
    for (r = 0; r < K; r++) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk_out, &s2.vec[r]);
        sk_out += POLYETA_PACKEDBYTES;
    }
    /* sk_out now points at the t0 section of the secret key */

    /* s1 is only needed in NTT form from here on */
    PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&s1);

    /* Re-key the stream with rho for matrix generation */
    PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&stream, rho, 0);

    for (r = 0; r < K; r++) {
        /* Generate row r of the matrix, one entry per nonce (r << 8) + c */
        for (c = 0; c < L; c++) {
            ctr = (r << 8) + c;
            stream.n = _mm_loadl_epi64((__m128i *)&ctr);
            PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&row->vec[c], &stream);
            PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&row->vec[c]);
        }

        /* t = <row, s1> + s2[r] */
        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1);
        PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&t1);
        PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&t1, &t1, &s2.vec[r]);

        /* Split t into t1 (public) and t0 (secret) and serialize both */
        PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&t1);
        PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(&t1, &t0, &t1);
        PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(pk_out, &t1);
        PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(sk_out, &t0);
        pk_out += POLYT1_PACKEDBYTES;
        sk_out += POLYT0_PACKEDBYTES;
    }

    /* Hash the complete public key into the tr field of the secret key */
    crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES);

    return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature
*
* Description: Computes signature. Candidate signatures are generated in a
* loop and discarded (goto rej) until all norm and hint-count
* checks pass.
*
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES)
* - size_t *siglen: pointer to output length of signature
* - const uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
unsigned int i, n, pos;
/* Layout: rho | tr | key | mu | rhoprime (see pointer setup below) */
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint8_t hintbuf[N];
/* Hint section of the signature: follows the challenge seed and packed z */
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
/* Running nonce for sampling y; it is not reset when a candidate is rejected */
uint64_t nonce = 0;
polyvecl mat[K], s1, z;
polyveck t0, s2, w1;
poly c, tmp;
/* y and w0 share storage: y is consumed by the matrix product before
 * decompose writes w0 */
union {
polyvecl y;
polyveck w0;
} tmpv;
shake256incctx state;

rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);

/* Compute CRH(tr, msg) */
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* rhoprime = CRH(key || mu); key and mu are adjacent in seedbuf */
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(mat, rho);
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(&t0);

aes256ctr_ctx aesctx;
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0);

rej:
/* Sample intermediate vector y */
for (i = 0; i < L; ++i) {
aesctx.n = _mm_loadl_epi64((__m128i *)&nonce);
nonce++;
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(&z.vec[i], &aesctx);
}

/* Matrix-vector product */
/* z keeps the untransformed y; the copy in tmpv.y is moved to NTT domain */
tmpv.y = z;
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&tmpv.y);
PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y);
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(&w1);

/* Decompose w and call the random oracle */
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(&w1);
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1);
/* The signature buffer temporarily holds the packed w1 that is hashed next */
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(sig, &w1);

shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
/* The challenge seed overwrites the start of sig, where it stays in the
 * final signature */
shake256_inc_squeeze(sig, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&c);

/* Compute z, reject if it reveals secret */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]);
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp);
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&z.vec[i]);
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) {
goto rej;
}
}

/* Zero hint vector in signature */
pos = 0;
memset(hint, 0, OMEGA);

for (i = 0; i < K; i++) {
/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]);
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&tmpv.w0.vec[i]);
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) {
goto rej;
}

/* Compute hints */
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]);
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp);
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&tmp);
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&tmp, GAMMA2)) {
goto rej;
}

PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp);
n = PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]);
/* At most OMEGA hint indices fit into the signature in total */
if (pos + n > OMEGA) {
goto rej;
}

/* Store hints in signature */
memcpy(&hint[pos], hintbuf, n);
/* hint[OMEGA + i] records the running total of hints after polynomial i */
hint[OMEGA + i] = pos = pos + n;
}

/* Pack z into signature */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]);
}

*siglen = PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES;
return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign
*
* Description: Produce a signed message: the signature followed by a copy
*              of the message.
*
* Arguments:   - uint8_t *sm: output signed message (allocated array with
*                PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + mlen bytes),
*                can be equal to m
*              - size_t *smlen: receives the length of the signed message
*              - const uint8_t *m: message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) {
    uint8_t *const body = sm + PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES;
    size_t left = mlen;

    /* Copy the message behind the signature slot, last byte first, so that
     * the copy is still correct when sm and m are the same buffer */
    while (left > 0) {
        --left;
        body[left] = m[left];
    }

    /* Sign the relocated message; the signature fills the front of sm */
    PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(sm, smlen, body, mlen, sk);
    *smlen += mlen;
    return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify
*
* Description: Verifies signature.
*
* Arguments: - uint8_t *m: pointer to input signature
* - size_t siglen: length of signature
* - const uint8_t *m: pointer to message
* - size_t mlen: length of message
* - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signature could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) {
unsigned int i, j, pos = 0;
/* PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack writes additional 14 bytes */
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf;
uint8_t mu[CRHBYTES];
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES;
uint64_t nonce;
aes256ctr_ctx aesctx;
polyvecl rowbuf[1];
polyvecl *row = rowbuf;
polyvecl z;
poly c, w1, h;
shake256incctx state;

if (siglen != PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) {
return -1;
}

/* Compute CRH(CRH(rho, t1), msg) */
crh(mu, pk, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES);
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);
shake256_inc_ctx_release(&state);

/* Expand PQCLEAN_DILITHIUM2AES_AVX2_challenge */
PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(&c, sig);
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&c);

/* Unpack z; shortness follows from unpacking */
for (i = 0; i < L; i++) {
PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES);
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&z.vec[i]);
}

PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, pk, 0);

for (i = 0; i < K; i++) {
/* Expand matrix row */
for (j = 0; j < L; j++) {
nonce = (i << 8) + j;
aesctx.n = _mm_loadl_epi64((__m128i *)&nonce);
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx);
PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&row->vec[j]);
}

/* Compute i-th row of Az - c2^Dt1 */
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z);

PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES);
PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(&h);
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&h);
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&h, &c, &h);

PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&w1, &w1, &h);
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&w1);
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&w1);

/* Get hint polynomial and reconstruct w1 */
memset(h.vec, 0, sizeof(poly));
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) {
return -1;
}

for (j = pos; j < hint[OMEGA + i]; ++j) {
/* Coefficients are ordered for strong unforgeability */
if (j > pos && hint[j] <= hint[j - 1]) {
return -1;
}
h.coeffs[hint[j]] = 1;
}
pos = hint[OMEGA + i];

PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&w1);
PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(&w1, &w1, &h);
PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1);
}

/* Extra indices are zero for strong unforgeability */
for (j = pos; j < OMEGA; ++j) {
if (hint[j]) {
return -1;
}
}

/* Call random oracle and verify PQCLEAN_DILITHIUM2AES_AVX2_challenge */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES);
shake256_inc_finalize(&state);
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state);
shake256_inc_ctx_release(&state);
for (i = 0; i < SEEDBYTES; ++i) {
if (buf.coeffs[i] != sig[i]) {
return -1;
}
}

return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open
*
* Description: Verify signed message.
*
* Arguments:   - uint8_t *m: pointer to output message (allocated
*                            array with smlen bytes), can be equal to sm
*              - size_t *mlen: pointer to output length of message;
*                              set to (size_t) -1 on failure
*              - const uint8_t *sm: pointer to signed message
*              - size_t smlen: length of signed message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise.
* On failure the first smlen bytes of m are zeroed.
**************************************************/
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) {
    size_t i;

    /* Too short to even contain a signature */
    if (smlen < PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) {
        goto badsig;
    }

    *mlen = smlen - PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES;
    if (PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, *mlen, pk)) {
        goto badsig;
    } else {
        /* All good, copy msg, return 0. The forward copy is safe when m == sm
         * because the source is always ahead of the destination. */
        for (i = 0; i < *mlen; ++i) {
            m[i] = sm[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + i];
        }
        return 0;
    }

badsig:
    /* Signature verification failed: report an invalid length (explicitly
     * converted, avoiding an implicit signed-to-unsigned conversion) and
     * wipe the output buffer */
    *mlen = (size_t) -1;
    for (i = 0; i < smlen; ++i) {
        m[i] = 0;
    }

    return -1;
}

+ 29
- 0
crypto_sign/dilithium2aes/avx2/sign.h Переглянути файл

@@ -0,0 +1,29 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_SIGN_H
#define PQCLEAN_DILITHIUM2AES_AVX2_SIGN_H
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* NOTE(review): sign.c does not define this function; it calls
 * PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge instead. This prototype may be a
 * leftover. Confirm whether a definition exists elsewhere before using it. */
void PQCLEAN_DILITHIUM2AES_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Generates a key pair into pk and sk. Returns 0. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Writes a detached signature of m (mlen bytes) to sig and its length to
 * *siglen. Returns 0. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Writes the signature followed by the message to sm and the total length to
 * *smlen. Returns 0. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *m, size_t mlen,
const uint8_t *sk);

/* Checks a detached signature. Returns 0 if it is valid and -1 otherwise. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk);

/* Checks a signed message and, on success, copies the message to m and its
 * length to *mlen. Returns 0 on success and -1 otherwise. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

#endif

+ 25
- 0
crypto_sign/dilithium2aes/avx2/symmetric.h Переглянути файл

@@ -0,0 +1,25 @@
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2AES_AVX2_SYMMETRIC_H
#include "aes256ctr.h"
#include "fips202.h"
#include "params.h"
#include <stdint.h>



/* Both stream abstractions are backed by the same AES-256-CTR context type. */
typedef aes256ctr_ctx stream128_state;
typedef aes256ctr_ctx stream256_state;

/* Both streams use the AES-256-CTR block size. */
#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES
#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES

/* crh: hash INBYTES bytes at IN to CRHBYTES bytes at OUT using SHAKE256. */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
/* The stream128 and stream256 macros forward to the AES-256-CTR functions;
 * the release macros expand to nothing. */
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define stream128_release(STATE)
#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define stream256_release(STATE)


#endif

+ 5
- 0
crypto_sign/dilithium2aes/clean/LICENSE Переглянути файл

@@ -0,0 +1,5 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in
comments on top of the respective files.

+ 19
- 0
crypto_sign/dilithium2aes/clean/Makefile Переглянути файл

@@ -0,0 +1,19 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libdilithium2aes_clean.a
HEADERS=aes256ctr.h api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h
OBJECTS=aes256ctr.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-aes.o

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)

# Default goal: build the static library
all: $(LIB)

# all and clean name commands, not files; declaring them phony keeps them
# working even if a file called "all" or "clean" exists in this directory
.PHONY: all clean

# Every object is rebuilt when any header changes
%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
	$(AR) -r $@ $(OBJECTS)

# Remove the objects and the library
clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)

Деякі файли не було показано, через те що забагато файлів було змінено

Завантаження…
Відмінити
Зберегти