* Update Dilithium * Alternative Montgomery reduction to avoid i386 functest errors * Explicit casts for MSVC * More casts; bump upstream version; fix metadata * Another cast (tags/v0.0.1)
@@ -1,88 +1,91 @@ | |||
![Test sphincs-haraka-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-simple/badge.svg?branch=master) | |||
![Test sphincs-haraka-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-robust/badge.svg?branch=master) | |||
![Test sphincs-sha256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-simple/badge.svg?branch=master) | |||
![Test dilithium2](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2/badge.svg?branch=master) | |||
![Test sphincs-shake256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-robust/badge.svg?branch=master) | |||
![Test rainbowIII-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-compressed/badge.svg?branch=master) | |||
![Test sphincs-haraka-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-simple/badge.svg?branch=master) | |||
![Test sphincs-haraka-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-robust/badge.svg?branch=master) | |||
![Test rainbowV-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-circumzenithal/badge.svg?branch=master) | |||
![Test sphincs-shake256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-simple/badge.svg?branch=master) | |||
![Test sphincs-shake256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-robust/badge.svg?branch=master) | |||
![Test rainbowIII-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-circumzenithal/badge.svg?branch=master) | |||
![Test sphincs-sha256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192f-simple/badge.svg?branch=master) | |||
![Test sphincs-shake256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-simple/badge.svg?branch=master) | |||
![Test rainbowIII-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-classic/badge.svg?branch=master) | |||
![Test sphincs-shake256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-simple/badge.svg?branch=master) | |||
![Test rainbowI-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-circumzenithal/badge.svg?branch=master) | |||
![Test sphincs-sha256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-robust/badge.svg?branch=master) | |||
![Test rainbowV-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-compressed/badge.svg?branch=master) | |||
![Test rainbowV-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-classic/badge.svg?branch=master) | |||
![Test sphincs-sha256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-simple/badge.svg?branch=master) | |||
![Test falcon-512](https://github.com/PQClean/PQClean/workflows/Test%20falcon-512/badge.svg?branch=master) | |||
![Test falcon-1024](https://github.com/PQClean/PQClean/workflows/Test%20falcon-1024/badge.svg?branch=master) | |||
![Test sphincs-haraka-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-simple/badge.svg?branch=master) | |||
![Test sphincs-shake256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-robust/badge.svg?branch=master) | |||
![Test dilithium4](https://github.com/PQClean/PQClean/workflows/Test%20dilithium4/badge.svg?branch=master) | |||
![Test sphincs-sha256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-simple/badge.svg?branch=master) | |||
![Test rainbowI-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-classic/badge.svg?branch=master) | |||
![Test sphincs-haraka-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-robust/badge.svg?branch=master) | |||
![Test sphincs-sha256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-robust/badge.svg?branch=master) | |||
![Test rainbowI-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-compressed/badge.svg?branch=master) | |||
![Test rainbowIII-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-classic/badge.svg?branch=master) | |||
![Test sphincs-sha256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-simple/badge.svg?branch=master) | |||
![Test sphincs-haraka-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192f-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-simple/badge.svg?branch=master) | |||
![Test sphincs-haraka-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-robust/badge.svg?branch=master) | |||
![Test dilithium3](https://github.com/PQClean/PQClean/workflows/Test%20dilithium3/badge.svg?branch=master) | |||
![Test rainbowI-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-compressed/badge.svg?branch=master) | |||
![Test dilithium2](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2/badge.svg?branch=master) | |||
![Test sphincs-sha256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-robust/badge.svg?branch=master) | |||
![Test dilithium5](https://github.com/PQClean/PQClean/workflows/Test%20dilithium5/badge.svg?branch=master) | |||
![Test sphincs-haraka-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-simple/badge.svg?branch=master) | |||
![Test sphincs-haraka-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-simple/badge.svg?branch=master) | |||
![Test sphincs-sha256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-simple/badge.svg?branch=master) | |||
![Test sphincs-shake256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-simple/badge.svg?branch=master) | |||
![Test rainbowV-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-classic/badge.svg?branch=master) | |||
![Test sphincs-sha256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-simple/badge.svg?branch=master) | |||
![Test sphincs-shake256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-simple/badge.svg?branch=master) | |||
![Test sphincs-haraka-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-simple/badge.svg?branch=master) | |||
![Test rainbowV-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-compressed/badge.svg?branch=master) | |||
![Test sphincs-sha256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-simple/badge.svg?branch=master) | |||
![Test dilithium5aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium5aes/badge.svg?branch=master) | |||
![Test dilithium2aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2aes/badge.svg?branch=master) | |||
![Test sphincs-sha256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-simple/badge.svg?branch=master) | |||
![Test mceliece460896](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896/badge.svg?branch=master) | |||
![Test saber](https://github.com/PQClean/PQClean/workflows/Test%20saber/badge.svg?branch=master) | |||
![Test kyber1024-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024-90s/badge.svg?branch=master) | |||
![Test kyber1024](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024/badge.svg?branch=master) | |||
![Test mceliece8192128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128/badge.svg?branch=master) | |||
![Test sphincs-sha256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-simple/badge.svg?branch=master) | |||
![Test rainbowIII-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-compressed/badge.svg?branch=master) | |||
![Test sphincs-sha256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-robust/badge.svg?branch=master) | |||
![Test falcon-512](https://github.com/PQClean/PQClean/workflows/Test%20falcon-512/badge.svg?branch=master) | |||
![Test sphincs-shake256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-robust/badge.svg?branch=master) | |||
![Test falcon-1024](https://github.com/PQClean/PQClean/workflows/Test%20falcon-1024/badge.svg?branch=master) | |||
![Test sphincs-haraka-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-simple/badge.svg?branch=master) | |||
![Test sphincs-shake256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-robust/badge.svg?branch=master) | |||
![Test sphincs-shake256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-simple/badge.svg?branch=master) | |||
![Test sphincs-shake256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-robust/badge.svg?branch=master) | |||
![Test sphincs-sha256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-robust/badge.svg?branch=master) | |||
![Test sphincs-haraka-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-simple/badge.svg?branch=master) | |||
![Test dilithium3aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium3aes/badge.svg?branch=master) | |||
![Test kyber512-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber512-90s/badge.svg?branch=master) | |||
![Test firesaber](https://github.com/PQClean/PQClean/workflows/Test%20firesaber/badge.svg?branch=master) | |||
![Test frodokem1344aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344aes/badge.svg?branch=master) | |||
![Test sntrup653](https://github.com/PQClean/PQClean/workflows/Test%20sntrup653/badge.svg?branch=master) | |||
![Test mceliece6688128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128/badge.svg?branch=master) | |||
![Test ntrulpr761](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr761/badge.svg?branch=master) | |||
![Test frodokem976aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976aes/badge.svg?branch=master) | |||
![Test hqc-rmrs-192](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-192/badge.svg?branch=master) | |||
![Test sntrup857](https://github.com/PQClean/PQClean/workflows/Test%20sntrup857/badge.svg?branch=master) | |||
![Test frodokem640aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640aes/badge.svg?branch=master) | |||
![Test hqc-rmrs-128](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-128/badge.svg?branch=master) | |||
![Test mceliece6960119](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119/badge.svg?branch=master) | |||
![Test ntruhrss701](https://github.com/PQClean/PQClean/workflows/Test%20ntruhrss701/badge.svg?branch=master) | |||
![Test ntrulpr857](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr857/badge.svg?branch=master) | |||
![Test frodokem1344shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344shake/badge.svg?branch=master) | |||
![Test mceliece6688128f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128f/badge.svg?branch=master) | |||
![Test ntruhps2048677](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048677/badge.svg?branch=master) | |||
![Test frodokem640aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640aes/badge.svg?branch=master) | |||
![Test sntrup761](https://github.com/PQClean/PQClean/workflows/Test%20sntrup761/badge.svg?branch=master) | |||
![Test hqc-rmrs-256](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-256/badge.svg?branch=master) | |||
![Test frodokem976shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976shake/badge.svg?branch=master) | |||
![Test mceliece348864](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864/badge.svg?branch=master) | |||
![Test frodokem1344aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344aes/badge.svg?branch=master) | |||
![Test hqc-rmrs-192](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-192/badge.svg?branch=master) | |||
![Test frodokem976aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976aes/badge.svg?branch=master) | |||
![Test mceliece8192128f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128f/badge.svg?branch=master) | |||
![Test mceliece460896f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896f/badge.svg?branch=master) | |||
![Test kyber512-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber512-90s/badge.svg?branch=master) | |||
![Test kyber1024](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024/badge.svg?branch=master) | |||
![Test mceliece348864f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864f/badge.svg?branch=master) | |||
![Test mceliece6960119f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119f/badge.svg?branch=master) | |||
![Test firesaber](https://github.com/PQClean/PQClean/workflows/Test%20firesaber/badge.svg?branch=master) | |||
![Test sntrup857](https://github.com/PQClean/PQClean/workflows/Test%20sntrup857/badge.svg?branch=master) | |||
![Test frodokem640shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640shake/badge.svg?branch=master) | |||
![Test sntrup761](https://github.com/PQClean/PQClean/workflows/Test%20sntrup761/badge.svg?branch=master) | |||
![Test ntruhps4096821](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps4096821/badge.svg?branch=master) | |||
![Test ntruhrss701](https://github.com/PQClean/PQClean/workflows/Test%20ntruhrss701/badge.svg?branch=master) | |||
![Test mceliece348864](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864/badge.svg?branch=master) | |||
![Test ntrulpr653](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr653/badge.svg?branch=master) | |||
![Test sntrup653](https://github.com/PQClean/PQClean/workflows/Test%20sntrup653/badge.svg?branch=master) | |||
![Test lightsaber](https://github.com/PQClean/PQClean/workflows/Test%20lightsaber/badge.svg?branch=master) | |||
![Test kyber1024-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024-90s/badge.svg?branch=master) | |||
![Test ntruhps2048509](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048509/badge.svg?branch=master) | |||
![Test kyber768-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber768-90s/badge.svg?branch=master) | |||
![Test mceliece6960119f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119f/badge.svg?branch=master) | |||
![Test saber](https://github.com/PQClean/PQClean/workflows/Test%20saber/badge.svg?branch=master) | |||
![Test kyber768](https://github.com/PQClean/PQClean/workflows/Test%20kyber768/badge.svg?branch=master) | |||
![Test ntruhps2048509](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048509/badge.svg?branch=master) | |||
![Test ntruhps4096821](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps4096821/badge.svg?branch=master) | |||
![Test ntrulpr761](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr761/badge.svg?branch=master) | |||
![Test kyber512](https://github.com/PQClean/PQClean/workflows/Test%20kyber512/badge.svg?branch=master) | |||
![Test hqc-rmrs-128](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-128/badge.svg?branch=master) | |||
![Test mceliece6688128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128/badge.svg?branch=master) | |||
![Test lightsaber](https://github.com/PQClean/PQClean/workflows/Test%20lightsaber/badge.svg?branch=master) | |||
![Test mceliece460896f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896f/badge.svg?branch=master) | |||
![Test mceliece8192128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128/badge.svg?branch=master) | |||
![Test mceliece460896](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896/badge.svg?branch=master) | |||
![Test ntruhps2048677](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048677/badge.svg?branch=master) | |||
![Test ntrulpr857](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr857/badge.svg?branch=master) | |||
![Test hqc-rmrs-256](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-256/badge.svg?branch=master) | |||
![Test frodokem976shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976shake/badge.svg?branch=master) | |||
![Test frodokem640shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640shake/badge.svg?branch=master) |
@@ -0,0 +1,204 @@ | |||
# GitHub Actions workflow "Test dilithium2aes".
# Runs the pytest suite under test/ with PQCLEAN_ONLY_SCHEMES=dilithium2aes
# (presumably restricting the suite to this scheme — confirm against the test
# helpers) in Linux containers, under qemu emulation, on Windows and on macOS.
on:
  push:
    paths:
      # build if tests change
      - 'test/**'
      # do not build if other schemes duplicate_consistency files change
      - '!test/duplicate_consistency/*.yml'
      - 'test/duplicate_consistency/dilithium2aes*.yml'
      # build if common files change
      - 'common/**'
      # build if scheme changed
      - 'crypto_sign/dilithium2aes/**'
      # build if workflow file changed
      - '.github/workflows/sign_dilithium2aes.yml'
      # Build if any files in the root change, except .md files
      - '*'
      - '!*.md'
  pull_request:
    paths:
      # build if tests change
      - 'test/**'
      # do not build if other schemes duplicate_consistency files change
      - '!test/duplicate_consistency/*.yml'
      - 'test/duplicate_consistency/dilithium2aes*.yml'
      # build if common files change
      - 'common/**'
      # build if scheme changed
      - 'crypto_sign/dilithium2aes/**'
      # build if workflow file changed
      - '.github/workflows/sign_dilithium2aes.yml'
      # Build if any files in the root change, except .md files
      - '*'
      - '!*.md'
  schedule:
    - cron: '5 4 * * *'
name: Test dilithium2aes
jobs:
  # Native Linux run inside the pqclean/ci-container image, for every
  # combination of arch (amd64, i386) and compiler (gcc, clang).
  test-native:
    runs-on: ubuntu-latest
    container:
      image: pqclean/ci-container:${{ matrix.arch }}
      env:
        PQCLEAN_ONLY_SCHEMES: dilithium2aes
        CC: ccache ${{ matrix.cc }}
        CCACHE_NOSTATS: 1
        CCACHE_DIR: /ccache
        CCACHE_SLOPPINESS: include_file_mtime
    strategy:
      matrix:
        arch:
          - amd64
          - i386
        cc:
          - gcc
          - clang
    steps:
      # Only one matrix cell (amd64 + gcc) issues the cancellation request;
      # a failure here must not fail the job (continue-on-error).
      - name: Cancel Previous Runs
        uses: thomwiggers/cancel-workflow-action@all_but_latest
        with:
          all_but_latest: true
          access_token: ${{ github.token }}
        continue-on-error: true
        if: matrix.arch == 'amd64' && matrix.cc == 'gcc'
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Cache ccache
        uses: actions/cache@v2
        env:
          cache-name: cache-ccache
        with:
          path: /ccache
          # Use the `github` context for the workflow name: the `env` context
          # only contains variables defined in the workflow, so
          # `env.GITHUB_WORKFLOW` expanded to an empty string and every
          # scheme's workflow ended up sharing the same cache key.
          key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
      - name: Cache pip
        uses: actions/cache@v2
        env:
          cache-name: cache-python-pip
        with:
          path: ~/.cache/pip
          key: v1-python-pip
      - name: Install python dependencies
        run: |
          python3 -m pip install -U -r requirements.txt
      - name: Run tests
        run: |
          cd test
          python3 -m pytest --verbose --numprocesses=auto
  # Emulated architectures: the tests run inside a foreign-architecture
  # container started with `docker run` after registering qemu-user-static.
  test-emulated:
    needs:
      - test-native
    runs-on: ubuntu-latest
    strategy:
      matrix:
        arch:
          - armhf
          - unstable-ppc
        cc:
          - gcc
          - clang
    env:
      CC: ${{ matrix.cc }}
    steps:
      - name: Register qemu-user-static
        run: |
          docker run --rm --privileged multiarch/qemu-user-static:register --reset
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Cache ccache
        uses: actions/cache@v2
        env:
          cache-name: cache-ccache
        with:
          # Host directory that is bind-mounted to /ccache in the container below.
          path: ~/ccache
          # `github.workflow` instead of the always-empty `env.GITHUB_WORKFLOW`,
          # so the cache is keyed per workflow.
          key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
      # NOTE(review): pip runs inside the container below and ~/.cache/pip is
      # not mounted into it, so this cache looks unused — confirm.
      - name: Cache pip
        uses: actions/cache@v2
        env:
          cache-name: cache-python-pip
        with:
          path: ~/.cache/pip
          key: v1-python-pip
      - name: Run tests in container
        # $CC is expanded by the host shell (it sits inside a double-quoted
        # string), so the container sees CC="ccache <matrix.cc>".
        run: |
          docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium2aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
          export CCACHE_NOSTATS=1 && \
          export CCACHE_DIR=/ccache && \
          export CCACHE_SLOPPINESS=include_file_mtime && \
          export CC=\"ccache $CC\" && \
          pip3 install -U -r requirements.txt && \
          cd test && \
          python3 -m pytest --verbose --numprocesses=auto"
  # Windows run using the MSVC toolchain environment (vcvars64 / vcvars32).
  test-windows:
    needs:
      - test-native
    strategy:
      matrix:
        bits:
          - 64
          - 32
    env:
      PQCLEAN_ONLY_SCHEMES: dilithium2aes
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Setup astyle
        run: |
          # Setup strong crypto
          Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord
          Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord
          Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe"
        shell: powershell
      - name: Setup Python
        uses: actions/setup-python@main
        with:
          python-version: "3.x"
      - name: Install python requirements
        run: python -m pip install -U -r requirements.txt
      - name: Run tests
        # vcvars<bits>.bat must be called in the same cmd session as pytest so
        # the compiler environment it sets up is still in effect.
        run: |
          call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat"
          cd test
          python -m pytest --verbose --numprocesses=auto
        shell: cmd
  # macOS run with Apple clang and with Homebrew GCC 9.
  test-macos:
    needs:
      - test-native
    env:
      PQCLEAN_ONLY_SCHEMES: dilithium2aes
      CCACHE_NOSTATS: 1
      CCACHE_SLOPPINESS: include_file_mtime
      # XCode version
      DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer
    strategy:
      matrix:
        compiler:
          - clang  # XCode (Apple LLVM/Clang)
          - gcc9   # GNU (Homebrew)
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Install astyle
        run: |
          brew install astyle
      - name: Set up GCC9 compiler
        # Every `run` step starts a fresh shell, so a plain `export` would be
        # gone before the test step runs. Persist PATH and CC for the later
        # steps through the GITHUB_PATH / GITHUB_ENV files instead.
        run: |
          echo "/usr/local/bin" >> "$GITHUB_PATH"
          echo "CC=gcc-9" >> "$GITHUB_ENV"
        if: matrix.compiler == 'gcc9'
      - name: Setup Python
        uses: actions/setup-python@main
        with:
          python-version: "3.x"
      - name: Install Python dependencies
        run: python -m pip install -U -r requirements.txt
      - name: Run tests
        run: |
          cd test
          python -m pytest --verbose --numprocesses=auto
@@ -0,0 +1,204 @@ | |||
# Triggers for the "Test dilithium3aes" workflow: pushes and pull requests that
# touch relevant paths, plus a daily scheduled run.
on:
  push:
    paths:
      # any change to the test suite
      - "test/**"
      # ...but ignore duplicate_consistency files that belong to other schemes
      - "!test/duplicate_consistency/*.yml"
      - "test/duplicate_consistency/dilithium3aes*.yml"
      # shared code
      - "common/**"
      # the scheme itself
      - "crypto_sign/dilithium3aes/**"
      # this workflow file
      - ".github/workflows/sign_dilithium3aes.yml"
      # any file in the repository root, excluding Markdown files
      - "*"
      - "!*.md"
  pull_request:
    paths:
      # any change to the test suite
      - "test/**"
      # ...but ignore duplicate_consistency files that belong to other schemes
      - "!test/duplicate_consistency/*.yml"
      - "test/duplicate_consistency/dilithium3aes*.yml"
      # shared code
      - "common/**"
      # the scheme itself
      - "crypto_sign/dilithium3aes/**"
      # this workflow file
      - ".github/workflows/sign_dilithium3aes.yml"
      # any file in the repository root, excluding Markdown files
      - "*"
      - "!*.md"
  schedule:
    - cron: "5 4 * * *"
name: Test dilithium3aes
jobs:
# Job test-native: runs the pytest suite for dilithium3aes inside the
# pqclean/ci-container image for each arch (amd64, i386) x compiler (gcc, clang).
  test-native:
    runs-on: ubuntu-latest
    container:
      image: pqclean/ci-container:${{ matrix.arch }}
      env:
        PQCLEAN_ONLY_SCHEMES: dilithium3aes
        CC: ccache ${{ matrix.cc }}
        CCACHE_NOSTATS: 1
        CCACHE_DIR: /ccache
        CCACHE_SLOPPINESS: include_file_mtime
    strategy:
      matrix:
        arch:
          - amd64
          - i386
        cc:
          - gcc
          - clang
    steps:
      # Only one matrix cell (amd64 + gcc) issues the cancellation request;
      # a failure here must not fail the job (continue-on-error).
      - name: Cancel Previous Runs
        uses: thomwiggers/cancel-workflow-action@all_but_latest
        with:
          all_but_latest: true
          access_token: ${{ github.token }}
        continue-on-error: true
        if: matrix.arch == 'amd64' && matrix.cc == 'gcc'
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Cache ccache
        uses: actions/cache@v2
        env:
          cache-name: cache-ccache
        with:
          path: /ccache
          # Use the `github` context for the workflow name: the `env` context
          # only contains variables defined in the workflow, so
          # `env.GITHUB_WORKFLOW` expanded to an empty string and every
          # scheme's workflow ended up sharing the same cache key.
          key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
      - name: Cache pip
        uses: actions/cache@v2
        env:
          cache-name: cache-python-pip
        with:
          path: ~/.cache/pip
          key: v1-python-pip
      - name: Install python dependencies
        run: |
          python3 -m pip install -U -r requirements.txt
      - name: Run tests
        run: |
          cd test
          python3 -m pytest --verbose --numprocesses=auto
# Job test-emulated: runs the pytest suite for dilithium3aes inside a
# foreign-architecture container (armhf, unstable-ppc) started with
# `docker run` after registering qemu-user-static.
  test-emulated:
    needs:
      - test-native
    runs-on: ubuntu-latest
    strategy:
      matrix:
        arch:
          - armhf
          - unstable-ppc
        cc:
          - gcc
          - clang
    env:
      CC: ${{ matrix.cc }}
    steps:
      - name: Register qemu-user-static
        run: |
          docker run --rm --privileged multiarch/qemu-user-static:register --reset
      - uses: actions/checkout@v2
        with:
          submodules: true
      - name: Cache ccache
        uses: actions/cache@v2
        env:
          cache-name: cache-ccache
        with:
          # Host directory that is bind-mounted to /ccache in the container below.
          path: ~/ccache
          # `github.workflow` instead of `env.GITHUB_WORKFLOW`: the `env`
          # context only holds variables defined in the workflow, so the old
          # expression was always empty and the key was not per-workflow.
          key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
      # NOTE(review): pip runs inside the container below and ~/.cache/pip is
      # not mounted into it, so this cache looks unused — confirm.
      - name: Cache pip
        uses: actions/cache@v2
        env:
          cache-name: cache-python-pip
        with:
          path: ~/.cache/pip
          key: v1-python-pip
      - name: Run tests in container
        # $CC is expanded by the host shell (it sits inside a double-quoted
        # string), so the container sees CC="ccache <matrix.cc>".
        run: |
          docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium3aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
          export CCACHE_NOSTATS=1 && \
          export CCACHE_DIR=/ccache && \
          export CCACHE_SLOPPINESS=include_file_mtime && \
          export CC=\"ccache $CC\" && \
          pip3 install -U -r requirements.txt && \
          cd test && \
          python3 -m pytest --verbose --numprocesses=auto"
test-windows:
  # Builds and tests with the Visual Studio 2019 toolchain for both 64-bit and
  # 32-bit targets once the native job has succeeded.
  needs:
    - test-native
  strategy:
    matrix:
      bits:
        - 64
        - 32
  env:
    PQCLEAN_ONLY_SCHEMES: dilithium3aes
  runs-on: windows-latest
  steps:
    - uses: actions/checkout@v2
      with:
        submodules: true
    - name: Setup astyle
      run: |
        # Setup strong crypto
        Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord
        Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord
        Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe"
      shell: powershell
    - name: Setup Python
      # Pinned to a release tag like the other actions in this workflow;
      # `@main` follows a moving branch and can change without notice.
      uses: actions/setup-python@v2
      with:
        python-version: "3.x"
    - name: Install python requirements
      run: python -m pip install -U -r requirements.txt
    - name: Run tests
      # vcvars<bits>.bat puts the matching MSVC toolchain on PATH for the
      # commands that follow in the same cmd session.
      run: |
        call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat"
        cd test
        python -m pytest --verbose --numprocesses=auto
      shell: cmd
test-macos:
  # Runs the test suite on macOS with the Xcode clang and with Homebrew GCC 9
  # once the native job has succeeded.
  needs:
    - test-native
  env:
    PQCLEAN_ONLY_SCHEMES: dilithium3aes
    CCACHE_NOSTATS: 1
    CCACHE_SLOPPINESS: include_file_mtime
    # XCode version
    DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer
  strategy:
    matrix:
      compiler:
        - clang  # XCode (Apple LLVM/Clang)
        - gcc9   # GNU (Homebrew)
  runs-on: macos-latest
  steps:
    - uses: actions/checkout@v2
      with:
        submodules: true
    - name: Install astyle
      run: |
        brew install astyle
    - name: Set up GCC9 compiler
      # Every `run` step starts a fresh shell, so a plain `export` would be
      # gone before the tests run and the gcc9 entry would silently use the
      # default compiler. Persist the settings through the runner's
      # environment files instead.
      run: |
        echo "/usr/local/bin" >> "$GITHUB_PATH"
        echo "CC=gcc-9" >> "$GITHUB_ENV"
      if: matrix.compiler == 'gcc9'
    - name: Setup Python
      # Pinned to a release tag like the other actions in this workflow;
      # `@main` follows a moving branch and can change without notice.
      uses: actions/setup-python@v2
      with:
        python-version: "3.x"
    - name: Install Python dependencies
      run: python -m pip install -U -r requirements.txt
    - name: Run tests
      run: |
        cd test
        python -m pytest --verbose --numprocesses=auto
@@ -5,13 +5,13 @@ on: | |||
- 'test/**' | |||
# do not build if other schemes duplicate_consistency files change | |||
- '!test/duplicate_consistency/*.yml' | |||
- 'test/duplicate_consistency/dilithium4*.yml' | |||
- 'test/duplicate_consistency/dilithium5*.yml' | |||
# build if common files change | |||
- 'common/**' | |||
# build if scheme changed | |||
- 'crypto_sign/dilithium4/**' | |||
- 'crypto_sign/dilithium5/**' | |||
# build if workflow file changed | |||
- '.github/workflows/sign_dilithium4.yml' | |||
- '.github/workflows/sign_dilithium5.yml' | |||
# Build if any files in the root change, except .md files | |||
- '*' | |||
- '!*.md' | |||
@@ -21,20 +21,20 @@ on: | |||
- 'test/**' | |||
# do not build if other schemes duplicate_consistency files change | |||
- '!test/duplicate_consistency/*.yml' | |||
- 'test/duplicate_consistency/dilithium4*.yml' | |||
- 'test/duplicate_consistency/dilithium5*.yml' | |||
# build if common files change | |||
- 'common/**' | |||
# build if scheme changed | |||
- 'crypto_sign/dilithium4/**' | |||
- 'crypto_sign/dilithium5/**' | |||
# build if workflow file changed | |||
- '.github/workflows/sign_dilithium4.yml' | |||
- '.github/workflows/sign_dilithium5.yml' | |||
# Build if any files in the root change, except .md files | |||
- '*' | |||
- '!*.md' | |||
schedule: | |||
- cron: '5 4 * * *' | |||
name: Test dilithium4 | |||
name: Test dilithium5 | |||
jobs: | |||
test-native: | |||
@@ -42,7 +42,7 @@ jobs: | |||
container: | |||
image: pqclean/ci-container:${{ matrix.arch }} | |||
env: | |||
PQCLEAN_ONLY_SCHEMES: dilithium4 | |||
PQCLEAN_ONLY_SCHEMES: dilithium5 | |||
CC: ccache ${{ matrix.cc }} | |||
CCACHE_NOSTATS: 1 | |||
CCACHE_DIR: /ccache | |||
@@ -124,7 +124,7 @@ jobs: | |||
key: v1-python-pip | |||
- name: Run tests in container | |||
run: | | |||
docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium4 -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\ | |||
docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium5 -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\ | |||
export CCACHE_NOSTATS=1 && \ | |||
export CCACHE_DIR=/ccache && \ | |||
export CCACHE_SLOPPINESS=include_file_mtime && \ | |||
@@ -141,7 +141,7 @@ jobs: | |||
- 64 | |||
- 32 | |||
env: | |||
PQCLEAN_ONLY_SCHEMES: dilithium4 | |||
PQCLEAN_ONLY_SCHEMES: dilithium5 | |||
runs-on: windows-latest | |||
steps: | |||
- uses: actions/checkout@v2 | |||
@@ -170,7 +170,7 @@ jobs: | |||
needs: | |||
- test-native | |||
env: | |||
PQCLEAN_ONLY_SCHEMES: dilithium4 | |||
PQCLEAN_ONLY_SCHEMES: dilithium5 | |||
CCACHE_NOSTATS: 1 | |||
CCACHE_SLOPPINESS: include_file_mtime | |||
# XCode version |
@@ -0,0 +1,204 @@ | |||
# Trigger rules for the dilithium5aes test workflow. Pushes and pull requests
# use the same path filter; a scheduled run is added on top.
on:
  push:
    paths:
      # any change to the test harness
      - "test/**"
      # ...except duplicate_consistency files that belong to other schemes
      - "!test/duplicate_consistency/*.yml"
      - "test/duplicate_consistency/dilithium5aes*.yml"
      # shared code
      - "common/**"
      # the scheme itself
      - "crypto_sign/dilithium5aes/**"
      # this workflow file
      - ".github/workflows/sign_dilithium5aes.yml"
      # files in the repository root, Markdown excluded
      - "*"
      - "!*.md"
  pull_request:
    paths:
      # any change to the test harness
      - "test/**"
      # ...except duplicate_consistency files that belong to other schemes
      - "!test/duplicate_consistency/*.yml"
      - "test/duplicate_consistency/dilithium5aes*.yml"
      # shared code
      - "common/**"
      # the scheme itself
      - "crypto_sign/dilithium5aes/**"
      # this workflow file
      - ".github/workflows/sign_dilithium5aes.yml"
      # files in the repository root, Markdown excluded
      - "*"
      - "!*.md"
  schedule:
    # daily at 04:05
    - cron: "5 4 * * *"
name: Test dilithium5aes
jobs: | |||
test-native:
  # Runs the test suite inside the amd64 and i386 CI containers with both gcc
  # and clang, using ccache as a compiler wrapper.
  runs-on: ubuntu-latest
  container:
    image: pqclean/ci-container:${{ matrix.arch }}
  env:
    PQCLEAN_ONLY_SCHEMES: dilithium5aes
    CC: ccache ${{ matrix.cc }}
    CCACHE_NOSTATS: 1
    CCACHE_DIR: /ccache
    CCACHE_SLOPPINESS: include_file_mtime
  strategy:
    matrix:
      arch:
        - amd64
        - i386
      cc:
        - gcc
        - clang
  steps:
    - name: Cancel Previous Runs
      # Executed by a single matrix entry only; a failure here does not fail
      # the job.
      uses: thomwiggers/cancel-workflow-action@all_but_latest
      with:
        all_but_latest: true
        access_token: ${{ github.token }}
      continue-on-error: true
      if: matrix.arch == 'amd64' && matrix.cc == 'gcc'
    - uses: actions/checkout@v2
      with:
        submodules: true
    - name: Cache ccache
      uses: actions/cache@v2
      env:
        cache-name: cache-ccache
      with:
        path: /ccache
        # The workflow name must come from the `github` context: the `env`
        # context only holds variables declared in the workflow itself, so
        # `env.GITHUB_WORKFLOW` expands to an empty string and every workflow
        # would share one cache key.
        key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
    - name: Cache pip
      uses: actions/cache@v2
      env:
        cache-name: cache-python-pip
      with:
        path: ~/.cache/pip
        key: v1-python-pip
    - name: Install python dependencies
      run: |
        python3 -m pip install -U -r requirements.txt
    - name: Run tests
      run: |
        cd test
        python3 -m pytest --verbose --numprocesses=auto
test-emulated:
  # Runs the test suite for the emulated (non-x86) container images once the
  # native job has succeeded.
  needs:
    - test-native
  runs-on: ubuntu-latest
  strategy:
    matrix:
      arch:
        - armhf
        - unstable-ppc
      cc:
        - gcc
        - clang
  env:
    CC: ${{ matrix.cc }}
  steps:
    - name: Register qemu-user-static
      run: |
        docker run --rm --privileged multiarch/qemu-user-static:register --reset
    - uses: actions/checkout@v2
      with:
        submodules: true
    - name: Cache ccache
      uses: actions/cache@v2
      env:
        cache-name: cache-ccache
      with:
        # Host directory that is bind-mounted to /ccache in the container below.
        path: ~/ccache
        # The workflow name must come from the `github` context: the `env`
        # context only holds variables declared in the workflow itself, so
        # `env.GITHUB_WORKFLOW` expands to an empty string and every workflow
        # would share one cache key.
        key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ github.workflow }}-${{ matrix.arch }}
    - name: Cache pip
      uses: actions/cache@v2
      env:
        cache-name: cache-python-pip
      with:
        path: ~/.cache/pip
        key: v1-python-pip
    - name: Run tests in container
      # $CC and $PWD are expanded by the host shell before docker starts, so
      # the container sees CC="ccache <matrix.cc>".
      run: |
        docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium5aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\
        export CCACHE_NOSTATS=1 && \
        export CCACHE_DIR=/ccache && \
        export CCACHE_SLOPPINESS=include_file_mtime && \
        export CC=\"ccache $CC\" && \
        pip3 install -U -r requirements.txt && \
        cd test && \
        python3 -m pytest --verbose --numprocesses=auto"
test-windows:
  # Builds and tests with the Visual Studio 2019 toolchain for both 64-bit and
  # 32-bit targets once the native job has succeeded.
  needs:
    - test-native
  strategy:
    matrix:
      bits:
        - 64
        - 32
  env:
    PQCLEAN_ONLY_SCHEMES: dilithium5aes
  runs-on: windows-latest
  steps:
    - uses: actions/checkout@v2
      with:
        submodules: true
    - name: Setup astyle
      run: |
        # Setup strong crypto
        Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord
        Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord
        Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe"
      shell: powershell
    - name: Setup Python
      # Pinned to a release tag like the other actions in this workflow;
      # `@main` follows a moving branch and can change without notice.
      uses: actions/setup-python@v2
      with:
        python-version: "3.x"
    - name: Install python requirements
      run: python -m pip install -U -r requirements.txt
    - name: Run tests
      # vcvars<bits>.bat puts the matching MSVC toolchain on PATH for the
      # commands that follow in the same cmd session.
      run: |
        call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat"
        cd test
        python -m pytest --verbose --numprocesses=auto
      shell: cmd
test-macos:
  # Runs the test suite on macOS with the Xcode clang and with Homebrew GCC 9
  # once the native job has succeeded.
  needs:
    - test-native
  env:
    PQCLEAN_ONLY_SCHEMES: dilithium5aes
    CCACHE_NOSTATS: 1
    CCACHE_SLOPPINESS: include_file_mtime
    # XCode version
    DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer
  strategy:
    matrix:
      compiler:
        - clang  # XCode (Apple LLVM/Clang)
        - gcc9   # GNU (Homebrew)
  runs-on: macos-latest
  steps:
    - uses: actions/checkout@v2
      with:
        submodules: true
    - name: Install astyle
      run: |
        brew install astyle
    - name: Set up GCC9 compiler
      # Every `run` step starts a fresh shell, so a plain `export` would be
      # gone before the tests run and the gcc9 entry would silently use the
      # default compiler. Persist the settings through the runner's
      # environment files instead.
      run: |
        echo "/usr/local/bin" >> "$GITHUB_PATH"
        echo "CC=gcc-9" >> "$GITHUB_ENV"
      if: matrix.compiler == 'gcc9'
    - name: Setup Python
      # Pinned to a release tag like the other actions in this workflow;
      # `@main` follows a moving branch and can change without notice.
      uses: actions/setup-python@v2
      with:
        python-version: "3.x"
    - name: Install Python dependencies
      run: python -m pip install -U -r requirements.txt
    - name: Run tests
      run: |
        cd test
        python -m pytest --verbose --numprocesses=auto
@@ -1,11 +1,11 @@ | |||
name: Dilithium2 | |||
type: signature | |||
claimed-nist-level: 1 | |||
length-public-key: 1184 | |||
length-secret-key: 2800 | |||
length-signature: 2044 | |||
nistkat-sha256: 23b7d52a268bbd8633d139b64a1b0e3263777cb2b074f7af0a7fd315afe94d18 | |||
testvectors-sha256: d647039ae7e1785414c64934d5ae37518f259acab95d6a6e873e9b6d3ad63dfd | |||
claimed-nist-level: 2 | |||
length-public-key: 1312 | |||
length-secret-key: 2544 | |||
length-signature: 2420 | |||
nistkat-sha256: 9c636528bf81c03df6ad8f9471cb1b4d9097d66af825d4f60b7ff0d941ca4d37 | |||
testvectors-sha256: 166fc2481358d5a1b7a528b30af36ad069b049b5755cf63b843ce0f25f35aeb6 | |||
principal-submitters: | |||
- Vadim Lyubashevsky | |||
auxiliary-submitters: | |||
@@ -17,15 +17,15 @@ auxiliary-submitters: | |||
- Damien Stehlé | |||
implementations: | |||
- name: clean | |||
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
- name: avx2 | |||
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Darwin | |||
- Linux | |||
required_flags: | |||
- avx2 | |||
- bmi1 | |||
- popcnt | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- aes | |||
- avx2 | |||
- popcnt |
@@ -1,6 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and the random number generator | |||
we are using public-domain code from sources | |||
and by authors listed in comments on top of | |||
the respective files. | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -1,34 +1,27 @@ | |||
# This Makefile can be used with GNU Make or BSD Make

# Static library produced by this directory.
LIB=libdilithium2_avx2.a

# Every object is rebuilt when any of these headers changes.
HEADERS=align.h api.h cdecl.h consts.h fips202x4.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc
OBJECTS=consts.o fips202x4.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o symmetric-shake.o f1600x4.o invntt.o ntt.o pointwise.o shuffle.o

# 4-way Keccak object; it is built by the Makefile in its own directory and
# then added to the archive.
KECCAK4XDIR=../../../common/keccak4x
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o
KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)

# EXTRAFLAGS lets the caller append flags without replacing this set.
CFLAGS=-mavx2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \
       -Wmissing-prototypes -Wredundant-decls \
       -Wpointer-arith -Wshadow \
       -std=c99 -I../../../common $(EXTRAFLAGS)

.PHONY: all
all: $(LIB)

%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

# Assembly sources go through the C preprocessor and include project headers,
# so they are compiled once, with the same CFLAGS as the C sources.
%.o: %.S $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

# The archive members are listed explicitly instead of using $^, which is a
# GNU Make extension.
$(LIB): $(OBJECTS) $(KECCAK4X)
	$(AR) -r $@ $(OBJECTS) $(KECCAK4X)

$(KECCAK4X):
	$(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ)
@@ -36,5 +29,3 @@ $(KECCAK4X): | |||
# Remove the objects and the archive built here, then let the keccak4x
# Makefile clean its own outputs. Declared phony so that a stray file named
# "clean" cannot make this target look up to date.
.PHONY: clean
clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)
	$(MAKE) -C $(KECCAK4XDIR) clean
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGN_H
#define PQCLEAN_DILITHIUM2_AVX2_ALIGN_H

#include <immintrin.h>
#include <stdint.h>

/*
 * Anonymous union types that overlay a scalar array (`coeffs`) with an array
 * of 256-bit vectors (`vec`). The vector count is the byte size of the scalar
 * array divided by the vector size, rounded up, so `vec` always covers all of
 * `coeffs`.
 */

/* N bytes, viewable as ceil(N / 32) vectors. */
#define ALIGNED_UINT8(N)                                                     \
    union {                                                                  \
        uint8_t coeffs[N];                                                   \
        __m256i vec[((N) * sizeof(uint8_t) + sizeof(__m256i) - 1)            \
                    / sizeof(__m256i)];                                      \
    }

/* N 32-bit signed integers, viewable as ceil(N / 8) vectors. */
#define ALIGNED_INT32(N)                                                     \
    union {                                                                  \
        int32_t coeffs[N];                                                   \
        __m256i vec[((N) * sizeof(int32_t) + sizeof(__m256i) - 1)            \
                    / sizeof(__m256i)];                                      \
    }

#endif
@@ -1,22 +0,0 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H
#define PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H

/* The macros below use uint32_t/uint64_t and __m256i, so pull in their
 * declarations here instead of relying on the includer's include order. */
#include <immintrin.h>
#include <stdint.h>

/*
 * Anonymous union types that overlay a scalar array (`as_arr`) with an array
 * of 256-bit vectors (`as_vec`). Vector counts are rounded up so that N need
 * not be a multiple of the number of elements per vector.
 */

/* NOTE(review): as_arr is declared uint32_t although the macro is named
 * UINT8; the element type is left unchanged here to keep the interface
 * stable -- confirm whether uint8_t was intended. */
#define ALIGNED_UINT8(N)                \
    union {                             \
        uint32_t as_arr[N];             \
        __m256i as_vec[((N) + 31) / 32];\
    }

/* Eight uint32_t values fit in one 256-bit vector. */
#define ALIGNED_UINT32(N)               \
    union {                             \
        uint32_t as_arr[N];             \
        __m256i as_vec[((N) + 7) / 8];  \
    }

/* Four uint64_t values fit in one 256-bit vector, so the vector view needs
 * N / 4 entries (rounded up) to cover the whole array, not N / 8. */
#define ALIGNED_UINT64(N)               \
    union {                             \
        uint64_t as_arr[N];             \
        __m256i as_vec[((N) + 3) / 4];  \
    }

#endif //PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H
@@ -4,26 +4,13 @@ | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1184U | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2800U | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2044U | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1312 | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2544 | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2420 | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2" | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair( | |||
uint8_t *pk, uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *msg, size_t len, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, | |||
const uint8_t *pk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
@@ -33,6 +20,12 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk); | |||
#endif |
@@ -1,5 +1,14 @@ | |||
#ifndef PQCLEAN_DILITHIUM4_AVX2_CDECL | |||
#define PQCLEAN_DILITHIUM4_AVX2_CDECL | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_H | |||
#define _8XQ 0 | |||
#define _8XQINV 8 | |||
#define _8XDIV_QINV 16 | |||
#define _8XDIV 24 | |||
#define _ZETAS_QINV 32 | |||
#define _ZETAS 328 | |||
/* The C ABI on MacOS exports all symbols with a leading | |||
* underscore. This means that any symbols we refer to from | |||
@@ -9,10 +18,7 @@ | |||
* This define helps us get around this | |||
*/ | |||
#if defined(__WIN32__) || defined(__APPLE__) | |||
#define cdecl(s) _##s | |||
#else | |||
#define _cdecl(s) _##s | |||
#define cdecl(s) s | |||
#endif | |||
#endif |
@@ -0,0 +1,101 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
// Scalar constants used to fill the table below. Q itself is not defined in
// this file; presumably it comes from params.h -- confirm.
#define QINV 58728449 // q^(-1) mod 2^32 | |||
// Defined but not referenced anywhere else in this file.
#define MONT (-4186625) // 2^32 mod q | |||
#define DIV 41978 // mont^2/256 | |||
// Equals DIV * QINV reduced modulo 2^32 and read as a signed 32-bit value.
#define DIV_QINV (-8395782) | |||
// Constant table of 624 int32 entries: four 8-entry blocks that each repeat
// one scalar, followed by two 296-entry blocks. The "//#define NAME n"
// markers inside the initializer give the index of the first entry of each
// block; they repeat the offsets defined in cdecl.h, so the two must be kept
// in sync.
const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata = {{ | |||
//#define _8XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, | |||
//#define _8XQINV 8 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
//#define _8XDIV_QINV 16 | |||
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, | |||
//#define _8XDIV 24 | |||
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, | |||
// 296 entries (indices 32..327). Presumably the _ZETAS values below, each
// pre-multiplied by QINV modulo 2^32 -- TODO confirm.
//#define _ZETAS_QINV 32 | |||
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, | |||
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, | |||
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, | |||
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, | |||
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, | |||
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904, | |||
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, | |||
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, | |||
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, | |||
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, | |||
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, | |||
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, | |||
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, | |||
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, | |||
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, | |||
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, | |||
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, | |||
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, | |||
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, | |||
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, | |||
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, | |||
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, | |||
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, | |||
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, | |||
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, | |||
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722, | |||
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, | |||
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, | |||
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, | |||
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, | |||
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, | |||
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, | |||
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, | |||
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, | |||
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, | |||
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, | |||
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, | |||
// 296 entries (indices 328..623), laid out row for row like the block above.
//#define _ZETAS 328 | |||
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, | |||
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, | |||
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, | |||
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, | |||
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, | |||
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, | |||
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, | |||
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, | |||
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, | |||
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, | |||
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, | |||
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, | |||
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, | |||
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, | |||
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, | |||
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, | |||
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, | |||
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, | |||
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, | |||
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, | |||
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, | |||
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, | |||
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, | |||
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, | |||
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, | |||
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, | |||
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, | |||
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, | |||
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, | |||
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, | |||
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, | |||
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, | |||
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, | |||
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, | |||
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, | |||
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, | |||
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, | |||
} | |||
}; |
@@ -0,0 +1,10 @@ | |||
/* Declares the shared constant table for the Dilithium2 AVX2 code. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_CONSTS_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_CONSTS_H | |||
#include "align.h" | |||
#include "cdecl.h" | |||
/* 624 int32 entries: 32 broadcast constants followed by two 296-entry
 * blocks, matching the initializer in consts.c. */
typedef ALIGNED_INT32(624) qdata_t; | |||
/* Defined in consts.c. */
extern const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata; | |||
#endif |
@@ -0,0 +1,909 @@ | |||
/* Taken from Bas Westerbaan's new 4-way SHAKE implementation | |||
* for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/), | |||
* but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */ | |||
#include "cdecl.h" | |||
/* Byte-shuffle masks for vpshufb. They stand in for the shift/shift/or
 * sequences that are left commented out in the round code below. */
.data | |||
/* 32-byte alignment: rho8 is loaded with vmovdqa, which needs it. */
.p2align 5 | |||
/* Rotates every 64-bit lane left by 8 bits (replaces vpsllq $8 / vpsrlq $56). */
rho8: | |||
.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14 | |||
/* Rotates every 64-bit lane left by 56 bits (replaces vpsllq $56 / vpsrlq $8). */
rho56: | |||
.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8 | |||
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4): | |||
vmovdqa rho8(%rip), %ymm0 | |||
movq $6, %rax | |||
looptop: | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 192(%rdi), %ymm4, %ymm9 | |||
vpxor 384(%rdi), %ymm3, %ymm10 | |||
vpxor 576(%rdi), %ymm2, %ymm11 | |||
vpxor 768(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 0(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 96(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 320(%rdi), %ymm5, %ymm10 | |||
vpxor 512(%rdi), %ymm4, %ymm11 | |||
vpxor 704(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 32(%rdi), %ymm4, %ymm8 | |||
vpxor 224(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 608(%rdi), %ymm1, %ymm11 | |||
vpxor 640(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 128(%rdi), %ymm1, %ymm8 | |||
vpxor 160(%rdi), %ymm5, %ymm9 | |||
vpxor 352(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 736(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 64(%rdi), %ymm3, %ymm8 | |||
vpxor 256(%rdi), %ymm2, %ymm9 | |||
vpxor 448(%rdi), %ymm1, %ymm10 | |||
vpxor 480(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 512(%rdi), %ymm4, %ymm9 | |||
vpxor 224(%rdi), %ymm3, %ymm10 | |||
vpxor 736(%rdi), %ymm2, %ymm11 | |||
vpxor 448(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 8(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 576(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 640(%rdi), %ymm5, %ymm10 | |||
vpxor 352(%rdi), %ymm4, %ymm11 | |||
vpxor 64(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 192(%rdi), %ymm4, %ymm8 | |||
vpxor 704(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 128(%rdi), %ymm1, %ymm11 | |||
vpxor 480(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 768(%rdi), %ymm1, %ymm8 | |||
vpxor 320(%rdi), %ymm5, %ymm9 | |||
vpxor 32(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 256(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 384(%rdi), %ymm3, %ymm8 | |||
vpxor 96(%rdi), %ymm2, %ymm9 | |||
vpxor 608(%rdi), %ymm1, %ymm10 | |||
vpxor 160(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 352(%rdi), %ymm4, %ymm9 | |||
vpxor 704(%rdi), %ymm3, %ymm10 | |||
vpxor 256(%rdi), %ymm2, %ymm11 | |||
vpxor 608(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 16(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 736(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 480(%rdi), %ymm5, %ymm10 | |||
vpxor 32(%rdi), %ymm4, %ymm11 | |||
vpxor 384(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 512(%rdi), %ymm4, %ymm8 | |||
vpxor 64(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 768(%rdi), %ymm1, %ymm11 | |||
vpxor 160(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
vpxor 448(%rdi), %ymm1, %ymm8 | |||
vpxor 640(%rdi), %ymm5, %ymm9 | |||
vpxor 192(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 96(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 224(%rdi), %ymm3, %ymm8 | |||
vpxor 576(%rdi), %ymm2, %ymm9 | |||
vpxor 128(%rdi), %ymm1, %ymm10 | |||
vpxor 320(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vmovdqa 0(%rdi), %ymm8 | |||
vmovdqa 32(%rdi), %ymm9 | |||
vmovdqa 64(%rdi), %ymm10 | |||
vmovdqa 96(%rdi), %ymm11 | |||
vmovdqa 128(%rdi), %ymm12 | |||
vpxor 160(%rdi), %ymm8, %ymm8 | |||
vpxor 192(%rdi), %ymm9, %ymm9 | |||
vpxor 224(%rdi), %ymm10, %ymm10 | |||
vpxor 256(%rdi), %ymm11, %ymm11 | |||
vpxor 288(%rdi), %ymm12, %ymm12 | |||
vpxor 320(%rdi), %ymm8, %ymm8 | |||
vpxor 352(%rdi), %ymm9, %ymm9 | |||
vpxor 384(%rdi), %ymm10, %ymm10 | |||
vpxor 416(%rdi), %ymm11, %ymm11 | |||
vpxor 448(%rdi), %ymm12, %ymm12 | |||
vpxor 480(%rdi), %ymm8, %ymm8 | |||
vpxor 512(%rdi), %ymm9, %ymm9 | |||
vpxor 544(%rdi), %ymm10, %ymm10 | |||
vpxor 576(%rdi), %ymm11, %ymm11 | |||
vpxor 608(%rdi), %ymm12, %ymm12 | |||
vpxor 640(%rdi), %ymm8, %ymm8 | |||
vpxor 672(%rdi), %ymm9, %ymm9 | |||
vpxor 704(%rdi), %ymm10, %ymm10 | |||
vpxor 736(%rdi), %ymm11, %ymm11 | |||
vpxor 768(%rdi), %ymm12, %ymm12 | |||
vpsllq $1, %ymm9, %ymm13 | |||
vpsllq $1, %ymm10, %ymm14 | |||
vpsllq $1, %ymm11, %ymm15 | |||
vpsllq $1, %ymm12, %ymm7 | |||
vpsllq $1, %ymm8, %ymm6 | |||
vpsrlq $63, %ymm9, %ymm5 | |||
vpsrlq $63, %ymm10, %ymm4 | |||
vpsrlq $63, %ymm11, %ymm3 | |||
vpsrlq $63, %ymm12, %ymm2 | |||
vpsrlq $63, %ymm8, %ymm1 | |||
vpor %ymm13, %ymm5, %ymm5 | |||
vpor %ymm14, %ymm4, %ymm4 | |||
vpor %ymm15, %ymm3, %ymm3 | |||
vpor %ymm7, %ymm2, %ymm2 | |||
vpor %ymm6, %ymm1, %ymm1 | |||
vpxor %ymm5, %ymm12, %ymm5 | |||
vpxor %ymm4, %ymm8, %ymm4 | |||
vpxor %ymm3, %ymm9, %ymm3 | |||
vpxor %ymm2, %ymm10, %ymm2 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor 0(%rdi), %ymm5, %ymm8 | |||
vpxor 32(%rdi), %ymm4, %ymm9 | |||
vpxor 64(%rdi), %ymm3, %ymm10 | |||
vpxor 96(%rdi), %ymm2, %ymm11 | |||
vpxor 128(%rdi), %ymm1, %ymm12 | |||
vpsllq $44, %ymm9, %ymm14 | |||
vpsllq $43, %ymm10, %ymm15 | |||
vpsllq $21, %ymm11, %ymm7 | |||
vpsllq $14, %ymm12, %ymm6 | |||
vpsrlq $20, %ymm9, %ymm9 | |||
vpsrlq $21, %ymm10, %ymm10 | |||
vpsrlq $43, %ymm11, %ymm11 | |||
vpsrlq $50, %ymm12, %ymm12 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vpbroadcastq 24(%rsi), %ymm8 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vmovdqa %ymm13, 0(%rdi) | |||
vmovdqa %ymm14, 32(%rdi) | |||
vmovdqa %ymm15, 64(%rdi) | |||
vmovdqa %ymm7, 96(%rdi) | |||
vmovdqa %ymm6, 128(%rdi) | |||
vpxor 256(%rdi), %ymm2, %ymm8 | |||
vpxor 288(%rdi), %ymm1, %ymm9 | |||
vpxor 160(%rdi), %ymm5, %ymm10 | |||
vpxor 192(%rdi), %ymm4, %ymm11 | |||
vpxor 224(%rdi), %ymm3, %ymm12 | |||
vpsllq $28, %ymm8, %ymm13 | |||
vpsllq $20, %ymm9, %ymm14 | |||
vpsllq $3, %ymm10, %ymm15 | |||
vpsllq $45, %ymm11, %ymm7 | |||
vpsllq $61, %ymm12, %ymm6 | |||
vpsrlq $36, %ymm8, %ymm8 | |||
vpsrlq $44, %ymm9, %ymm9 | |||
vpsrlq $61, %ymm10, %ymm10 | |||
vpsrlq $19, %ymm11, %ymm11 | |||
vpsrlq $3, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 160(%rdi) | |||
vmovdqa %ymm14, 192(%rdi) | |||
vmovdqa %ymm15, 224(%rdi) | |||
vmovdqa %ymm7, 256(%rdi) | |||
vmovdqa %ymm6, 288(%rdi) | |||
vpxor 352(%rdi), %ymm4, %ymm8 | |||
vpxor 384(%rdi), %ymm3, %ymm9 | |||
vpxor 416(%rdi), %ymm2, %ymm10 | |||
vpxor 448(%rdi), %ymm1, %ymm11 | |||
vpxor 320(%rdi), %ymm5, %ymm12 | |||
vpsllq $1, %ymm8, %ymm13 | |||
vpsllq $6, %ymm9, %ymm14 | |||
vpsllq $25, %ymm10, %ymm15 | |||
#vpsllq $8, %ymm11, %ymm7 | |||
vpsllq $18, %ymm12, %ymm6 | |||
vpsrlq $63, %ymm8, %ymm8 | |||
vpsrlq $58, %ymm9, %ymm9 | |||
vpsrlq $39, %ymm10, %ymm10 | |||
#vpsrlq $56, %ymm11, %ymm11 | |||
vpsrlq $46, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
#vpor %ymm7, %ymm11, %ymm11 | |||
vpshufb %ymm0, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 320(%rdi) | |||
vmovdqa %ymm14, 352(%rdi) | |||
vmovdqa %ymm15, 384(%rdi) | |||
vmovdqa %ymm7, 416(%rdi) | |||
vmovdqa %ymm6, 448(%rdi) | |||
vpxor 608(%rdi), %ymm1, %ymm8 | |||
vpxor 480(%rdi), %ymm5, %ymm9 | |||
vpxor 512(%rdi), %ymm4, %ymm10 | |||
vpxor 544(%rdi), %ymm3, %ymm11 | |||
vpxor 576(%rdi), %ymm2, %ymm12 | |||
vpsllq $27, %ymm8, %ymm13 | |||
vpsllq $36, %ymm9, %ymm14 | |||
vpsllq $10, %ymm10, %ymm15 | |||
vpsllq $15, %ymm11, %ymm7 | |||
#vpsllq $56, %ymm12, %ymm6 | |||
vpsrlq $37, %ymm8, %ymm8 | |||
vpsrlq $28, %ymm9, %ymm9 | |||
vpsrlq $54, %ymm10, %ymm10 | |||
vpsrlq $49, %ymm11, %ymm11 | |||
#vpsrlq $8, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
#vpor %ymm6, %ymm12, %ymm12 | |||
vpshufb rho56(%rip), %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 480(%rdi) | |||
vmovdqa %ymm14, 512(%rdi) | |||
vmovdqa %ymm15, 544(%rdi) | |||
vmovdqa %ymm7, 576(%rdi) | |||
vmovdqa %ymm6, 608(%rdi) | |||
vpxor 704(%rdi), %ymm3, %ymm8 | |||
vpxor 736(%rdi), %ymm2, %ymm9 | |||
vpxor 768(%rdi), %ymm1, %ymm10 | |||
vpxor 640(%rdi), %ymm5, %ymm11 | |||
vpxor 672(%rdi), %ymm4, %ymm12 | |||
vpsllq $62, %ymm8, %ymm13 | |||
vpsllq $55, %ymm9, %ymm14 | |||
vpsllq $39, %ymm10, %ymm15 | |||
vpsllq $41, %ymm11, %ymm7 | |||
vpsllq $2, %ymm12, %ymm6 | |||
vpsrlq $2, %ymm8, %ymm8 | |||
vpsrlq $9, %ymm9, %ymm9 | |||
vpsrlq $25, %ymm10, %ymm10 | |||
vpsrlq $23, %ymm11, %ymm11 | |||
vpsrlq $62, %ymm12, %ymm12 | |||
vpor %ymm13, %ymm8, %ymm8 | |||
vpor %ymm14, %ymm9, %ymm9 | |||
vpor %ymm15, %ymm10, %ymm10 | |||
vpor %ymm7, %ymm11, %ymm11 | |||
vpor %ymm6, %ymm12, %ymm12 | |||
vpandn %ymm10, %ymm9, %ymm13 | |||
vpandn %ymm11, %ymm10, %ymm14 | |||
vpandn %ymm12, %ymm11, %ymm15 | |||
vpandn %ymm8, %ymm12, %ymm7 | |||
vpandn %ymm9, %ymm8, %ymm6 | |||
vpxor %ymm8, %ymm13, %ymm13 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm10, %ymm15, %ymm15 | |||
vpxor %ymm11, %ymm7, %ymm7 | |||
vpxor %ymm12, %ymm6, %ymm6 | |||
vmovdqa %ymm13, 640(%rdi) | |||
vmovdqa %ymm14, 672(%rdi) | |||
vmovdqa %ymm15, 704(%rdi) | |||
vmovdqa %ymm7, 736(%rdi) | |||
vmovdqa %ymm6, 768(%rdi) | |||
addq $32, %rsi | |||
subq $1, %rax | |||
jnz looptop | |||
ret |
@@ -1,233 +1,219 @@ | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include "fips202.h" | |||
#include "fips202x4.h" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* Number of entries in the KeccakF_RoundConstants table declared in this file. */
#define NROUNDS 24 | |||
/*
 * Rotate a left by offset bits, written as a left shift XOR a right shift by
 * (64 - offset). Assumes a is an unsigned 64-bit value - TODO confirm at call
 * sites. offset must be in 1..63: offset == 0 would make the right shift
 * count 64, which is undefined for a 64-bit operand.
 */
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) | |||
/*
 * Assemble a 64-bit unsigned integer from the 8 bytes x[0..7], with x[0] as
 * the least significant byte (little-endian order).
 *
 * x: pointer to at least 8 readable bytes.
 * Returns the assembled value.
 */
static uint64_t load64(const uint8_t *x) { | |||
uint64_t r = 0; | |||
for (size_t i = 0; i < 8; ++i) { | |||
/* The cast widens the byte to 64 bits before it is shifted by 8 * i. */
r |= (uint64_t)x[i] << 8 * i; | |||
} | |||
return r; | |||
} | |||
/*
 * Write the 64-bit value u to x[0..7], least significant byte first
 * (little-endian order); the inverse of load64.
 *
 * x: pointer to at least 8 writable bytes.
 * u: value to serialize.
 */
static void store64(uint8_t *x, uint64_t u) { | |||
for (size_t i = 0; i < 8; ++i) { | |||
/* The cast keeps only the low 8 bits of the shifted value. */
x[i] = (uint8_t)(u >> 8 * i); | |||
} | |||
} | |||
/* Use implementation from the Keccak Code Package */ | |||
/*
 * The permutation is defined outside this file and takes a pointer to the
 * __m256i state. KeccakF1600_StatePermute4x is a local alias for it.
 * NOTE(review): in this diff rendering the alias is only used by lines that
 * appear to belong to the removed implementation - confirm it is still needed.
 */
extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); | |||
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds | |||
/*
 * keccak_absorb4x signature: state pointer s, block size r in bytes, four
 * message pointers m0..m3 of mlen bytes each, and a byte p that the body
 * stores right after the message bytes.
 * NOTE(review): this text is a rendered diff. Only the signature is
 * contiguous here; the statements that use m0..m3, mlen and t0..t3 further
 * down appear to be this function's body, interleaved with
 * keccakx4_absorb_once - confirm against the real source file.
 */
static void keccak_absorb4x(__m256i *s, | |||
uint8_t r, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen, | |||
uint8_t p) { | |||
/* Keccak round constants */ | |||
/*
 * Table of NROUNDS 64-bit constants. The absorb and squeeze routines in this
 * file pass it as the second argument of PQCLEAN_DILITHIUM2_AVX2_f1600x4.
 */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = { | |||
(uint64_t)0x0000000000000001ULL, | |||
(uint64_t)0x0000000000008082ULL, | |||
(uint64_t)0x800000000000808aULL, | |||
(uint64_t)0x8000000080008000ULL, | |||
(uint64_t)0x000000000000808bULL, | |||
(uint64_t)0x0000000080000001ULL, | |||
(uint64_t)0x8000000080008081ULL, | |||
(uint64_t)0x8000000000008009ULL, | |||
(uint64_t)0x000000000000008aULL, | |||
(uint64_t)0x0000000000000088ULL, | |||
(uint64_t)0x0000000080008009ULL, | |||
(uint64_t)0x000000008000000aULL, | |||
(uint64_t)0x000000008000808bULL, | |||
(uint64_t)0x800000000000008bULL, | |||
(uint64_t)0x8000000000008089ULL, | |||
(uint64_t)0x8000000000008003ULL, | |||
(uint64_t)0x8000000000008002ULL, | |||
(uint64_t)0x8000000000000080ULL, | |||
(uint64_t)0x000000000000800aULL, | |||
(uint64_t)0x800000008000000aULL, | |||
(uint64_t)0x8000000080008081ULL, | |||
(uint64_t)0x8000000000008080ULL, | |||
(uint64_t)0x0000000080000001ULL, | |||
(uint64_t)0x8000000080008008ULL | |||
}; | |||
/*
 * keccakx4_absorb_once: clear the 25-word state s, absorb four messages of
 * inlen bytes each (in0..in3), and XOR in the final padding. Element k of
 * every __m256i word belongs to message in_k.
 *
 * s:        25 x __m256i state, overwritten.
 * r:        block size in bytes; r / 8 state words are XORed per block.
 * in0..in3: the four inputs, each inlen bytes long.
 * p:        byte XORed into the state directly after the last message byte.
 *
 * NOTE(review): this text is a rendered diff. Statements that use m0..m3,
 * mlen, t0..t3, ss, load64 and KeccakF1600_StatePermute4x appear to be the
 * removed keccak_absorb4x body interleaved with this function, so the span
 * is not compilable as shown - confirm against the real source file.
 */
static void keccakx4_absorb_once(__m256i s[25], | |||
unsigned int r, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen, | |||
uint8_t p) { | |||
size_t i; | |||
uint8_t t0[200]; | |||
uint8_t t1[200]; | |||
uint8_t t2[200]; | |||
uint8_t t3[200]; | |||
uint64_t *ss = (uint64_t *)s; | |||
uint64_t pos = 0; | |||
__m256i t, idx; | |||
for (i = 0; i < 25; ++i) { | |||
s[i] = _mm256_xor_si256(s[i], s[i]); | |||
s[i] = _mm256_setzero_si256(); | |||
} | |||
while (mlen >= r) { | |||
/*
 * The four input pointers become the 64-bit gather indices. With base
 * (long long *)pos and scale 1, each gather loads one 64-bit word from
 * in_k + pos for k = 0..3.
 */
idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); | |||
while (inlen >= r) { | |||
for (i = 0; i < r / 8; ++i) { | |||
ss[4 * i + 0] ^= load64(m0 + 8 * i); | |||
ss[4 * i + 1] ^= load64(m1 + 8 * i); | |||
ss[4 * i + 2] ^= load64(m2 + 8 * i); | |||
ss[4 * i + 3] ^= load64(m3 + 8 * i); | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
inlen -= r; | |||
KeccakF1600_StatePermute4x(s); | |||
mlen -= r; | |||
m0 += r; | |||
m1 += r; | |||
m2 += r; | |||
m3 += r; | |||
PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); | |||
} | |||
for (i = 0; i < r; ++i) { | |||
t0[i] = 0; | |||
t1[i] = 0; | |||
t2[i] = 0; | |||
t3[i] = 0; | |||
} | |||
for (i = 0; i < mlen; ++i) { | |||
t0[i] = m0[i]; | |||
t1[i] = m1[i]; | |||
t2[i] = m2[i]; | |||
t3[i] = m3[i]; | |||
for (i = 0; i < inlen / 8; ++i) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
/* After this subtraction inlen is the number of leftover bytes, 0..7. */
inlen -= 8 * i; | |||
t0[i] = p; | |||
t1[i] = p; | |||
t2[i] = p; | |||
t3[i] = p; | |||
t0[r - 1] |= 128; | |||
t1[r - 1] |= 128; | |||
t2[r - 1] |= 128; | |||
t3[r - 1] |= 128; | |||
for (i = 0; i < r / 8; ++i) { | |||
ss[4 * i + 0] ^= load64(t0 + 8 * i); | |||
ss[4 * i + 1] ^= load64(t1 + 8 * i); | |||
ss[4 * i + 2] ^= load64(t2 + 8 * i); | |||
ss[4 * i + 3] ^= load64(t3 + 8 * i); | |||
/*
 * Partial last word: gather a whole 64-bit word per input, then keep only
 * the low 8 * inlen bits.
 * NOTE(review): the gather still loads 8 bytes from each in_k + pos, so it
 * reads up to 7 bytes past the end of every input - confirm that callers'
 * buffers make this safe.
 */
if (inlen) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); | |||
t = _mm256_and_si256(t, idx); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
} | |||
} | |||
/* XOR p into word i at byte offset inlen, directly after the message. */
t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
/* Flip the most significant bit of the last word of the block (r / 8 - 1). */
t = _mm256_set1_epi64x((long long)(1ULL << 63)); | |||
s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); | |||
} | |||
/*
 * keccakx4_squeezeblocks: for each of nblocks blocks, permute the state and
 * then write r bytes to each of out0..out3, advancing the four output
 * pointers by r. 64-bit element k of every state word goes to out_k.
 *
 * out0..out3: output buffers, nblocks * r bytes are written to each.
 * nblocks:    number of r-byte blocks to produce.
 * r:          block size in bytes; r / 8 state words are written per block.
 * s:          25 x __m256i state, updated in place.
 *
 * NOTE(review): this text is a rendered diff. The h0..h3 parameters and the
 * statements that use ss, store64 and KeccakF1600_StatePermute4x appear to
 * be the removed keccak_squeezeblocks4x interleaved with this function -
 * confirm against the real source file.
 */
static void keccak_squeezeblocks4x(uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
static void keccakx4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
uint8_t r, | |||
__m256i *s) { | |||
uint64_t *ss = (uint64_t *)s; | |||
unsigned int r, | |||
__m256i s[25]) { | |||
unsigned int i; | |||
__m128d t; | |||
while (nblocks > 0) { | |||
KeccakF1600_StatePermute4x(s); | |||
for (size_t i = 0; i < r / 8; ++i) { | |||
store64(h0 + 8 * i, ss[4 * i + 0]); | |||
store64(h1 + 8 * i, ss[4 * i + 1]); | |||
store64(h2 + 8 * i, ss[4 * i + 2]); | |||
store64(h3 + 8 * i, ss[4 * i + 3]); | |||
PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); | |||
for (i = 0; i < r / 8; ++i) { | |||
/* Lower 128 bits of s[i]: low 64 bits go to out0, high 64 bits to out1. */
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | |||
_mm_storel_pd((double *)&out0[8 * i], t); | |||
_mm_storeh_pd((double *)&out1[8 * i], t); | |||
/* Upper 128 bits of s[i]: low 64 bits go to out2, high 64 bits to out3. */
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | |||
_mm_storel_pd((double *)&out2[8 * i], t); | |||
_mm_storeh_pd((double *)&out3[8 * i], t); | |||
} | |||
h0 += r; | |||
h1 += r; | |||
h2 += r; | |||
h3 += r; | |||
out0 += r; | |||
out1 += r; | |||
out2 += r; | |||
out3 += r; | |||
--nblocks; | |||
} | |||
} | |||
/*
 * NOTE(review): rendered diff. This first signature and its call to
 * keccak_absorb4x appear to be the removed entry point, which took the raw
 * __m256i state; it shares the closing brace with the function that follows.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x( | |||
__m256i *s, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen) { | |||
keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F); | |||
/*
 * Absorb four inputs of inlen bytes each into state->s by forwarding to
 * keccakx4_absorb_once with block size SHAKE128_RATE and 0x1F as the byte p
 * appended after the message.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); | |||
} | |||
/*
 * NOTE(review): rendered diff. This first signature and its call to
 * keccak_squeezeblocks4x appear to be the removed entry point, which took
 * the raw __m256i state; it shares the closing brace with the function that
 * follows.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t nblocks, | |||
__m256i *s) { | |||
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s); | |||
/*
 * Write nblocks blocks of SHAKE128_RATE bytes to each of out0..out3 from
 * state->s by forwarding to keccakx4_squeezeblocks.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); | |||
} | |||
/*
 * NOTE(review): rendered diff. This first signature and its call to
 * keccak_absorb4x appear to be the removed entry point, which took the raw
 * __m256i state; it shares the closing brace with the function that follows.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x( | |||
__m256i *s, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen) { | |||
keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F); | |||
/*
 * Absorb four inputs of inlen bytes each into state->s by forwarding to
 * keccakx4_absorb_once with block size SHAKE256_RATE and 0x1F as the byte p
 * appended after the message.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t nblocks, | |||
__m256i *s) { | |||
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128_4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t hlen, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen) { | |||
size_t nblocks = hlen / SHAKE128_RATE; | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
unsigned int i; | |||
size_t nblocks = outlen / SHAKE128_RATE; | |||
uint8_t t[4][SHAKE128_RATE]; | |||
__m256i s[25]; | |||
PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen); | |||
PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); | |||
h0 += nblocks * SHAKE128_RATE; | |||
h1 += nblocks * SHAKE128_RATE; | |||
h2 += nblocks * SHAKE128_RATE; | |||
h3 += nblocks * SHAKE128_RATE; | |||
hlen -= nblocks * SHAKE128_RATE; | |||
if (hlen) { | |||
PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); | |||
for (size_t i = 0; i < hlen; ++i) { | |||
h0[i] = t[0][i]; | |||
h1[i] = t[1][i]; | |||
h2[i] = t[2][i]; | |||
h3[i] = t[3][i]; | |||
keccakx4_state state; | |||
PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE128_RATE; | |||
out1 += nblocks * SHAKE128_RATE; | |||
out2 += nblocks * SHAKE128_RATE; | |||
out3 += nblocks * SHAKE128_RATE; | |||
outlen -= nblocks * SHAKE128_RATE; | |||
if (outlen) { | |||
PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256_4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t hlen, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen) { | |||
size_t nblocks = hlen / SHAKE256_RATE; | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
unsigned int i; | |||
size_t nblocks = outlen / SHAKE256_RATE; | |||
uint8_t t[4][SHAKE256_RATE]; | |||
__m256i s[25]; | |||
PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen); | |||
PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); | |||
h0 += nblocks * SHAKE256_RATE; | |||
h1 += nblocks * SHAKE256_RATE; | |||
h2 += nblocks * SHAKE256_RATE; | |||
h3 += nblocks * SHAKE256_RATE; | |||
hlen -= nblocks * SHAKE256_RATE; | |||
if (hlen) { | |||
PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); | |||
for (size_t i = 0; i < hlen; ++i) { | |||
h0[i] = t[0][i]; | |||
h1[i] = t[1][i]; | |||
h2[i] = t[2][i]; | |||
h3[i] = t[3][i]; | |||
keccakx4_state state; | |||
PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE256_RATE; | |||
out1 += nblocks * SHAKE256_RATE; | |||
out2 += nblocks * SHAKE256_RATE; | |||
out3 += nblocks * SHAKE256_RATE; | |||
outlen -= nblocks * SHAKE256_RATE; | |||
if (outlen) { | |||
PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
} |
@@ -5,62 +5,60 @@ | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "params.h" | |||
typedef struct { | |||
__m256i s[25]; | |||
} keccakx4_state; | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x( | |||
__m256i *s, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen); | |||
void PQCLEAN_DILITHIUM2_AVX2_f1600x4(__m256i *s, const uint64_t *rc); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t nblocks, | |||
__m256i *s); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x( | |||
__m256i *s, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t nblocks, | |||
__m256i *s); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128_4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t hlen, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256_4x( | |||
uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
size_t hlen, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen); | |||
#endif |
@@ -1,282 +1,240 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
#include "cdecl.inc" | |||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | |||
vpaddd %ymm2,%ymm\l0,%ymm12 | |||
vpaddd %ymm2,%ymm\l1,%ymm13 | |||
vpaddd %ymm2,%ymm\l2,%ymm14 | |||
vpsubd %ymm\h0,%ymm12,%ymm12 | |||
vpsubd %ymm\h1,%ymm13,%ymm13 | |||
vpsubd %ymm\h2,%ymm14,%ymm14 | |||
vpmuludq %ymm\z0,%ymm12,%ymm12 | |||
vpmuludq %ymm\z0,%ymm13,%ymm13 | |||
vpaddd %ymm2,%ymm\l3,%ymm15 | |||
vpmuludq %ymm\z1,%ymm14,%ymm14 | |||
vpsubd %ymm\h3,%ymm15,%ymm15 | |||
vpaddd %ymm\l0,%ymm\h0,%ymm\l0 | |||
vpmuludq %ymm\z1,%ymm15,%ymm15 | |||
vpaddd %ymm\l1,%ymm\h1,%ymm\l1 | |||
vpaddd %ymm\l2,%ymm\h2,%ymm\l2 | |||
vpaddd %ymm\l3,%ymm\h3,%ymm\l3 | |||
vpmuludq %ymm0,%ymm12,%ymm\h0 | |||
vpmuludq %ymm0,%ymm13,%ymm\h1 | |||
vpmuludq %ymm0,%ymm14,%ymm\h2 | |||
vpmuludq %ymm0,%ymm15,%ymm\h3 | |||
vpmuludq %ymm1,%ymm\h0,%ymm\h0 | |||
vpmuludq %ymm1,%ymm\h1,%ymm\h1 | |||
vpmuludq %ymm1,%ymm\h2,%ymm\h2 | |||
vpmuludq %ymm1,%ymm\h3,%ymm\h3 | |||
vpaddq %ymm12,%ymm\h0,%ymm\h0 | |||
vpaddq %ymm13,%ymm\h1,%ymm\h1 | |||
vpaddq %ymm14,%ymm\h2,%ymm\h2 | |||
vpaddq %ymm15,%ymm\h3,%ymm\h3 | |||
vpsrlq $32,%ymm\h0,%ymm\h0 | |||
vpsrlq $32,%ymm\h1,%ymm\h1 | |||
vpsrlq $32,%ymm\h2,%ymm\h2 | |||
vpsrlq $32,%ymm\h3,%ymm\h3 | |||
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
vpsubd %ymm\l,%ymm\h,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
vpmuldq %ymm\zl0,%ymm12,%ymm13 | |||
vmovshdup %ymm12,%ymm\h | |||
vpmuldq %ymm\zl1,%ymm\h,%ymm14 | |||
vpmuldq %ymm\zh0,%ymm12,%ymm12 | |||
vpmuldq %ymm\zh1,%ymm\h,%ymm\h | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpsubd %ymm13,%ymm12,%ymm12 | |||
vpsubd %ymm14,%ymm\h,%ymm\h | |||
vmovshdup %ymm12,%ymm12 | |||
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h | |||
.endm | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx): | |||
#consts | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 | |||
#load | |||
vmovdqa (%rsi),%ymm6 | |||
vmovdqa 32(%rsi),%ymm7 | |||
vmovdqa 64(%rsi),%ymm5 | |||
vmovdqa 96(%rsi),%ymm10 | |||
#reorder | |||
shuffle8 6,5,8,5 | |||
shuffle8 7,10,6,10 | |||
shuffle4 8,6,4,6 | |||
shuffle4 5,10,8,10 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vpsrlq $32,%ymm6,%ymm7 | |||
vpsrlq $32,%ymm8,%ymm9 | |||
vpsrlq $32,%ymm10,%ymm11 | |||
level0: | |||
vpmovzxdq (%rdx),%ymm3 | |||
vpmovzxdq 16(%rdx),%ymm15 | |||
vpaddd %ymm2,%ymm4,%ymm12 | |||
vpaddd %ymm2,%ymm6,%ymm13 | |||
vpaddd %ymm2,%ymm8,%ymm14 | |||
vpsubd %ymm5,%ymm12,%ymm12 | |||
vpsubd %ymm7,%ymm13,%ymm13 | |||
vpsubd %ymm9,%ymm14,%ymm14 | |||
vpmuludq %ymm3,%ymm12,%ymm12 | |||
vpmuludq %ymm15,%ymm13,%ymm13 | |||
vpaddd %ymm2,%ymm10,%ymm15 | |||
vpsubd %ymm11,%ymm15,%ymm15 | |||
vpaddd %ymm4,%ymm5,%ymm4 | |||
vpaddd %ymm6,%ymm7,%ymm6 | |||
vpmovzxdq 32(%rdx),%ymm5 | |||
vpmovzxdq 48(%rdx),%ymm7 | |||
vpmuludq %ymm5,%ymm14,%ymm14 | |||
vpmuludq %ymm7,%ymm15,%ymm15 | |||
vpaddd %ymm8,%ymm9,%ymm8 | |||
vpaddd %ymm10,%ymm11,%ymm10 | |||
vpmuludq %ymm0,%ymm12,%ymm5 | |||
vpmuludq %ymm0,%ymm13,%ymm7 | |||
vpmuludq %ymm0,%ymm14,%ymm9 | |||
vpmuludq %ymm0,%ymm15,%ymm11 | |||
vpmuludq %ymm1,%ymm5,%ymm5 | |||
vpmuludq %ymm1,%ymm7,%ymm7 | |||
vpmuludq %ymm1,%ymm9,%ymm9 | |||
vpmuludq %ymm1,%ymm11,%ymm11 | |||
vpaddq %ymm12,%ymm5,%ymm5 | |||
vpaddq %ymm13,%ymm7,%ymm7 | |||
vpaddq %ymm14,%ymm9,%ymm9 | |||
vpaddq %ymm15,%ymm11,%ymm11 | |||
vpsrlq $32,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm7,%ymm7 | |||
vpsrlq $32,%ymm9,%ymm9 | |||
vpsrlq $32,%ymm11,%ymm11 | |||
level1: | |||
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) | |||
vpmovzxdq 64(%rdx),%ymm15 | |||
vpmovzxdq 80(%rdx),%ymm3 | |||
butterfly 4,5,8,9,6,7,10,11 | |||
level2: | |||
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) | |||
vpmovzxdq 96(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#shuffle | |||
shuffle4 4,5,3,5 | |||
shuffle4 6,7,4,7 | |||
shuffle4 8,9,6,9 | |||
shuffle4 10,11,8,11 | |||
level3: | |||
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) | |||
vpbroadcastd 112(%rdx),%ymm14 | |||
vpbroadcastd 116(%rdx),%ymm15 | |||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | |||
butterfly 3,4,6,8,5,7,9,11,10,10 | |||
#shuffle | |||
shuffle8 3,4,10,4 | |||
shuffle8 6,8,3,8 | |||
shuffle8 5,7,6,7 | |||
shuffle8 9,11,5,11 | |||
level4: | |||
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) | |||
vpbroadcastd 120(%rdx),%ymm9 | |||
butterfly 10,3,6,5,4,8,7,11,9,9 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
vmovdqa %ymm3,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm5,96(%rdi) | |||
vmovdqa %ymm4,128(%rdi) | |||
vmovdqa %ymm8,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
.macro levels0t5 off | |||
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
/* level 0 */ | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,5,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 6,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,9,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 10,11,1,3,2,15 | |||
/* level 1 */ | |||
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,6,1,3,2,15 | |||
butterfly 5,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,10,1,3,2,15 | |||
butterfly 9,11,1,3,2,15 | |||
/* level 2 */ | |||
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,8,1,3,2,15 | |||
butterfly 5,9,1,3,2,15 | |||
butterfly 6,10,1,3,2,15 | |||
butterfly 7,11,1,3,2,15 | |||
/* level 3 */ | |||
shuffle2 4,5,3,5 | |||
shuffle2 6,7,4,7 | |||
shuffle2 8,9,6,9 | |||
shuffle2 10,11,8,11 | |||
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 3,5 | |||
butterfly 4,7 | |||
butterfly 6,9 | |||
butterfly 8,11 | |||
/* level 4 */ | |||
shuffle4 3,4,10,4 | |||
shuffle4 6,8,3,8 | |||
shuffle4 5,7,6,7 | |||
shuffle4 9,11,5,11 | |||
vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 10,4 | |||
butterfly 3,8 | |||
butterfly 6,7 | |||
butterfly 5,11 | |||
/* level 5 */ | |||
shuffle8 10,3,9,3 | |||
shuffle8 6,5,10,5 | |||
shuffle8 4,8,6,8 | |||
shuffle8 7,11,4,11 | |||
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 | |||
butterfly 9,3 | |||
butterfly 10,5 | |||
butterfly 6,8 | |||
butterfly 4,11 | |||
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm10,256*\off+ 32(%rdi) | |||
vmovdqa %ymm6,256*\off+ 64(%rdi) | |||
vmovdqa %ymm4,256*\off+ 96(%rdi) | |||
vmovdqa %ymm3,256*\off+128(%rdi) | |||
vmovdqa %ymm5,256*\off+160(%rdi) | |||
vmovdqa %ymm8,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
ret | |||
.macro levels6t7 off | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
/* level 6 */ | |||
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
/* level 7 */ | |||
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 | |||
vmovdqa (_8XDIV)*4(%rsi),%ymm2 | |||
vpmuldq %ymm1,%ymm4,%ymm12 | |||
vpmuldq %ymm1,%ymm5,%ymm13 | |||
vmovshdup %ymm4,%ymm8 | |||
vmovshdup %ymm5,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm4,%ymm4 | |||
vpmuldq %ymm2,%ymm5,%ymm5 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm4,%ymm4 | |||
vpsubd %ymm13,%ymm5,%ymm5 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm4,%ymm4 | |||
vmovshdup %ymm5,%ymm5 | |||
vpblendd $0xAA,%ymm8,%ymm4,%ymm4 | |||
vpblendd $0xAA,%ymm9,%ymm5,%ymm5 | |||
vpmuldq %ymm1,%ymm6,%ymm12 | |||
vpmuldq %ymm1,%ymm7,%ymm13 | |||
vmovshdup %ymm6,%ymm8 | |||
vmovshdup %ymm7,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm6,%ymm6 | |||
vpmuldq %ymm2,%ymm7,%ymm7 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm6,%ymm6 | |||
vpsubd %ymm13,%ymm7,%ymm7 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm6,%ymm6 | |||
vmovshdup %ymm7,%ymm7 | |||
vpblendd $0xAA,%ymm8,%ymm6,%ymm6 | |||
vpblendd $0xAA,%ymm9,%ymm7,%ymm7 | |||
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
.endm | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx): | |||
#consts | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 | |||
#load | |||
vmovdqa (%rsi),%ymm4 | |||
vmovdqa 256(%rsi),%ymm5 | |||
vmovdqa 512(%rsi),%ymm6 | |||
vmovdqa 768(%rsi),%ymm7 | |||
vmovdqa 1024(%rsi),%ymm8 | |||
vmovdqa 1280(%rsi),%ymm9 | |||
vmovdqa 1536(%rsi),%ymm10 | |||
vmovdqa 1792(%rsi),%ymm11 | |||
level5: | |||
vpbroadcastd (%rdx),%ymm3 | |||
vpbroadcastd 4(%rdx),%ymm15 | |||
vpaddd %ymm2,%ymm4,%ymm12 | |||
vpaddd %ymm2,%ymm6,%ymm13 | |||
vpaddd %ymm2,%ymm8,%ymm14 | |||
vpsubd %ymm5,%ymm12,%ymm12 | |||
vpsubd %ymm7,%ymm13,%ymm13 | |||
vpsubd %ymm9,%ymm14,%ymm14 | |||
vpmuludq %ymm3,%ymm12,%ymm12 | |||
vpmuludq %ymm15,%ymm13,%ymm13 | |||
vpaddd %ymm2,%ymm10,%ymm15 | |||
vpsubd %ymm11,%ymm15,%ymm15 | |||
vpaddd %ymm4,%ymm5,%ymm4 | |||
vpaddd %ymm6,%ymm7,%ymm6 | |||
vpbroadcastd 8(%rdx),%ymm5 | |||
vpbroadcastd 12(%rdx),%ymm7 | |||
vpmuludq %ymm5,%ymm14,%ymm14 | |||
vpmuludq %ymm7,%ymm15,%ymm15 | |||
vpaddd %ymm8,%ymm9,%ymm8 | |||
vpaddd %ymm10,%ymm11,%ymm10 | |||
vpmuludq %ymm0,%ymm12,%ymm5 | |||
vpmuludq %ymm0,%ymm13,%ymm7 | |||
vpmuludq %ymm0,%ymm14,%ymm9 | |||
vpmuludq %ymm0,%ymm15,%ymm11 | |||
vpmuludq %ymm1,%ymm5,%ymm5 | |||
vpmuludq %ymm1,%ymm7,%ymm7 | |||
vpmuludq %ymm1,%ymm9,%ymm9 | |||
vpmuludq %ymm1,%ymm11,%ymm11 | |||
vpaddq %ymm12,%ymm5,%ymm5 | |||
vpaddq %ymm13,%ymm7,%ymm7 | |||
vpaddq %ymm14,%ymm9,%ymm9 | |||
vpaddq %ymm15,%ymm11,%ymm11 | |||
vpsrlq $32,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm7,%ymm7 | |||
vpsrlq $32,%ymm9,%ymm9 | |||
vpsrlq $32,%ymm11,%ymm11 | |||
level6: | |||
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) | |||
vpbroadcastd 16(%rdx),%ymm15 | |||
vpbroadcastd 20(%rdx),%ymm3 | |||
butterfly 4,5,8,9,6,7,10,11 | |||
level7: | |||
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) | |||
vpbroadcastd 24(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#consts | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3 | |||
vpmuludq %ymm3,%ymm4,%ymm4 | |||
vpmuludq %ymm3,%ymm5,%ymm5 | |||
vpmuludq %ymm3,%ymm6,%ymm6 | |||
vpmuludq %ymm3,%ymm7,%ymm7 | |||
vpmuludq %ymm0,%ymm4,%ymm12 | |||
vpmuludq %ymm0,%ymm5,%ymm13 | |||
vpmuludq %ymm0,%ymm6,%ymm14 | |||
vpmuludq %ymm0,%ymm7,%ymm15 | |||
vpmuludq %ymm1,%ymm12,%ymm12 | |||
vpmuludq %ymm1,%ymm13,%ymm13 | |||
vpmuludq %ymm1,%ymm14,%ymm14 | |||
vpmuludq %ymm1,%ymm15,%ymm15 | |||
vpaddq %ymm12,%ymm4,%ymm4 | |||
vpaddq %ymm13,%ymm5,%ymm5 | |||
vpaddq %ymm14,%ymm6,%ymm6 | |||
vpaddq %ymm15,%ymm7,%ymm7 | |||
vpsrlq $32,%ymm4,%ymm4 | |||
vpsrlq $32,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm6,%ymm6 | |||
vpsrlq $32,%ymm7,%ymm7 | |||
#store | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3 | |||
vpermd %ymm4,%ymm3,%ymm4 | |||
vpermd %ymm5,%ymm3,%ymm5 | |||
vpermd %ymm6,%ymm3,%ymm6 | |||
vpermd %ymm7,%ymm3,%ymm7 | |||
vpermd %ymm8,%ymm3,%ymm8 | |||
vpermd %ymm9,%ymm3,%ymm9 | |||
vpermd %ymm10,%ymm3,%ymm10 | |||
vpermd %ymm11,%ymm3,%ymm11 | |||
vmovdqa %xmm4,(%rdi) | |||
vmovdqa %xmm5,128(%rdi) | |||
vmovdqa %xmm6,256(%rdi) | |||
vmovdqa %xmm7,384(%rdi) | |||
vmovdqa %xmm8,512(%rdi) | |||
vmovdqa %xmm9,640(%rdi) | |||
vmovdqa %xmm10,768(%rdi) | |||
vmovdqa %xmm11,896(%rdi) | |||
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx): | |||
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
levels0t5 0 | |||
levels0t5 1 | |||
levels0t5 2 | |||
levels0t5 3 | |||
levels6t7 0 | |||
levels6t7 1 | |||
levels6t7 2 | |||
levels6t7 3 | |||
ret |
@@ -1,179 +1,199 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
#include "cdecl.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | |||
#mul | |||
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 | |||
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 | |||
vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2 | |||
vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3 | |||
#reduce | |||
vpmuludq %ymm0,%ymm\rh0,%ymm12 | |||
vpmuludq %ymm0,%ymm\rh1,%ymm13 | |||
vpmuludq %ymm0,%ymm\rh2,%ymm14 | |||
vpmuludq %ymm0,%ymm\rh3,%ymm15 | |||
vpmuludq %ymm1,%ymm12,%ymm12 | |||
vpmuludq %ymm1,%ymm13,%ymm13 | |||
vpmuludq %ymm1,%ymm14,%ymm14 | |||
vpmuludq %ymm1,%ymm15,%ymm15 | |||
vpaddq %ymm\rh0,%ymm12,%ymm12 | |||
vpaddq %ymm\rh1,%ymm13,%ymm13 | |||
vpaddq %ymm\rh2,%ymm14,%ymm14 | |||
vpaddq %ymm\rh3,%ymm15,%ymm15 | |||
vpsrlq $32,%ymm12,%ymm12 | |||
vpsrlq $32,%ymm13,%ymm13 | |||
vpsrlq $32,%ymm14,%ymm14 | |||
vpsrlq $32,%ymm15,%ymm15 | |||
#update | |||
vpaddd %ymm2,%ymm\rl0,%ymm\rh0 | |||
vpaddd %ymm2,%ymm\rl1,%ymm\rh1 | |||
vpaddd %ymm2,%ymm\rl2,%ymm\rh2 | |||
vpaddd %ymm2,%ymm\rl3,%ymm\rh3 | |||
vpaddd %ymm12,%ymm\rl0,%ymm\rl0 | |||
vpaddd %ymm13,%ymm\rl1,%ymm\rl1 | |||
vpaddd %ymm14,%ymm\rl2,%ymm\rl2 | |||
vpaddd %ymm15,%ymm\rl3,%ymm\rl3 | |||
vpsubd %ymm12,%ymm\rh0,%ymm\rh0 | |||
vpsubd %ymm13,%ymm\rh1,%ymm\rh1 | |||
vpsubd %ymm14,%ymm\rh2,%ymm\rh2 | |||
vpsubd %ymm15,%ymm\rh3,%ymm\rh3 | |||
.endm | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx): | |||
#consts | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 | |||
level0: | |||
#zetas | |||
vpbroadcastd (%rdx),%ymm3 | |||
#load | |||
vpmovzxdq (%rsi),%ymm4 | |||
vpmovzxdq 128(%rsi),%ymm5 | |||
vpmovzxdq 256(%rsi),%ymm6 | |||
vpmovzxdq 384(%rsi),%ymm7 | |||
vpmovzxdq 512(%rsi),%ymm8 | |||
vpmovzxdq 640(%rsi),%ymm9 | |||
vpmovzxdq 768(%rsi),%ymm10 | |||
vpmovzxdq 896(%rsi),%ymm11 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
level1: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
vpbroadcastd 4(%rdx),%ymm12 | |||
vpbroadcastd 8(%rdx),%ymm13 | |||
butterfly 4,5,8,9,6,7,10,11,12,12,13,13 | |||
level2: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
vpbroadcastd 12(%rdx),%ymm12 | |||
vpbroadcastd 16(%rdx),%ymm13 | |||
vpbroadcastd 20(%rdx),%ymm14 | |||
vpbroadcastd 24(%rdx),%ymm15 | |||
butterfly 4,6,8,10,5,7,9,11,12,13,14,15 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
vmovdqa %ymm5,256(%rdi) | |||
vmovdqa %ymm6,512(%rdi) | |||
vmovdqa %ymm7,768(%rdi) | |||
vmovdqa %ymm8,1024(%rdi) | |||
vmovdqa %ymm9,1280(%rdi) | |||
vmovdqa %ymm10,1536(%rdi) | |||
vmovdqa %ymm11,1792(%rdi) | |||
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
vpmuldq %ymm\zl0,%ymm\h,%ymm13 | |||
vmovshdup %ymm\h,%ymm12 | |||
vpmuldq %ymm\zl1,%ymm12,%ymm14 | |||
ret | |||
vpmuldq %ymm\zh0,%ymm\h,%ymm\h | |||
vpmuldq %ymm\zh1,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vmovshdup %ymm\h,%ymm\h | |||
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h | |||
vpsubd %ymm\h,%ymm\l,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx): | |||
#consts | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 | |||
#load | |||
vmovdqa (%rsi),%ymm4 | |||
vmovdqa 32(%rsi),%ymm5 | |||
vmovdqa 64(%rsi),%ymm6 | |||
vmovdqa 96(%rsi),%ymm7 | |||
vmovdqa 128(%rsi),%ymm8 | |||
vmovdqa 160(%rsi),%ymm9 | |||
vmovdqa 192(%rsi),%ymm10 | |||
vmovdqa 224(%rsi),%ymm11 | |||
level3: | |||
#zetas | |||
vpbroadcastd (%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
level4: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
vpbroadcastd 4(%rdx),%ymm12 | |||
vpbroadcastd 8(%rdx),%ymm13 | |||
vpblendd $0xF0,%ymm13,%ymm12,%ymm12 | |||
vmovshdup %ymm13,%ymm13 | |||
vpblendd $0xAA,%ymm14,%ymm13,%ymm13 | |||
vpaddd %ymm13,%ymm12,%ymm\h | |||
vpsubd %ymm13,%ymm\l,%ymm\l | |||
.endm | |||
.macro levels0t1 off | |||
/* level 0 */ | |||
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* level 1 */ | |||
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
.endm | |||
.macro levels2t7 off | |||
/* level 2 */ | |||
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | |||
/* level 3 */ | |||
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 | |||
level5: | |||
#zetas | |||
vpmovzxdq 12(%rdx),%ymm12 | |||
butterfly 3,5 | |||
butterfly 8,10 | |||
butterfly 4,6 | |||
butterfly 9,11 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | |||
level6: | |||
#zetas | |||
vpmovzxdq 28(%rdx),%ymm12 | |||
vpmovzxdq 44(%rdx),%ymm13 | |||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | |||
level7: | |||
#zetas | |||
vpmovzxdq 60(%rdx),%ymm12 | |||
vpmovzxdq 76(%rdx),%ymm13 | |||
vpmovzxdq 92(%rdx),%ymm14 | |||
vpmovzxdq 108(%rdx),%ymm15 | |||
butterfly 7,3,8,4,5,10,6,11,12,13,14,15 | |||
#store | |||
vpsllq $32,%ymm5,%ymm5 | |||
vpsllq $32,%ymm10,%ymm10 | |||
vpsllq $32,%ymm6,%ymm6 | |||
vpsllq $32,%ymm11,%ymm11 | |||
vpblendd $0xAA,%ymm5,%ymm7,%ymm7 | |||
vpblendd $0xAA,%ymm10,%ymm3,%ymm3 | |||
vpblendd $0xAA,%ymm6,%ymm8,%ymm8 | |||
vpblendd $0xAA,%ymm11,%ymm4,%ymm4 | |||
/* level 4 */ | |||
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 | |||
butterfly 7,8 | |||
butterfly 5,6 | |||
butterfly 3,4 | |||
butterfly 10,11 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
/* level 5 */ | |||
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,5,1,10,2,15 | |||
butterfly 8,4,1,10,2,15 | |||
butterfly 7,3,1,10,2,15 | |||
butterfly 6,11,1,10,2,15 | |||
/* level 6 */ | |||
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,7,1,10,2,15 | |||
butterfly 8,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,3,1,10,2,15 | |||
butterfly 4,11,1,10,2,15 | |||
/* level 7 */ | |||
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,8,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 7,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,4,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 3,11,1,10,2,15 | |||
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm8,256*\off+ 32(%rdi) | |||
vmovdqa %ymm7,256*\off+ 64(%rdi) | |||
vmovdqa %ymm6,256*\off+ 96(%rdi) | |||
vmovdqa %ymm5,256*\off+128(%rdi) | |||
vmovdqa %ymm4,256*\off+160(%rdi) | |||
vmovdqa %ymm3,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
shuffle4 7,3,5,3 | |||
shuffle4 8,4,7,4 | |||
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx): | |||
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
shuffle8 5,7,6,7 | |||
shuffle8 3,4,5,4 | |||
levels0t1 0 | |||
levels0t1 1 | |||
levels0t1 2 | |||
levels0t1 3 | |||
vmovdqa %ymm6,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm7,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
levels2t7 0 | |||
levels2t7 1 | |||
levels2t7 2 | |||
levels2t7 3 | |||
ret | |||
@@ -1,36 +1,14 @@ | |||
#ifndef NTT_H | |||
#define NTT_H | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTT_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_NTT_H | |||
#include <stdint.h> | |||
#include <immintrin.h> | |||
#include "nttconsts.h" | |||
#include "params.h" | |||
void PQCLEAN_DILITHIUM2_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
void PQCLEAN_DILITHIUM2_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas | |||
); | |||
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas | |||
); | |||
void PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx(__m256i *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas_inv | |||
); | |||
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas_inv | |||
); | |||
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); | |||
#endif |
@@ -1,80 +0,0 @@ | |||
#include "nttconsts.h" | |||
#define QINV 4236238847 // -q^(-1) mod 2^32 | |||
#define MONT 4193792ULL | |||
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) | |||
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; | |||
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; | |||
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; | |||
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, | |||
256 * Q | |||
} | |||
}; | |||
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; | |||
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, | |||
0x7FFFFF, 0x7FFFFF | |||
} | |||
}; | |||
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; | |||
#undef QINV | |||
#undef MONT | |||
#undef DIV | |||
const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas = { | |||
.as_arr = { | |||
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776, | |||
3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667, | |||
5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191, | |||
3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439, | |||
7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422, | |||
1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579, | |||
8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390, | |||
7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620, | |||
5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868, | |||
3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076, | |||
6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435, | |||
5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599, | |||
3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165, | |||
7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031, | |||
6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064, | |||
2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112, | |||
162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237, | |||
8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977, | |||
1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735, | |||
6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892, | |||
5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443, | |||
7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090, | |||
4834730, 7018208, 1976782 | |||
} | |||
}; | |||
const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv = { | |||
.as_arr = { | |||
6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985, | |||
4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748, | |||
2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646, | |||
1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087, | |||
177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422, | |||
6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573, | |||
5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061, | |||
6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386, | |||
1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252, | |||
1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818, | |||
1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496, | |||
7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525, | |||
6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443, | |||
7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093, | |||
5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647, | |||
4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669, | |||
43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330, | |||
1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900, | |||
6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326, | |||
4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745, | |||
3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293, | |||
518909, 2608894, 3975713 | |||
} | |||
}; |
@@ -1,27 +0,0 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include "alignment.h" | |||
#include "params.h" | |||
typedef ALIGNED_UINT32(8) aligned_uint32x8_t; | |||
typedef ALIGNED_UINT32(N) aligned_uint32xN_t; | |||
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv; | |||
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq; | |||
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q; | |||
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q; | |||
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask; | |||
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones; | |||
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv; | |||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas; | |||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv; | |||
#endif //PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H | |||
@@ -3,6 +3,7 @@ | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_pk | |||
* | |||
@@ -12,17 +13,18 @@ | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const polyveck *t1: pointer to vector t1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_pk( | |||
uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
pk[i] = rho[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); | |||
} | |||
} | |||
@@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM2_AVX2_pack_pk( | |||
* - const polyveck *t1: pointer to output vector t1 | |||
* - uint8_t pk[]: byte array containing bit-packed pk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk( | |||
uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = pk[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sk | |||
* | |||
* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). | |||
* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - uint8_t sk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const uint8_t key[]: byte array containing key | |||
* - const uint8_t tr[]: byte array containing tr | |||
* - const uint8_t key[]: byte array containing key | |||
* - const polyveck *t0: pointer to vector t0 | |||
* - const polyvecl *s1: pointer to vector s1 | |||
* - const polyveck *s2: pointer to vector s2 | |||
* - const polyveck *t0: pointer to vector t0 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sk( | |||
uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const polyvecl *s1, | |||
const polyveck *s2, | |||
const polyveck *t0) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = rho[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = key[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < CRHBYTES; ++i) { | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
sk[i] = tr[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); | |||
} | |||
sk += L * POLETA_SIZE_PACKED; | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); | |||
} | |||
sk += K * POLETA_SIZE_PACKED; | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sk | |||
* | |||
* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). | |||
* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const uint8_t key[]: output byte array for key | |||
* - const uint8_t tr[]: output byte array for tr | |||
* - const uint8_t key[]: output byte array for key | |||
* - const polyveck *t0: pointer to output vector t0 | |||
* - const polyvecl *s1: pointer to output vector s1 | |||
* - const polyveck *s2: pointer to output vector s2 | |||
* - const polyveck *r0: pointer to output vector t0 | |||
* - uint8_t sk[]: byte array containing bit-packed sk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk( | |||
uint8_t rho[SEEDBYTES], | |||
uint8_t key[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
polyvecl *s1, | |||
polyveck *s2, | |||
polyveck *t0, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
key[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < CRHBYTES; ++i) { | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
tr[i] = sk[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += L * POLETA_SIZE_PACKED; | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += K * POLETA_SIZE_PACKED; | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sig | |||
* | |||
* Description: Bit-pack signature sig = (z, h, c). | |||
* Description: Bit-pack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t sig[]: output byte array | |||
* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM2_AVX2_challenge hash length SEEDBYTES | |||
* - const polyvecl *z: pointer to vector z | |||
* - const polyveck *h: pointer to hint vector h | |||
* - const poly *c: pointer to challenge polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sig( | |||
uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const polyvecl *z, | |||
const polyveck *h, | |||
const poly *c) { | |||
size_t k; | |||
uint64_t signs, mask; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES], | |||
const uint8_t c[SEEDBYTES], | |||
const polyvecl *z, | |||
const polyveck *h) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sig[i] = c[i]; | |||
} | |||
sig += L * POLZ_SIZE_PACKED; | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Encode h */ | |||
for (i = 0; i < OMEGA + K; ++i) { | |||
sig[i] = 0; | |||
} | |||
k = 0; | |||
for (size_t i = 0; i < K; ++i) { | |||
for (size_t j = 0; j < N; ++j) { | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
if (h->vec[i].coeffs[j] != 0) { | |||
sig[k++] = (uint8_t)j; | |||
sig[k++] = (uint8_t) j; | |||
} | |||
} | |||
sig[OMEGA + i] = (uint8_t)k; | |||
} | |||
while (k < OMEGA) { | |||
sig[k++] = 0; | |||
} | |||
sig += OMEGA + K; | |||
/* Encode c */ | |||
signs = 0; | |||
mask = 1; | |||
for (size_t i = 0; i < N / 8; ++i) { | |||
sig[i] = 0; | |||
for (size_t j = 0; j < 8; ++j) { | |||
if (c->coeffs[8 * i + j] != 0) { | |||
sig[i] |= (uint8_t)(1u << j); | |||
if (c->coeffs[8 * i + j] == (Q - 1)) { | |||
signs |= mask; | |||
} | |||
mask <<= 1; | |||
} | |||
} | |||
} | |||
sig += N / 8; | |||
for (size_t i = 0; i < 8; ++i) { | |||
sig[i] = (uint8_t)(signs >> 8u * i); | |||
sig[OMEGA + i] = (uint8_t) k; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sig | |||
* | |||
* Description: Unpack signature sig = (z, h, c). | |||
* Description: Unpack signature sig = (c, z, h). | |||
* | |||
* Arguments: - polyvecl *z: pointer to output vector z | |||
* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM2_AVX2_challenge hash | |||
* - polyvecl *z: pointer to output vector z | |||
* - polyveck *h: pointer to output hint vector h | |||
* - poly *c: pointer to output challenge polynomial | |||
* - const uint8_t sig[]: byte array containing | |||
* bit-packed signature | |||
* | |||
* Returns 1 in case of malformed signature; otherwise 0. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( | |||
polyvecl *z, | |||
polyveck *h, | |||
poly *c, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) { | |||
size_t k; | |||
uint64_t signs; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); | |||
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES], | |||
polyvecl *z, | |||
polyveck *h, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
c[i] = sig[i]; | |||
} | |||
sig += L * POLZ_SIZE_PACKED; | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Decode h */ | |||
k = 0; | |||
for (size_t i = 0; i < K; ++i) { | |||
for (size_t j = 0; j < N; ++j) { | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
h->vec[i].coeffs[j] = 0; | |||
} | |||
@@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( | |||
return 1; | |||
} | |||
for (size_t j = k; j < sig[OMEGA + i]; ++j) { | |||
for (j = k; j < sig[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > k && sig[j] <= sig[j - 1]) { | |||
return 1; | |||
@@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (size_t j = k; j < OMEGA; ++j) { | |||
for (j = k; j < OMEGA; ++j) { | |||
if (sig[j]) { | |||
return 1; | |||
} | |||
} | |||
sig += OMEGA + K; | |||
/* Decode c */ | |||
for (size_t i = 0; i < N; ++i) { | |||
c->coeffs[i] = 0; | |||
} | |||
signs = 0; | |||
for (size_t i = 0; i < 8; ++i) { | |||
signs |= (uint64_t)sig[N / 8 + i] << 8 * i; | |||
} | |||
/* Extra sign bits are zero for strong unforgeability */ | |||
if (signs >> 60) { | |||
return 1; | |||
} | |||
for (size_t i = 0; i < N / 8; ++i) { | |||
for (size_t j = 0; j < 8; ++j) { | |||
if ((sig[i] >> j) & 0x01) { | |||
c->coeffs[8 * i + j] = 1; | |||
c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); | |||
signs >>= 1; | |||
} | |||
} | |||
} | |||
return 0; | |||
} |
@@ -1,42 +1,31 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_PACKING_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_PACKING_H | |||
#include "api.h" | |||
#include "params.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2); | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_pk( | |||
uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1); | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sk( | |||
uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const uint8_t tr[SEEDBYTES], | |||
const polyvecl *s1, | |||
const polyveck *s2, | |||
const polyveck *t0); | |||
void PQCLEAN_DILITHIUM2_AVX2_pack_sig( | |||
uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const polyvecl *z, const polyveck *h, const poly *c); | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk( | |||
uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk( | |||
uint8_t rho[SEEDBYTES], | |||
uint8_t key[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
polyvecl *s1, | |||
polyveck *s2, | |||
polyveck *t0, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( | |||
polyvecl *z, | |||
polyveck *h, | |||
poly *c, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]); | |||
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]); | |||
#endif |
@@ -2,28 +2,40 @@ | |||
#define PQCLEAN_DILITHIUM2_AVX2_PARAMS_H | |||
#define SEEDBYTES 32 | |||
#define CRHBYTES 48 | |||
#define N 256 | |||
#define Q 8380417 | |||
#define QBITS 23 | |||
#define D 14 | |||
#define GAMMA1 ((Q - 1)/16) | |||
#define GAMMA2 (GAMMA1/2) | |||
#define ALPHA (2*GAMMA2) | |||
#define D 13 | |||
#define ROOT_OF_UNITY 1753 | |||
#define K 4 | |||
#define L 3 | |||
#define ETA 6 | |||
#define SETABITS 4 | |||
#define BETA 325 | |||
#define L 4 | |||
#define ETA 2 | |||
#define TAU 39 | |||
#define BETA 78 | |||
#define GAMMA1 (1 << 17) | |||
#define GAMMA2 ((Q-1)/88) | |||
#define OMEGA 80 | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2" | |||
#define POLYT1_PACKEDBYTES 320 | |||
#define POLYT0_PACKEDBYTES 416 | |||
#define POLYVECH_PACKEDBYTES (OMEGA + K) | |||
#define POLYZ_PACKEDBYTES 576 | |||
#define POLYW1_PACKEDBYTES 192 | |||
#define POLYETA_PACKEDBYTES 96 | |||
#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) | |||
#define POLT0_SIZE_PACKED ((N*D)/8) | |||
#define POLETA_SIZE_PACKED ((N*SETABITS)/8) | |||
#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) | |||
#define POLW1_SIZE_PACKED ((N*4)/8) | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ | |||
+ L*POLYETA_PACKEDBYTES \ | |||
+ K*POLYETA_PACKEDBYTES \ | |||
+ K*POLYT0_PACKEDBYTES) | |||
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) | |||
#endif |
@@ -1,11 +1,14 @@ | |||
#include "params.h" | |||
#include "cdecl.inc" | |||
#include "cdecl.h" | |||
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): | |||
#consts | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
xor %eax,%eax | |||
_looptop1: | |||
@@ -18,41 +21,41 @@ vmovdqa 32(%rdx),%ymm12 | |||
vmovdqa 64(%rdx),%ymm14 | |||
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vpsrlq $32,%ymm6,%ymm7 | |||
vmovshdup %ymm6,%ymm7 | |||
vpsrlq $32,%ymm10,%ymm11 | |||
vpsrlq $32,%ymm12,%ymm13 | |||
vpsrlq $32,%ymm14,%ymm15 | |||
vmovshdup %ymm14,%ymm15 | |||
#mul | |||
vpmuludq %ymm2,%ymm10,%ymm2 | |||
vpmuludq %ymm3,%ymm11,%ymm3 | |||
vpmuludq %ymm4,%ymm12,%ymm4 | |||
vpmuludq %ymm5,%ymm13,%ymm5 | |||
vpmuludq %ymm6,%ymm14,%ymm6 | |||
vpmuludq %ymm7,%ymm15,%ymm7 | |||
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
vpmuldq %ymm6,%ymm14,%ymm6 | |||
vpmuldq %ymm7,%ymm15,%ymm7 | |||
#reduce | |||
vpmuludq %ymm0,%ymm2,%ymm10 | |||
vpmuludq %ymm0,%ymm3,%ymm11 | |||
vpmuludq %ymm0,%ymm4,%ymm12 | |||
vpmuludq %ymm0,%ymm5,%ymm13 | |||
vpmuludq %ymm0,%ymm6,%ymm14 | |||
vpmuludq %ymm0,%ymm7,%ymm15 | |||
vpmuludq %ymm1,%ymm10,%ymm10 | |||
vpmuludq %ymm1,%ymm11,%ymm11 | |||
vpmuludq %ymm1,%ymm12,%ymm12 | |||
vpmuludq %ymm1,%ymm13,%ymm13 | |||
vpmuludq %ymm1,%ymm14,%ymm14 | |||
vpmuludq %ymm1,%ymm15,%ymm15 | |||
vpaddq %ymm2,%ymm10,%ymm2 | |||
vpaddq %ymm3,%ymm11,%ymm3 | |||
vpaddq %ymm4,%ymm12,%ymm4 | |||
vpaddq %ymm5,%ymm13,%ymm5 | |||
vpaddq %ymm6,%ymm14,%ymm6 | |||
vpaddq %ymm7,%ymm15,%ymm7 | |||
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm0,%ymm6,%ymm14 | |||
vpmuldq %ymm0,%ymm7,%ymm15 | |||
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpmuldq %ymm1,%ymm14,%ymm14 | |||
vpmuldq %ymm1,%ymm15,%ymm15 | |||
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsubq %ymm14,%ymm6,%ymm6 | |||
vpsubq %ymm15,%ymm7,%ymm7 | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vpsrlq $32,%ymm4,%ymm4 | |||
vpsrlq $32,%ymm6,%ymm6 | |||
vmovshdup %ymm6,%ymm6 | |||
#store | |||
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
@@ -67,7 +70,7 @@ add $96,%rsi | |||
add $96,%rdx | |||
add $1,%eax | |||
cmp $10,%eax | |||
jb _looptop1 | |||
jb _looptop1 | |||
vmovdqa (%rsi),%ymm2 | |||
vmovdqa 32(%rsi),%ymm4 | |||
@@ -75,30 +78,30 @@ vmovdqa (%rdx),%ymm10 | |||
vmovdqa 32(%rdx),%ymm12 | |||
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vpsrlq $32,%ymm10,%ymm11 | |||
vpsrlq $32,%ymm12,%ymm13 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuludq %ymm2,%ymm10,%ymm2 | |||
vpmuludq %ymm3,%ymm11,%ymm3 | |||
vpmuludq %ymm4,%ymm12,%ymm4 | |||
vpmuludq %ymm5,%ymm13,%ymm5 | |||
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
#reduce | |||
vpmuludq %ymm0,%ymm2,%ymm10 | |||
vpmuludq %ymm0,%ymm3,%ymm11 | |||
vpmuludq %ymm0,%ymm4,%ymm12 | |||
vpmuludq %ymm0,%ymm5,%ymm13 | |||
vpmuludq %ymm1,%ymm10,%ymm10 | |||
vpmuludq %ymm1,%ymm11,%ymm11 | |||
vpmuludq %ymm1,%ymm12,%ymm12 | |||
vpmuludq %ymm1,%ymm13,%ymm13 | |||
vpaddq %ymm2,%ymm10,%ymm2 | |||
vpaddq %ymm3,%ymm11,%ymm3 | |||
vpaddq %ymm4,%ymm12,%ymm4 | |||
vpaddq %ymm5,%ymm13,%ymm5 | |||
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vpsrlq $32,%ymm4,%ymm4 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
vpblendd $0x55,%ymm2,%ymm3,%ymm2 | |||
@@ -116,14 +119,14 @@ vmovdqa \off(%rdx),%ymm10 | |||
vmovdqa \off+32(%rdx),%ymm12 | |||
vpsrlq $32,%ymm6,%ymm7 | |||
vpsrlq $32,%ymm8,%ymm9 | |||
vpsrlq $32,%ymm10,%ymm11 | |||
vpsrlq $32,%ymm12,%ymm13 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuludq %ymm6,%ymm10,%ymm6 | |||
vpmuludq %ymm7,%ymm11,%ymm7 | |||
vpmuludq %ymm8,%ymm12,%ymm8 | |||
vpmuludq %ymm9,%ymm13,%ymm9 | |||
vpmuldq %ymm6,%ymm10,%ymm6 | |||
vpmuldq %ymm7,%ymm11,%ymm7 | |||
vpmuldq %ymm8,%ymm12,%ymm8 | |||
vpmuldq %ymm9,%ymm13,%ymm9 | |||
.endm | |||
.macro acc | |||
@@ -134,10 +137,12 @@ vpaddq %ymm9,%ymm5,%ymm5 | |||
.endm | |||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): | |||
#consts | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
xor %eax,%eax | |||
_looptop2: | |||
@@ -155,23 +160,27 @@ acc | |||
pointwise 2048 | |||
acc | |||
pointwise 3072 | |||
acc | |||
#reduce | |||
vpmuludq %ymm0,%ymm2,%ymm6 | |||
vpmuludq %ymm0,%ymm3,%ymm7 | |||
vpmuludq %ymm0,%ymm4,%ymm8 | |||
vpmuludq %ymm0,%ymm5,%ymm9 | |||
vpmuludq %ymm1,%ymm6,%ymm6 | |||
vpmuludq %ymm1,%ymm7,%ymm7 | |||
vpmuludq %ymm1,%ymm8,%ymm8 | |||
vpmuludq %ymm1,%ymm9,%ymm9 | |||
vpaddq %ymm2,%ymm6,%ymm2 | |||
vpaddq %ymm3,%ymm7,%ymm3 | |||
vpaddq %ymm4,%ymm8,%ymm4 | |||
vpaddq %ymm5,%ymm9,%ymm5 | |||
vpmuldq %ymm0,%ymm2,%ymm6 | |||
vpmuldq %ymm0,%ymm3,%ymm7 | |||
vpmuldq %ymm0,%ymm4,%ymm8 | |||
vpmuldq %ymm0,%ymm5,%ymm9 | |||
vpmuldq %ymm1,%ymm6,%ymm6 | |||
vpmuldq %ymm1,%ymm7,%ymm7 | |||
vpmuldq %ymm1,%ymm8,%ymm8 | |||
vpmuldq %ymm1,%ymm9,%ymm9 | |||
vpsubq %ymm6,%ymm2,%ymm2 | |||
vpsubq %ymm7,%ymm3,%ymm3 | |||
vpsubq %ymm8,%ymm4,%ymm4 | |||
vpsubq %ymm9,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vpsrlq $32,%ymm4,%ymm4 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
@@ -1,19 +1,14 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include "alignment.h" | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLY_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_POLY_H | |||
#include "align.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
typedef union { | |||
uint32_t coeffs[N]; | |||
__m256i coeffs_x8[N / 8]; | |||
} poly; | |||
typedef ALIGNED_INT32(N) poly; | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_caddq(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b); | |||
@@ -21,63 +16,64 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); | |||
int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, int32_t B); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); | |||
int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, | |||
const uint8_t *seed, | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0, | |||
poly *a1, | |||
poly *a2, | |||
poly *a3, | |||
const uint8_t *seed, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, | |||
const uint8_t *seed, | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0, | |||
poly *a1, | |||
poly *a2, | |||
poly *a3, | |||
const uint8_t *seed, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1(poly *a, | |||
const uint8_t *seed, | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0, | |||
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(poly *a0, | |||
poly *a1, | |||
poly *a2, | |||
poly *a3, | |||
const uint8_t *seed, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t *r, const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t *r, const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t *r, const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t *r, const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t *r, const poly *a); | |||
#endif |
@@ -1,14 +1,103 @@ | |||
#include <stdint.h> | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/* Mark a parameter or variable as intentionally unused so that
 * unused-parameter warnings are silenced. The argument is parenthesized so
 * the cast applies to the whole expression, whatever is passed in. */
#define UNUSED(x) (void)(x)
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|j|i) | |||
* or AES256CTR(rho,j|i). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
    /* Walk the matrix one row at a time; each row has its own expander.
     * The second row argument of the expanders is not needed here, so NULL
     * is passed for it. */
    polyvecl *row = mat;

    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(row++, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(row++, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(row++, NULL, rho);
    PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(row, NULL, rho);
}
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    /* Only rowa is written; rowb is accepted but ignored. */
    UNUSED(rowb);

    /* Sample the four polynomials of row 0 in one 4-way call (nonces 0..3). */
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 0, 1, 2, 3);

    /* Post-process every sampled polynomial with nttunpack, in index order. */
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
}
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    /* Only rowa is written; rowb is accepted but ignored. */
    UNUSED(rowb);

    /* Sample the four polynomials of row 1 in one 4-way call
     * (nonces 256..259, i.e. (1 << 8) + column). */
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 256, 257, 258, 259);

    /* Post-process every sampled polynomial with nttunpack, in index order. */
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
}
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    /* Only rowa is written; rowb is accepted but ignored. */
    UNUSED(rowb);

    /* Sample the four polynomials of row 2 in one 4-way call
     * (nonces 512..515, i.e. (2 << 8) + column). */
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 512, 513, 514, 515);

    /* Post-process every sampled polynomial with nttunpack, in index order. */
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
}
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) {
    unsigned int col;

    /* Only rowa is written; rowb is accepted but ignored. */
    UNUSED(rowb);

    /* Sample the four polynomials of row 3 in one 4-way call
     * (nonces 768..771, i.e. (3 << 8) + column). */
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3],
                                            rho, 768, 769, 770, 771);

    /* Post-process every sampled polynomial with nttunpack, in index order. */
    for (col = 0; col < 4; ++col) {
        PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[col]);
    }
}
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    /* Matrix-vector product: output polynomial k is the accumulated
     * pointwise product of matrix row k with the vector v. */
    poly *out = t->vec;
    poly *const out_end = out + K;
    const polyvecl *row = mat;

    while (out != out_end) {
        PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(out, row, v);
        ++out;
        ++row;
    }
}
/**************************************************************/ | |||
/************ Vectors of polynomials of length L **************/ | |||
/**************************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    /* Sample the L polynomials one after another from the same seed,
     * using consecutive nonces starting at the caller-supplied value. */
    poly *cur = v->vec;
    poly *const stop = cur + L;

    while (cur != stop) {
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(cur, seed, nonce);
        ++nonce;
        ++cur;
    }
}
/* Sample all L polynomials of v with poly_uniform_gamma1 from one seed.
 * Polynomial i uses the per-polynomial nonce L * nonce + i, so distinct
 * values of nonce map to disjoint nonce ranges as long as the result fits
 * in 16 bits.
 *
 * NOTE(review): poly_uniform_gamma1 is declared with seed[CRHBYTES], while
 * this wrapper (and its prototype) say seed[SEEDBYTES]; the bound here looks
 * stale - confirm and update definition and prototype together. */
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int i;
    for (i = 0; i < L; ++i) {
        /* The sum is computed in (unsigned) int; narrow it explicitly so the
         * truncation to uint16_t is intentional and does not trigger
         * conversion warnings (e.g. on MSVC). The value is unchanged. */
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t)(L * nonce + i));
    }
}
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v) {
    /* Apply poly_reduce in place to each of the L polynomials, in order. */
    poly *cur = v->vec;
    poly *const stop = cur + L;

    for (; cur != stop; ++cur) {
        PQCLEAN_DILITHIUM2_AVX2_poly_reduce(cur);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze | |||
* | |||
@@ -18,7 +107,9 @@ | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) { | |||
for (size_t i = 0; i < L; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
@@ -34,7 +125,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) { | |||
* - const polyvecl *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
for (size_t i = 0; i < L; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -48,44 +141,60 @@ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v) { | |||
for (size_t i = 0; i < L; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    /* Apply poly_invntt_tomont in place to each of the L polynomials. */
    poly *cur = v->vec;
    poly *const stop = cur + L;

    for (; cur != stop; ++cur) {
        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(cur);
    }
}
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    /* Multiply every polynomial of v pointwise by the single polynomial a
     * and store the products at the same index of r. */
    unsigned int idx = 0;

    while (idx < L) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(r->vec + idx, a, v->vec + idx);
        idx += 1;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply vectors of polynomials of length L, multiply | |||
* resulting vector by 2^{-32} and add (accumulate) polynomials | |||
* in it. Input/output vectors are in NTT domain representation. | |||
* Input coefficients are assumed to be less than 22*Q. Output | |||
* coefficients are less than 2*L*Q. | |||
* | |||
* Arguments: - poly *w: output polynomial | |||
* - const polyvecl *u: pointer to first input vector | |||
* - const polyvecl *v: pointer to second input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v) { | |||
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->coeffs, u->vec->coeffs, v->vec->coeffs); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { | |||
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM2_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length L. | |||
* Assumes input coefficients to be standard representatives. | |||
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(). | |||
* | |||
* Arguments: - const polyvecl *v: pointer to vector | |||
* - uint32_t B: norm bound | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials is strictly smaller than B and 1 | |||
* otherwise. | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { | |||
for (size_t i = 0; i < L; ++i) { | |||
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
@@ -98,37 +207,48 @@ int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) | |||
/************ Vectors of polynomials of length K **************/ | |||
/**************************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    /* Sample the K polynomials one after another from the same seed,
     * using consecutive nonces starting at the caller-supplied value. */
    poly *cur = v->vec;
    poly *const stop = cur + K;

    while (cur != stop) {
        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(cur, seed, nonce);
        ++nonce;
        ++cur;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to representatives in [0,2*Q[. | |||
* to representatives in [-6283009,6283007]. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq | |||
* | |||
* Description: For all coefficients of polynomials in vector of length K | |||
* subtract Q if coefficient is bigger than Q. | |||
* add Q if coefficient is negative. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_csubq(&v->vec[i]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyveck_freeze | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to standard representatives. | |||
@@ -136,7 +256,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) { | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
@@ -152,7 +274,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) { | |||
* - const polyveck *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -161,8 +285,7 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_sub | |||
* | |||
* Description: Subtract vectors of polynomials of length K. | |||
* Assumes coefficients of polynomials in second input vector | |||
* to be less than 2*Q. No modular reduction is performed. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first input vector | |||
@@ -170,7 +293,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const | |||
* subtracted from first input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -179,12 +304,14 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl | |||
* | |||
* Description: Multiply vector of polynomials of Length K by 2^D without modular | |||
* reduction. Assumes input coefficients to be less than 2^{32-D}. | |||
* reduction. Assumes input coefficients to be less than 2^{31-D}. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&v->vec[i]); | |||
} | |||
} | |||
@@ -198,13 +325,15 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) { | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by 2^{32} of polynomials | |||
* in vector of length K. Input coefficients need to be less | |||
@@ -212,9 +341,19 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) { | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&v->vec[i]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    /* Multiply every polynomial of v pointwise by the single polynomial a
     * and store the products at the same index of r. */
    unsigned int idx = 0;

    while (idx < K) {
        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(r->vec + idx, a, v->vec + idx);
        idx += 1;
    }
}
@@ -222,16 +361,18 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) { | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length K. | |||
* Assumes input coefficients to be standard representatives. | |||
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(). | |||
* | |||
* Arguments: - const polyveck *v: pointer to vector | |||
* - uint32_t B: norm bound | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials are strictly smaller than B and 1 | |||
* otherwise. | |||
* Returns 0 if the norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) { | |||
for (size_t i = 0; i < K; ++i) { | |||
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
@@ -244,18 +385,20 @@ int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute a0, a1 such that a mod Q = a1*2^D + a0 | |||
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 | |||
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients Q + a0 | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -264,7 +407,7 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute high and low bits a0, a1 such that a mod Q = a1*ALPHA + a0 | |||
* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 | |||
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we | |||
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
@@ -272,12 +415,13 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients Q + a0 | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose( | |||
polyveck *v1, polyveck *v0, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -287,37 +431,44 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose( | |||
* | |||
* Description: Compute hint vector. | |||
* | |||
* Arguments: - polyveck *h: pointer to output vector | |||
* Arguments: - uint8_t *hint: pointer to output hint array | |||
* - const polyveck *v0: pointer to low part of input vector | |||
* - const polyveck *v1: pointer to high part of input vector | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint( | |||
polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1) { | |||
uint32_t s = 0; | |||
for (size_t i = 0; i < K; ++i) { | |||
s += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { | |||
unsigned int i, n = 0; | |||
for (i = 0; i < K; ++i) { | |||
n += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); | |||
} | |||
return s; | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: polyveck_use_hint | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint | |||
* | |||
* Description: Use hint vector to correct the high bits of input vector. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector of polynomials with | |||
* corrected high bits | |||
* - const polyveck *v: pointer to input vector | |||
* - const polyveck *u: pointer to input vector | |||
* - const polyveck *h: pointer to input hint vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); | |||
} | |||
} |
@@ -1,58 +1,72 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
/* Vectors of polynomials of length L */ | |||
typedef struct { | |||
poly vec[L]; | |||
} polyvecl; | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery( | |||
poly *w, const polyvecl *u, const polyvecl *v); | |||
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v); | |||
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); | |||
/* Vectors of polynomials of length K */ | |||
typedef struct { | |||
poly vec[K]; | |||
} polyveck; | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add( | |||
polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub( | |||
polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v); | |||
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm( | |||
const polyveck *v, uint32_t B); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round( | |||
polyveck *v1, polyveck *v0, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose( | |||
polyveck *v1, polyveck *v0, const polyveck *v); | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint( | |||
polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint( | |||
polyveck *w, const polyveck *v, const polyveck *h); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); | |||
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); | |||
#endif |
@@ -1,93 +0,0 @@ | |||
#include "cdecl.inc" | |||
# Purpose: in-place partial reduction of the 32-bit words stored at (%rdi).
# Each 32-bit lane x is replaced by  (x & c) - (x >> 23) + ((x >> 23) << 13),
# where c is the per-lane constant loaded from PQCLEAN_DILITHIUM2_AVX2_8x23ones.
# NOTE(review): the constant name suggests c = 2^23 - 1 in every lane, in which
# case the result is congruent to x modulo 2^23 - 2^13 + 1 (presumably Q) - confirm
# against the constant table and params.h.
#
# Input:   %rdi = pointer to the data (the C prototype passes a[N]); the aligned
#          vmovdqa accesses require it to be 32-byte aligned.
# Work:    the loop runs 8 times, handling four 32-byte vectors (128 bytes) per
#          pass, so 1024 bytes are processed in total.
# Clobbers: %eax, %rdi (advanced by 1024), %ymm0-%ymm8, flags.
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx): | |||
#consts | |||
# Keep the per-lane mask constant in ymm0 and zero the iteration counter.
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0 | |||
xor %eax,%eax | |||
_looptop_rdc32: | |||
#load | |||
# Four vectors of eight 32-bit words each go to ymm1, ymm3, ymm5, ymm7.
vmovdqa (%rdi),%ymm1 | |||
vmovdqa 32(%rdi),%ymm3 | |||
vmovdqa 64(%rdi),%ymm5 | |||
vmovdqa 96(%rdi),%ymm7 | |||
#reduce | |||
# High part: logical right shift by 23 into ymm2, ymm4, ymm6, ymm8.
vpsrld $23,%ymm1,%ymm2 | |||
vpsrld $23,%ymm3,%ymm4 | |||
vpsrld $23,%ymm5,%ymm6 | |||
vpsrld $23,%ymm7,%ymm8 | |||
# Low part: mask each word with the constant in ymm0.
vpand %ymm0,%ymm1,%ymm1 | |||
vpand %ymm0,%ymm3,%ymm3 | |||
vpand %ymm0,%ymm5,%ymm5 | |||
vpand %ymm0,%ymm7,%ymm7 | |||
# Subtract the high part from the low part ...
vpsubd %ymm2,%ymm1,%ymm1 | |||
vpsubd %ymm4,%ymm3,%ymm3 | |||
vpsubd %ymm6,%ymm5,%ymm5 | |||
vpsubd %ymm8,%ymm7,%ymm7 | |||
# ... then add the high part shifted left by 13, giving low + high*(2^13 - 1).
vpslld $13,%ymm2,%ymm2 | |||
vpslld $13,%ymm4,%ymm4 | |||
vpslld $13,%ymm6,%ymm6 | |||
vpslld $13,%ymm8,%ymm8 | |||
vpaddd %ymm2,%ymm1,%ymm1 | |||
vpaddd %ymm4,%ymm3,%ymm3 | |||
vpaddd %ymm6,%ymm5,%ymm5 | |||
vpaddd %ymm8,%ymm7,%ymm7 | |||
#store | |||
# Results overwrite the words that were loaded.
vmovdqa %ymm1,(%rdi) | |||
vmovdqa %ymm3,32(%rdi) | |||
vmovdqa %ymm5,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
# Advance to the next 128-byte group; repeat while the counter is below 8.
add $128,%rdi | |||
add $1,%eax | |||
cmp $8,%eax | |||
jb _looptop_rdc32 | |||
ret | |||
# Purpose: in-place conditional subtraction on the 32-bit words stored at (%rdi).
# With q the per-lane constant loaded from PQCLEAN_DILITHIUM2_AVX2_8xq, each lane x
# becomes x - q when x - q is non-negative as a signed 32-bit value, and is left
# as x otherwise. The selection is done with a sign mask, without branches.
# NOTE(review): the constant name suggests q is the modulus Q in every lane - confirm.
#
# Input:   %rdi = pointer to the data (the C prototype passes a[N]); the aligned
#          vmovdqa accesses require it to be 32-byte aligned.
# Work:    the loop runs 8 times, handling 128 bytes per pass (1024 bytes total).
# Clobbers: %eax, %rdi (advanced by 1024), %ymm0-%ymm8, flags.
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx): | |||
#consts | |||
# Keep the per-lane constant q in ymm0 and zero the iteration counter.
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0 | |||
xor %eax,%eax | |||
_looptop_csubq: | |||
#load | |||
# Four vectors of eight 32-bit words each go to ymm1, ymm3, ymm5, ymm7.
vmovdqa (%rdi),%ymm1 | |||
vmovdqa 32(%rdi),%ymm3 | |||
vmovdqa 64(%rdi),%ymm5 | |||
vmovdqa 96(%rdi),%ymm7 | |||
#cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq) | |||
# Unconditionally subtract q from every lane.
vpsubd %ymm0,%ymm1,%ymm1 | |||
vpsubd %ymm0,%ymm3,%ymm3 | |||
vpsubd %ymm0,%ymm5,%ymm5 | |||
vpsubd %ymm0,%ymm7,%ymm7 | |||
# Arithmetic shift by 31 spreads the sign bit: all ones where the difference
# went negative, zero elsewhere.
vpsrad $31,%ymm1,%ymm2 | |||
vpsrad $31,%ymm3,%ymm4 | |||
vpsrad $31,%ymm5,%ymm6 | |||
vpsrad $31,%ymm7,%ymm8 | |||
# Turn the sign mask into either q or 0 ...
vpand %ymm0,%ymm2,%ymm2 | |||
vpand %ymm0,%ymm4,%ymm4 | |||
vpand %ymm0,%ymm6,%ymm6 | |||
vpand %ymm0,%ymm8,%ymm8 | |||
# ... and add it back, undoing the subtraction only in the negative lanes.
vpaddd %ymm2,%ymm1,%ymm1 | |||
vpaddd %ymm4,%ymm3,%ymm3 | |||
vpaddd %ymm6,%ymm5,%ymm5 | |||
vpaddd %ymm8,%ymm7,%ymm7 | |||
#store | |||
# Results overwrite the words that were loaded.
vmovdqa %ymm1,(%rdi) | |||
vmovdqa %ymm3,32(%rdi) | |||
vmovdqa %ymm5,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
# Advance to the next 128-byte group; repeat while the counter is below 8.
add $128,%rdi | |||
add $1,%eax | |||
cmp $8,%eax | |||
jb _looptop_csubq | |||
ret | |||
@@ -1,9 +0,0 @@ | |||
#ifndef REDUCE_H | |||
#define REDUCE_H | |||
#include <stdint.h> | |||
void PQCLEAN_DILITHIUM2_AVX2_reduce_avx(uint32_t a[N]); | |||
void PQCLEAN_DILITHIUM2_AVX2_csubq_avx(uint32_t a[N]); | |||
#endif |
@@ -1,9 +1,10 @@ | |||
#include <immintrin.h> | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
static const uint8_t idx[256][8] = { | |||
const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8] = { | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 0, 0, 0, 0, 0, 0, 0}, | |||
@@ -262,178 +263,144 @@ static const uint8_t idx[256][8] = { | |||
{ 0, 1, 2, 3, 4, 5, 6, 7} | |||
}; | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_uniform( | |||
uint32_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen) { | |||
uint32_t ctr, pos, vec[8]; | |||
__m256i d, tmp; | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
__m256i d, tmp; | |||
const __m256i bound = _mm256_set1_epi32(Q); | |||
const __m256i mask = _mm256_set1_epi32(0x7FFFFF); | |||
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, | |||
-1, 9, 8, 7, -1, 6, 5, 4, | |||
-1, 11, 10, 9, -1, 8, 7, 6, | |||
-1, 5, 4, 3, -1, 2, 1, 0); | |||
ctr = pos = 0; | |||
while (ctr + 8 <= len && pos + 24 <= buflen) { | |||
for (size_t i = 0; i < 8; i++) { | |||
vec[i] = buf[pos++]; | |||
vec[i] |= (uint32_t)buf[pos++] << 8; | |||
vec[i] |= (uint32_t)buf[pos++] << 16; | |||
vec[i] &= 0x7FFFFF; | |||
} | |||
while (pos <= REJ_UNIFORM_BUFLEN - 24) { | |||
d = _mm256_loadu_si256((__m256i *)&buf[pos]); | |||
d = _mm256_permute4x64_epi64(d, 0x94); | |||
d = _mm256_shuffle_epi8(d, idx8); | |||
d = _mm256_and_si256(d, mask); | |||
pos += 24; | |||
d = _mm256_loadu_si256((__m256i_u *)vec); | |||
tmp = _mm256_cmpgt_epi32(bound, d); | |||
tmp = _mm256_sub_epi32(d, bound); | |||
good = _mm256_movemask_ps((__m256)tmp); | |||
__m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); | |||
tmp = _mm256_cvtepu8_epi32(rid); | |||
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good])); | |||
d = _mm256_permutevar8x32_epi32(d, tmp); | |||
_mm256_storeu_si256((__m256i_u *)&r[ctr], d); | |||
ctr += __builtin_popcount(good); | |||
} | |||
while (ctr < len && pos + 3 <= buflen) { | |||
vec[0] = buf[pos++]; | |||
vec[0] |= (uint32_t)buf[pos++] << 8; | |||
vec[0] |= (uint32_t)buf[pos++] << 16; | |||
vec[0] &= 0x7FFFFF; | |||
_mm256_storeu_si256((__m256i *)&r[ctr], d); | |||
ctr += _mm_popcnt_u32(good); | |||
if (vec[0] < Q) { | |||
r[ctr++] = vec[0]; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
} | |||
return ctr; | |||
} | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_eta( | |||
uint32_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen) { | |||
uint32_t ctr, pos; | |||
uint8_t vec[32]; | |||
__m256i tmp0, tmp1; | |||
__m128i d0, d1, rid; | |||
uint32_t good; | |||
const __m256i bound = _mm256_set1_epi8(2 * ETA + 1); | |||
const __m256i off = _mm256_set1_epi32(Q + ETA); | |||
uint32_t t; | |||
while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) { | |||
t = buf[pos++]; | |||
t |= (uint32_t)buf[pos++] << 8; | |||
t |= (uint32_t)buf[pos++] << 16; | |||
t &= 0x7FFFFF; | |||
ctr = pos = 0; | |||
while (ctr + 32 <= len && pos + 16 <= buflen) { | |||
for (size_t i = 0; i < 16; i++) { | |||
vec[2 * i + 0] = buf[pos] & 0x0F; | |||
vec[2 * i + 1] = buf[pos++] >> 4; | |||
} | |||
tmp0 = _mm256_loadu_si256((__m256i_u *)vec); | |||
tmp1 = _mm256_cmpgt_epi8(bound, tmp0); | |||
good = _mm256_movemask_epi8(tmp1); | |||
d0 = _mm256_castsi256_si128(tmp0); | |||
rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]); | |||
d1 = _mm_shuffle_epi8(d0, rid); | |||
tmp1 = _mm256_cvtepu8_epi32(d1); | |||
tmp1 = _mm256_sub_epi32(off, tmp1); | |||
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); | |||
ctr += __builtin_popcount(good & 0xFF); | |||
d0 = _mm_bsrli_si128(d0, 8); | |||
rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]); | |||
d1 = _mm_shuffle_epi8(d0, rid); | |||
tmp1 = _mm256_cvtepu8_epi32(d1); | |||
tmp1 = _mm256_sub_epi32(off, tmp1); | |||
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); | |||
ctr += __builtin_popcount((good >> 8) & 0xFF); | |||
d0 = _mm256_extracti128_si256(tmp0, 1); | |||
rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]); | |||
d1 = _mm_shuffle_epi8(d0, rid); | |||
tmp1 = _mm256_cvtepu8_epi32(d1); | |||
tmp1 = _mm256_sub_epi32(off, tmp1); | |||
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); | |||
ctr += __builtin_popcount((good >> 16) & 0xFF); | |||
d0 = _mm_bsrli_si128(d0, 8); | |||
rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]); | |||
d1 = _mm_shuffle_epi8(d0, rid); | |||
tmp1 = _mm256_cvtepu8_epi32(d1); | |||
tmp1 = _mm256_sub_epi32(off, tmp1); | |||
_mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); | |||
ctr += __builtin_popcount((good >> 24) & 0xFF); | |||
} | |||
while (ctr < len && pos < buflen) { | |||
vec[0] = buf[pos] & 0x0F; | |||
vec[1] = buf[pos++] >> 4; | |||
if (vec[0] <= 2 * ETA) { | |||
r[ctr++] = Q + ETA - vec[0]; | |||
} | |||
if (vec[1] <= 2 * ETA && ctr < len) { | |||
r[ctr++] = Q + ETA - vec[1]; | |||
if (t < Q) { | |||
r[ctr++] = t; | |||
} | |||
} | |||
return ctr; | |||
} | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1( | |||
uint32_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen) { | |||
uint32_t ctr, pos; | |||
uint32_t vec[8]; | |||
__m256i d, tmp; | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1); | |||
const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1); | |||
__m256i f0, f1, f2; | |||
__m128i g0, g1; | |||
const __m256i mask = _mm256_set1_epi8(15); | |||
const __m256i eta = _mm256_set1_epi8(ETA); | |||
const __m256i bound = mask; | |||
const __m256i v = _mm256_set1_epi32(-6560); | |||
const __m256i p = _mm256_set1_epi32(5); | |||
ctr = pos = 0; | |||
while (ctr + 8 <= len && pos + 20 <= buflen) { | |||
for (size_t i = 0; i < 4; i++) { | |||
vec[2 * i + 0] = buf[pos + 0]; | |||
vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8; | |||
vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16; | |||
vec[2 * i + 0] &= 0xFFFFF; | |||
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { | |||
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); | |||
f1 = _mm256_slli_epi16(f0, 4); | |||
f0 = _mm256_or_si256(f0, f1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
vec[2 * i + 1] = buf[pos + 2] >> 4; | |||
vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4; | |||
vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12; | |||
f1 = _mm256_sub_epi8(f0, bound); | |||
f0 = _mm256_sub_epi8(eta, f0); | |||
good = _mm256_movemask_epi8(f1); | |||
pos += 5; | |||
g0 = _mm256_castsi256_si128(f0); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
d = _mm256_loadu_si256((__m256i_u *)vec); | |||
tmp = _mm256_cmpgt_epi32(bound, d); | |||
good = _mm256_movemask_ps((__m256)tmp); | |||
d = _mm256_sub_epi32(off, d); | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm256_extracti128_si256(f0, 1); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
__m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); | |||
tmp = _mm256_cvtepu8_epi32(rid); | |||
d = _mm256_permutevar8x32_epi32(d, tmp); | |||
_mm256_storeu_si256((__m256i_u *)&r[ctr], d); | |||
ctr += __builtin_popcount(good); | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 4; | |||
} | |||
while (ctr < len && pos + 5 <= buflen) { | |||
vec[0] = buf[pos + 0]; | |||
vec[0] |= (uint32_t)buf[pos + 1] << 8; | |||
vec[0] |= (uint32_t)buf[pos + 2] << 16; | |||
vec[0] &= 0xFFFFF; | |||
vec[1] = buf[pos + 2] >> 4; | |||
vec[1] |= (uint32_t)buf[pos + 3] << 4; | |||
vec[1] |= (uint32_t)buf[pos + 4] << 12; | |||
pos += 5; | |||
uint32_t t0, t1; | |||
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { | |||
t0 = buf[pos] & 0x0F; | |||
t1 = buf[pos++] >> 4; | |||
if (vec[0] <= 2 * GAMMA1 - 2) { | |||
r[ctr++] = Q + GAMMA1 - 1 - vec[0]; | |||
if (t0 < 15) { | |||
t0 = t0 - (205 * t0 >> 10) * 5; | |||
r[ctr++] = 2 - t0; | |||
} | |||
if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) { | |||
r[ctr++] = Q + GAMMA1 - 1 - vec[1]; | |||
if (t1 < 15 && ctr < N) { | |||
t1 = t1 - (205 * t1 >> 10) * 5; | |||
r[ctr++] = 2 - t1; | |||
} | |||
} | |||
@@ -1,25 +1,19 @@ | |||
#ifndef REJSAMPLE_H | |||
#define REJSAMPLE_H | |||
#include <stddef.h> | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_uniform( | |||
uint32_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen); | |||
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) | |||
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES) | |||
#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) | |||
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) | |||
extern const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8]; | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_eta( | |||
uint32_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]); | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1( | |||
uint32_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN]); | |||
#endif |
@@ -1,115 +1,157 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "rounding.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
#define _mm256_blendv_epi32(a,b,mask) \ | |||
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ | |||
_mm256_castsi256_ps(b), \ | |||
_mm256_castsi256_ps(mask))) | |||
/************************************************* | |||
* Name: power2round | |||
* | |||
* Description: For finite field element a, compute a0, a1 such that | |||
* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be standard representative. | |||
* Description: For finite field elements a, compute a0, a1 such that | |||
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be positive standard representative. | |||
* | |||
* Arguments: - uint32_t a: input element | |||
* - uint32_t *a0: pointer to output element Q + a0 | |||
* Arguments: - __m256i *a1: output array of length N/8 with high bits | |||
* - __m256i *a0: output array of length N/8 with low bits a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0) { | |||
int32_t t; | |||
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1; | |||
const __m256i mask = _mm256_set1_epi32(-(1 << D)); | |||
const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); | |||
/* Centralized remainder mod 2^D */ | |||
t = a & ((1U << D) - 1); | |||
t -= (1U << (D - 1)) + 1; | |||
t += (t >> 31) & (1U << D); | |||
t -= (1U << (D - 1)) - 1; | |||
*a0 = Q + t; | |||
a = (a - t) >> D; | |||
return a; | |||
for (i = 0; i < N / 8; ++i) { | |||
f = _mm256_load_si256(&a[i]); | |||
f1 = _mm256_add_epi32(f, half); | |||
f0 = _mm256_and_si256(f1, mask); | |||
f1 = _mm256_srli_epi32(f1, D); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_decompose | |||
* Name: decompose | |||
* | |||
* Description: For finite field element a, compute high and low bits a0, a1 such | |||
* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* Description: For finite field element a, compute high and low parts a0, a1 such | |||
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and | |||
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard | |||
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard | |||
* representative. | |||
* | |||
* Arguments: - uint32_t a: input element | |||
* - uint32_t *a0: pointer to output element Q + a0 | |||
* Arguments: - __m256i *a1: output array of length N/8 with high parts | |||
* - __m256i *a0: output array of length N/8 with low parts a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0) { | |||
int32_t t, u; | |||
/* Centralized remainder mod ALPHA */ | |||
t = a & 0x7FFFF; | |||
t += (a >> 19) << 9; | |||
t -= ALPHA / 2 + 1; | |||
t += (t >> 31) & ALPHA; | |||
t -= ALPHA / 2 - 1; | |||
a -= t; | |||
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1, t; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2_AVX2_qdata.vec[_8XQ / 8]); | |||
const __m256i hq = _mm256_srli_epi32(q, 1); | |||
const __m256i v = _mm256_set1_epi32(11275); | |||
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); | |||
const __m256i off = _mm256_set1_epi32(127); | |||
const __m256i shift = _mm256_set1_epi32(128); | |||
const __m256i max = _mm256_set1_epi32(43); | |||
const __m256i zero = _mm256_setzero_si256(); | |||
/* Divide by ALPHA (possible to avoid) */ | |||
u = a - 1; | |||
u >>= 31; | |||
a = (a >> 19) + 1; | |||
a -= u & 1; | |||
/* Border case */ | |||
*a0 = Q + t - (a >> 4); | |||
a &= 0xF; | |||
return a; | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a[i]); | |||
f1 = _mm256_add_epi32(f, off); | |||
f1 = _mm256_srli_epi32(f1, 7); | |||
f1 = _mm256_mulhi_epu16(f1, v); | |||
f1 = _mm256_mulhrs_epi16(f1, shift); | |||
t = _mm256_sub_epi32(max, f1); | |||
f1 = _mm256_blendv_epi32(f1, zero, t); | |||
f0 = _mm256_mullo_epi32(f1, alpha); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
f = _mm256_cmpgt_epi32(f0, hq); | |||
f = _mm256_and_si256(f, q); | |||
f0 = _mm256_sub_epi32(f0, f); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_make_hint | |||
* Name: make_hint | |||
* | |||
* Description: Compute hint bit indicating whether the low bits of the | |||
* input element overflow into the high bits. Inputs assumed to be | |||
* standard representatives. | |||
* Description: Compute indices of polynomial coefficients whose low bits | |||
* overflow into the high bits. | |||
* | |||
* Arguments: - uint32_t a0: low bits of input element | |||
* - uint32_t a1: high bits of input element | |||
* Arguments: - uint8_t *hint: hint array | |||
* - const __m256i *a0: low bits of input elements | |||
* - const __m256i *a1: high bits of input elements | |||
* | |||
* Returns 1 if high bits of a and b differ and 0 otherwise. | |||
* Returns number of overflowing low bits | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(const uint32_t a0, const uint32_t a1) { | |||
if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { | |||
return 0; | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { | |||
unsigned int i, n = 0; | |||
__m256i f0, f1, g0, g1; | |||
uint32_t bad; | |||
uint64_t idx; | |||
const __m256i low = _mm256_set1_epi32(-GAMMA2); | |||
const __m256i high = _mm256_set1_epi32(GAMMA2); | |||
for (i = 0; i < N / 8; ++i) { | |||
f0 = _mm256_load_si256(&a0[i]); | |||
f1 = _mm256_load_si256(&a1[i]); | |||
g0 = _mm256_abs_epi32(f0); | |||
g0 = _mm256_cmpgt_epi32(g0, high); | |||
g1 = _mm256_cmpeq_epi32(f0, low); | |||
g1 = _mm256_sign_epi32(g1, f1); | |||
g0 = _mm256_or_si256(g0, g1); | |||
bad = _mm256_movemask_ps((__m256)g0); | |||
memcpy(&idx, PQCLEAN_DILITHIUM2_AVX2_idxlut[bad], 8); | |||
idx += (uint64_t)0x0808080808080808 * i; | |||
memcpy(&hint[n], &idx, 8); | |||
n += _mm_popcnt_u32(bad); | |||
} | |||
return 1; | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_use_hint | |||
* Name: use_hint | |||
* | |||
* Description: Correct high bits according to hint. | |||
* Description: Correct high parts according to hint. | |||
* | |||
* Arguments: - uint32_t a: input element | |||
* - unsigned int hint: hint bit | |||
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts | |||
* - const __m256i *a: input array of length N/8 | |||
* - const __m256i *hint: input array of length N/8 with hint bits | |||
* | |||
* Returns corrected high bits. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(const uint32_t a, const unsigned int hint) { | |||
uint32_t a0, a1; | |||
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { | |||
unsigned int i; | |||
__m256i a0[N / 8]; | |||
__m256i f, g, h, t; | |||
const __m256i zero = _mm256_setzero_si256(); | |||
const __m256i max = _mm256_set1_epi32(43); | |||
a1 = PQCLEAN_DILITHIUM2_AVX2_decompose(a, &a0); | |||
if (hint == 0) { | |||
return a1; | |||
PQCLEAN_DILITHIUM2_AVX2_decompose_avx(b, a0, a); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a0[i]); | |||
g = _mm256_load_si256(&b[i]); | |||
h = _mm256_load_si256(&hint[i]); | |||
t = _mm256_blendv_epi32(zero, h, f); | |||
t = _mm256_slli_epi32(t, 1); | |||
h = _mm256_sub_epi32(h, t); | |||
g = _mm256_add_epi32(g, h); | |||
g = _mm256_blendv_epi32(g, max, g); | |||
f = _mm256_cmpgt_epi32(g, max); | |||
g = _mm256_blendv_epi32(g, zero, f); | |||
_mm256_store_si256(&b[i], g); | |||
} | |||
if (a0 > Q) { | |||
return (a1 + 1) & 0xF; | |||
} | |||
return (a1 - 1) & 0xF; | |||
/* If decompose does not divide out ALPHA: | |||
if(hint == 0) | |||
return a1; | |||
else if(a0 > Q) | |||
return (a1 + ALPHA) % (Q - 1); | |||
else | |||
return (a1 - ALPHA) % (Q - 1); | |||
*/ | |||
} |
@@ -1,12 +1,12 @@ | |||
#ifndef ROUNDING_H | |||
#define ROUNDING_H | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0); | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(uint32_t a0, uint32_t a1); | |||
uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(uint32_t a, unsigned int hint); | |||
void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); | |||
void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); | |||
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); | |||
void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); | |||
#endif |
@@ -0,0 +1,54 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
.text | |||
# Purpose: file-local helper (no .global) that rearranges one 256-byte group
# at (%rdi) in place. Eight 32-byte vectors are loaded, passed through the
# shuffle8, shuffle4 and shuffle2 macros from shuffle.inc, and written back
# to the same eight slots.
# NOTE(review): the macros take operands as r0,r1,r2,r3 (register numbers) -
# presumably two inputs followed by two outputs; the 8/4/2 sequence looks like
# an 8x8 transpose of 32-bit lanes. Confirm both against shuffle.inc.
#
# Input:   %rdi = pointer to the 256-byte group; the aligned vmovdqa accesses
#          require it to be 32-byte aligned. %rdi itself is left unchanged.
# Clobbers: %ymm3-%ymm11 here, plus any scratch registers the macros use.
nttunpack128_avx: | |||
#load | |||
# Vectors at offsets 0, 32, ..., 224 go to ymm4 through ymm11 in order.
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
# First round: pairs vector k with vector k+4 (ymm4/8, 5/9, 6/10, 7/11).
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
# Second round: operates on the register pairs produced by the first round.
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
# Third round: operates on the register pairs produced by the second round.
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
#store | |||
# Final register order ymm9, 8, 7, 6, 5, 4, 3, 11 maps to offsets 0 through 224.
vmovdqa %ymm9,(%rdi) | |||
vmovdqa %ymm8,32(%rdi) | |||
vmovdqa %ymm7,64(%rdi) | |||
vmovdqa %ymm6,96(%rdi) | |||
vmovdqa %ymm5,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm3,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
# Purpose: exported entry point that applies nttunpack128_avx to four
# consecutive 256-byte groups starting at (%rdi), i.e. 1024 bytes in place.
# The symbol is exported under both the cdecl(...) and _cdecl(...) spellings.
# NOTE(review): presumably this covers platforms with and without a leading
# underscore on C symbols - confirm against cdecl.h.
#
# Input:   %rdi = pointer to the data; the helper uses aligned accesses, so it
#          must be 32-byte aligned.
# Clobbers: %rdi (advanced by 768 on return) and whatever the helper clobbers.
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx) | |||
cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx): | |||
# The helper leaves %rdi untouched, so step it by 256 between the calls.
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
ret | |||
@@ -9,15 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle2 r0,r1,r2,r3 | |||
vpsllq $32,%ymm\r1,%ymm12 | |||
vpsrlq $32,%ymm\r0,%ymm13 | |||
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 | |||
#vpsllq $32,%ymm\r1,%ymm\r2 | |||
vmovsldup %ymm\r1,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrlq $32,%ymm\r0,%ymm\r0 | |||
#vmovshdup %ymm\r0,%ymm\r0 | |||
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle1 r0,r1,r2,r3 | |||
vpslld $16,%ymm\r1,%ymm12 | |||
vpsrld $16,%ymm\r0,%ymm13 | |||
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 | |||
vpslld $16,%ymm\r1,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrld $16,%ymm\r0,%ymm\r0 | |||
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm |
@@ -1,6 +1,4 @@ | |||
#include <stdint.h> | |||
#include <string.h> | |||
#include "align.h" | |||
#include "fips202.h" | |||
#include "packing.h" | |||
#include "params.h" | |||
@@ -9,93 +7,28 @@ | |||
#include "randombytes.h" | |||
#include "sign.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
/************************************************* | |||
* Name: expand_mat | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|i|j). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[4], const uint8_t rho[SEEDBYTES]) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[0].vec[0], | |||
&mat[0].vec[1], | |||
&mat[0].vec[2], | |||
&mat[1].vec[0], | |||
rho, 0, 1, 2, 256); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[1].vec[1], | |||
&mat[1].vec[2], | |||
&mat[2].vec[0], | |||
&mat[2].vec[1], | |||
rho, 257, 258, 512, 513); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[2].vec[2], | |||
&mat[3].vec[0], | |||
&mat[3].vec[1], | |||
&mat[3].vec[2], | |||
rho, 514, 768, 769, 770); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_challenge | |||
* | |||
* Description: Implementation of H. Samples polynomial with 60 nonzero | |||
* coefficients in {-1,1} using the output stream of | |||
* SHAKE256(mu|w1). | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const uint8_t mu[]: byte array containing mu | |||
* - const polyveck *w1: pointer to vector w1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, | |||
const uint8_t mu[CRHBYTES], | |||
const polyveck *w1) { | |||
uint8_t b; | |||
size_t pos; | |||
uint64_t signs; | |||
uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; | |||
uint8_t outbuf[SHAKE256_RATE]; | |||
shake256ctx state; | |||
for (size_t i = 0; i < CRHBYTES; ++i) { | |||
inbuf[i] = mu[i]; | |||
} | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); | |||
} | |||
shake256_absorb(&state, inbuf, sizeof(inbuf)); | |||
shake256_squeezeblocks(outbuf, 1, &state); | |||
signs = 0; | |||
for (size_t i = 0; i < 8; ++i) { | |||
signs |= (uint64_t) outbuf[i] << 8 * i; | |||
} | |||
pos = 8; | |||
for (size_t i = 0; i < N; ++i) { | |||
c->coeffs[i] = 0; | |||
} | |||
for (size_t i = 196; i < 256; ++i) { | |||
do { | |||
if (pos >= SHAKE256_RATE) { | |||
shake256_squeezeblocks(outbuf, 1, &state); | |||
pos = 0; | |||
} | |||
b = outbuf[pos++]; | |||
} while (b > i); | |||
c->coeffs[i] = c->coeffs[b]; | |||
c->coeffs[b] = 1; | |||
c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1)); | |||
signs >>= 1; | |||
static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) { | |||
switch (i) { | |||
case 0: | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(buf, buf + 1, rho); | |||
*row = buf; | |||
break; | |||
case 1: | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(buf + 1, buf, rho); | |||
*row = buf + 1; | |||
break; | |||
case 2: | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(buf, buf + 1, rho); | |||
*row = buf; | |||
break; | |||
case 3: | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(buf + 1, buf, rho); | |||
*row = buf + 1; | |||
break; | |||
} | |||
shake256_ctx_release(&state); | |||
} | |||
/************************************************* | |||
@@ -104,56 +37,69 @@ void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, | |||
* Description: Generates public and private key. | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (allocated | |||
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes) | |||
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (allocated | |||
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes) | |||
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
unsigned int i; | |||
uint8_t seedbuf[3 * SEEDBYTES]; | |||
uint8_t tr[CRHBYTES]; | |||
const uint8_t *rho, *rhoprime, *key; | |||
uint16_t nonce = 0; | |||
polyvecl mat[K]; | |||
polyvecl s1, s1hat; | |||
polyveck s2, t, t1, t0; | |||
/* Expand 32 bytes of randomness into rho, rhoprime and key */ | |||
randombytes(seedbuf, 3 * SEEDBYTES); | |||
polyvecl rowbuf[2]; | |||
polyvecl s1, *row = rowbuf; | |||
polyveck s2; | |||
poly t1, t0; | |||
/* Get randomness for rho, rhoprime and key */ | |||
randombytes(seedbuf, SEEDBYTES); | |||
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); | |||
rho = seedbuf; | |||
rhoprime = seedbuf + SEEDBYTES; | |||
key = seedbuf + 2 * SEEDBYTES; | |||
/* Expand matrix */ | |||
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho); | |||
/* Store rho, key */ | |||
memcpy(pk, rho, SEEDBYTES); | |||
memcpy(sk, rho, SEEDBYTES); | |||
memcpy(sk + SEEDBYTES, key, SEEDBYTES); | |||
/* Sample short vectors s1 and s2 */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s2.vec[0], rhoprime, | |||
nonce, nonce + 1, nonce + 2, nonce + 3); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[1], &s2.vec[2], &s2.vec[3], &t.vec[0], rhoprime, | |||
nonce + 4, nonce + 5, nonce + 6, 0); | |||
/* Matrix-vector multiplication */ | |||
s1hat = s1; | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1hat); | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); | |||
//PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&t.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&t.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime, 4, 5, 6, 7); | |||
/* Pack secret vectors */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); | |||
} | |||
for (i = 0; i < K; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); | |||
} | |||
/* Transform s1 */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1); | |||
/* Add error vector s2 */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&t, &t, &s2); | |||
/* Extract t1 and write public key */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&t); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(&t1, &t0, &t); | |||
PQCLEAN_DILITHIUM2_AVX2_pack_pk(pk, rho, &t1); | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
polyvec_matrix_expand_row(&row, rowbuf, rho, i); | |||
/* Compute CRH(rho, t1) and write secret key */ | |||
crh(tr, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
PQCLEAN_DILITHIUM2_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); | |||
/* Compute inner-product */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&t1); | |||
/* Add error polynomial */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&t1, &t1, &s2.vec[i]); | |||
/* Round t and pack t1, t0 */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&t1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&t1, &t0, &t1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); | |||
} | |||
/* Compute CRH(rho, t1) and store in secret key */ | |||
crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
return 0; | |||
} | |||
@@ -161,42 +107,40 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature | |||
* | |||
* Description: Compute signed message. | |||
* Description: Computes signature. | |||
* | |||
* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES | |||
* of len) | |||
* - size_t *siglen: pointer to output length of signed message | |||
* (should be PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) | |||
* - uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - uint8_t *sk: pointer to bit-packed secret key | |||
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) | |||
* - size_t *siglen: pointer to output length of signature | |||
* - uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk) { | |||
uint32_t n; | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
unsigned int i, n, pos; | |||
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; | |||
uint8_t *rho, *tr, *key, *mu, *rhoprime; | |||
uint16_t nonce = 0; | |||
poly c, chat; | |||
polyvecl mat[K], s1, y, yhat, z; | |||
polyveck t0, s2, w, w1, w0; | |||
polyveck h, cs2, ct0; | |||
uint8_t hintbuf[N]; | |||
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
uint64_t nonce = 0; | |||
polyvecl mat[K], s1, z; | |||
polyveck t0, s2, w1; | |||
poly c, tmp; | |||
union { | |||
polyvecl y; | |||
polyveck w0; | |||
} tmpv; | |||
shake256incctx state; | |||
rho = seedbuf; | |||
tr = rho + SEEDBYTES; | |||
key = tr + CRHBYTES; | |||
mu = key + SEEDBYTES; | |||
rhoprime = mu + CRHBYTES; | |||
PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); | |||
PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); | |||
// use incremental hash API instead of copying around buffers | |||
/* Compute CRH(tr, m) */ | |||
shake256incctx state; | |||
/* Compute CRH(tr, msg) */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, tr, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
@@ -207,76 +151,88 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( | |||
crh(rhoprime, key, SEEDBYTES + CRHBYTES); | |||
/* Expand matrix and transform vectors */ | |||
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&s2); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t0); | |||
rej: | |||
/* Sample intermediate vector y */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &yhat.vec[0], | |||
rhoprime, nonce, nonce + 1, nonce + 2, 0); | |||
nonce += 3; | |||
/* Matrix-vector multiplication */ | |||
yhat = y; | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&yhat); | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&w.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3], | |||
rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); | |||
nonce += 4; | |||
/* Matrix-vector product */ | |||
tmpv.y = z; | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&tmpv.y); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(&w1); | |||
/* Decompose w and call the random oracle */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &w0, &w); | |||
PQCLEAN_DILITHIUM2_AVX2_challenge(&c, mu, &w1); | |||
chat = c; | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat); | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&cs2.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&w0, &w0, &cs2); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&w0); | |||
if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(&w1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(sig, &w1); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(sig, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c); | |||
/* Compute z, reject if it reveals secret */ | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&z.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(&z, &z, &y); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(&z); | |||
if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
goto rej; | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&z.vec[i]); | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { | |||
goto rej; | |||
} | |||
} | |||
/* Compute hints for w1 */ | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&ct0.vec[i]); | |||
} | |||
/* Zero hint vector in signature */ | |||
pos = 0; | |||
memset(hint, 0, OMEGA); | |||
for (i = 0; i < K; i++) { | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmpv.w0.vec[i]); | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
/* Compute hints */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmp); | |||
if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmp, GAMMA2)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&ct0); | |||
if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&ct0, GAMMA2)) { | |||
goto rej; | |||
PQCLEAN_DILITHIUM2_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
n = PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); | |||
if (pos + n > OMEGA) { | |||
goto rej; | |||
} | |||
/* Store hints in signature */ | |||
memcpy(&hint[pos], hintbuf, n); | |||
hint[OMEGA + i] = pos = pos + n; | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&w0, &w0, &ct0); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w0); | |||
n = PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(&h, &w0, &w1); | |||
if (n > OMEGA) { | |||
goto rej; | |||
/* Pack z into signature */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); | |||
} | |||
/* Write signature */ | |||
PQCLEAN_DILITHIUM2_AVX2_pack_sig(sig, &z, &h, &c); | |||
*siglen = PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; | |||
return 0; | |||
} | |||
@@ -290,63 +246,55 @@ rej: | |||
* array with PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen bytes), | |||
* can be equal to m | |||
* - size_t *smlen: pointer to output length of signed | |||
* message | |||
* message | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk) { | |||
int rc; | |||
memmove(sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, m, mlen); | |||
rc = PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, m, mlen, sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
size_t i; | |||
for (i = 0; i < mlen; ++i) { | |||
sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, mlen, sk); | |||
*smlen += mlen; | |||
return rc; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify | |||
* | |||
* Description: Verify signed message. | |||
* Description: Verifies signature. | |||
* | |||
* Arguments: - uint8_t *sig: signature | |||
* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) | |||
* - uint8_t *m: pointer to message | |||
* - size_t *mlen: pointer to output length of message | |||
* - uint8_t *pk: pointer to bit-packed public key | |||
* Arguments: - uint8_t *sig: pointer to input signature | |||
* - size_t siglen: length of signature | |||
* - const uint8_t *m: pointer to message | |||
* - size_t mlen: length of message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
* Returns 0 if signature could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *pk) { | |||
uint8_t rho[SEEDBYTES]; | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
unsigned int i, j, pos = 0; | |||
/* PQCLEAN_DILITHIUM2_AVX2_polyw1_pack writes additional 14 bytes */ | |||
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; | |||
uint8_t mu[CRHBYTES]; | |||
poly c, chat, cp; | |||
polyvecl mat[K], z; | |||
polyveck t1, w1, h, tmp1, tmp2; | |||
if (siglen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
polyvecl rowbuf[2]; | |||
polyvecl *row = rowbuf; | |||
polyvecl z; | |||
poly c, w1, h; | |||
shake256incctx state; | |||
PQCLEAN_DILITHIUM2_AVX2_unpack_pk(rho, &t1, pk); | |||
if (PQCLEAN_DILITHIUM2_AVX2_unpack_sig(&z, &h, &c, sig)) { | |||
return -1; | |||
} | |||
if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
if (siglen != PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
/* Compute CRH(CRH(rho, t1), msg) */ | |||
crh(mu, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
shake256incctx state; | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
@@ -354,33 +302,69 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* Matrix-vector multiplication; compute Az - c2^dt1 */ | |||
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho); | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&z); | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); | |||
} | |||
/* Expand PQCLEAN_DILITHIUM2_AVX2_challenge */ | |||
PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c); | |||
chat = c; | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(&t1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t1); | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); | |||
/* Unpack z; shortness follows from unpacking */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&z.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(&tmp1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(&tmp1); | |||
/* Reconstruct w1 */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&tmp1); | |||
PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(&w1, &tmp1, &h); | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
polyvec_matrix_expand_row(&row, rowbuf, pk, i); | |||
/* Call random oracle and verify challenge */ | |||
PQCLEAN_DILITHIUM2_AVX2_challenge(&cp, mu, &w1); | |||
for (size_t i = 0; i < N; ++i) { | |||
if (c.coeffs[i] != cp.coeffs[i]) { | |||
/* Compute i-th row of Az - c2^Dt1 */ | |||
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); | |||
PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&h, &c, &h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&w1); | |||
/* Get hint polynomial and reconstruct w1 */ | |||
memset(h.vec, 0, sizeof(poly)); | |||
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { | |||
return -1; | |||
} | |||
for (j = pos; j < hint[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > pos && hint[j] <= hint[j - 1]) { | |||
return -1; | |||
} | |||
h.coeffs[hint[j]] = 1; | |||
} | |||
pos = hint[OMEGA + i]; | |||
PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&w1); | |||
PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = pos; j < OMEGA; ++j) { | |||
if (hint[j]) { | |||
return -1; | |||
} | |||
} | |||
/* Call random oracle and verify PQCLEAN_DILITHIUM2_AVX2_challenge */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
if (buf.coeffs[i] != sig[i]) { | |||
return -1; | |||
} | |||
} | |||
@@ -394,7 +378,7 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( | |||
* Description: Verify signed message. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output message (allocated | |||
* array with smlen bytes), can be equal to sm | |||
* array with smlen bytes), can be equal to sm | |||
* - size_t *mlen: pointer to output length of message | |||
* - const uint8_t *sm: pointer to signed message | |||
* - size_t smlen: length of signed message | |||
@@ -402,30 +386,28 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, | |||
const uint8_t *pk) { | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { | |||
size_t i; | |||
if (smlen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { | |||
goto badsig; | |||
} | |||
*mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, | |||
sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) { | |||
*mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) { | |||
goto badsig; | |||
} else { | |||
/* All good, copy msg, return 0 */ | |||
for (size_t i = 0; i < *mlen; ++i) { | |||
for (i = 0; i < *mlen; ++i) { | |||
m[i] = sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + i]; | |||
} | |||
return 0; | |||
} | |||
/* Signature verification failed */ | |||
badsig: | |||
*mlen = (size_t) -1; | |||
for (size_t i = 0; i < smlen; ++i) { | |||
/* Signature verification failed */ | |||
*mlen = -1; | |||
for (i = 0; i < smlen; ++i) { | |||
m[i] = 0; | |||
} | |||
@@ -1,15 +1,29 @@ | |||
#ifndef SIGN_H | |||
#define SIGN_H | |||
#include "api.h" | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_SIGN_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_SIGN_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES], | |||
const polyveck *w1); | |||
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); | |||
#endif | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *pk); | |||
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, | |||
const uint8_t *pk); | |||
#endif |
@@ -1,26 +0,0 @@ | |||
#include "stream.h" | |||
#include <string.h> | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init
*
* Description: Builds the byte string seed || nonce in a local buffer
*              and absorbs it into a SHAKE128 state with a single
*              shake128_absorb call.
*
* Arguments:   - shake128ctx *state: state handed to shake128_absorb
*              - const uint8_t seed[SEEDBYTES]: seed, copied to the
*                front of the absorbed buffer
*              - uint16_t nonce: appended after the seed as two bytes,
*                low byte first
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init( | |||
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { | |||
/* Room for the seed plus the two nonce bytes */
uint8_t buf[SEEDBYTES + 2]; | |||
memcpy(buf, seed, SEEDBYTES); | |||
/* Nonce is serialized little-endian: low byte, then high byte */
buf[SEEDBYTES] = (uint8_t)nonce; | |||
buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); | |||
shake128_absorb(state, buf, SEEDBYTES + 2); | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init
*
* Description: Builds the byte string seed || nonce in a local buffer
*              and absorbs it into a SHAKE256 state with a single
*              shake256_absorb call.
*
* Arguments:   - shake256ctx *state: state handed to shake256_absorb
*              - const uint8_t seed[CRHBYTES]: seed, copied to the
*                front of the absorbed buffer
*              - uint16_t nonce: appended after the seed as two bytes,
*                low byte first
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init( | |||
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { | |||
/* Room for the seed plus the two nonce bytes */
uint8_t buf[CRHBYTES + 2]; | |||
memcpy(buf, seed, CRHBYTES); | |||
/* Nonce is serialized little-endian: low byte, then high byte */
buf[CRHBYTES] = (uint8_t)nonce; | |||
buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); | |||
shake256_absorb(state, buf, CRHBYTES + 2); | |||
} |
@@ -1,15 +0,0 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_STREAM_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_STREAM_H | |||
#include <stdint.h> | |||
#include "fips202.h" | |||
#include "params.h" | |||
void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init( | |||
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init( | |||
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); | |||
#endif |
@@ -0,0 +1,26 @@ | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init
*
* Description: Initializes an incremental SHAKE128 state, absorbs the
*              seed followed by the two-byte nonce, and finalizes the
*              absorb phase. No intermediate seed || nonce buffer is
*              built; the two parts are absorbed separately.
*
* Arguments:   - shake128incctx *state: state that is initialized,
*                fed and finalized here
*              - const uint8_t seed[SEEDBYTES]: seed bytes, absorbed first
*              - uint16_t nonce: absorbed after the seed as two bytes,
*                low byte first
*
* NOTE(review): presumably the caller squeezes from the state afterwards
* and releases it when done -- confirm against the callers.
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { | |||
/* Little-endian encoding of the nonce */
uint8_t t[2]; | |||
t[0] = (uint8_t) nonce; | |||
t[1] = (uint8_t) (nonce >> 8); | |||
shake128_inc_init(state); | |||
shake128_inc_absorb(state, seed, SEEDBYTES); | |||
shake128_inc_absorb(state, t, 2); | |||
shake128_inc_finalize(state); | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init
*
* Description: Initializes an incremental SHAKE256 state, absorbs the
*              seed followed by the two-byte nonce, and finalizes the
*              absorb phase. No intermediate seed || nonce buffer is
*              built; the two parts are absorbed separately.
*
* Arguments:   - shake256incctx *state: state that is initialized,
*                fed and finalized here
*              - const uint8_t seed[CRHBYTES]: seed bytes, absorbed first
*              - uint16_t nonce: absorbed after the seed as two bytes,
*                low byte first
*
* NOTE(review): presumably the caller squeezes from the state afterwards
* and releases it when done -- confirm against the callers.
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { | |||
/* Little-endian encoding of the nonce */
uint8_t t[2]; | |||
t[0] = (uint8_t) nonce; | |||
t[1] = (uint8_t) (nonce >> 8); | |||
shake256_inc_init(state); | |||
shake256_inc_absorb(state, seed, CRHBYTES); | |||
shake256_inc_absorb(state, t, 2); | |||
shake256_inc_finalize(state); | |||
} |
@@ -1,25 +1,36 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H | |||
#define PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "stream.h" | |||
#include <stdint.h> | |||
#include "fips202.h" | |||
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) | |||
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(STATE, SEED, NONCE) | |||
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) | |||
#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(STATE, SEED, NONCE) | |||
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) | |||
typedef shake128incctx stream128_state; | |||
typedef shake256incctx stream256_state; | |||
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce); | |||
#define STREAM128_BLOCKBYTES SHAKE128_RATE | |||
#define STREAM256_BLOCKBYTES SHAKE256_RATE | |||
typedef shake128ctx stream128_state; | |||
typedef shake256ctx stream256_state; | |||
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) | |||
#define stream128_init(STATE, SEED, NONCE) \ | |||
PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE) | |||
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) | |||
#define stream128_release(STATE) shake128_inc_ctx_release(STATE) | |||
#define stream256_init(STATE, SEED, NONCE) \ | |||
PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE) | |||
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) | |||
#define stream256_release(STATE) shake256_inc_ctx_release(STATE) | |||
#endif |
@@ -1,6 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and the random number generator | |||
we are using public-domain code from sources | |||
and by authors listed in comments on top of | |||
the respective files. | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -1,13 +1,10 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libdilithium2_clean.a | |||
HEADERS=api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h | |||
OBJECTS=ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o | |||
SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c | |||
OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o | |||
HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ | |||
reduce.h rounding.h symmetric.h stream.h | |||
CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
all: $(LIB) | |||
@@ -2,8 +2,13 @@ | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libdilithium2_clean.lib | |||
OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj | |||
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX | |||
OBJECTS=ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj | |||
# Warning C4146 is raised when a unary minus operator is applied to an | |||
# unsigned type; this has nonetheless been standard and portable for as | |||
# long as there has been a C standard, and we need it for constant-time | |||
# computations. Thus, we disable that spurious warning. | |||
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 | |||
all: $(LIBRARY) | |||
@@ -11,7 +16,7 @@ all: $(LIBRARY) | |||
$(OBJECTS): *.h | |||
$(LIBRARY): $(OBJECTS) | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
clean: | |||
-DEL $(OBJECTS) | |||
@@ -4,26 +4,13 @@ | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1184U | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2800U | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2044U | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1312 | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2544 | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2420 | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2" | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair( | |||
uint8_t *pk, uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *msg, size_t len, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, | |||
const uint8_t *pk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
@@ -33,6 +20,12 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk); | |||
#endif |
@@ -1,138 +1,98 @@ | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include "ntt.h" | |||
#include "poly.h" | |||
#include "params.h" | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM2_CLEAN_ntt */ | |||
static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas[N] = { | |||
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, | |||
2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464, | |||
1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231, | |||
4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, 2706023, | |||
95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, 4519302, | |||
5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, 6681150, | |||
6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, 811944, | |||
531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, 4450022, | |||
6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, 7122806, | |||
1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, 3412210, | |||
7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, 7709315, | |||
7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, 5037034, | |||
264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, 4656075, 8371839, | |||
1653064, 5130689, 2389356, 8169440, 759969, 7063561, 189548, 4827145, | |||
3159746, 6529015, 5971092, 8202977, 1315589, 1341330, 1285669, 6795489, | |||
7567685, 6940675, 5361315, 4499357, 4751448, 3839961, 2091667, 3407706, | |||
2316500, 3817976, 5037939, 2244091, 5933984, 4817955, 266997, 2434439, | |||
7144689, 3513181, 4860065, 4621053, 7183191, 5187039, 900702, 1859098, | |||
909542, 819034, 495491, 6767243, 8337157, 7857917, 7725090, 5257975, | |||
2031748, 3207046, 4823422, 7855319, 7611795, 4784579, 342297, 286988, | |||
5942594, 4108315, 3437287, 5038140, 1735879, 203044, 2842341, 2691481, | |||
5790267, 1265009, 4055324, 1247620, 2486353, 1595974, 4613401, 1250494, | |||
2635921, 4832145, 5386378, 1869119, 1903435, 7329447, 7047359, 1237275, | |||
5062207, 6950192, 7929317, 1312455, 3306115, 6417775, 7100756, 1917081, | |||
5834105, 7005614, 1500165, 777191, 2235880, 3406031, 7838005, 5548557, | |||
6709241, 6533464, 5796124, 4656147, 594136, 4603424, 6366809, 2432395, | |||
2454455, 8215696, 1957272, 3369112, 185531, 7173032, 5196991, 162844, | |||
1616392, 3014001, 810149, 1652634, 4686184, 6581310, 5341501, 3523897, | |||
3866901, 269760, 2213111, 7404533, 1717735, 472078, 7953734, 1723600, | |||
6577327, 1910376, 6712985, 7276084, 8119771, 4546524, 5441381, 6144432, | |||
7959518, 6094090, 183443, 7403526, 1612842, 4834730, 7826001, 3919660, | |||
8332111, 7018208, 3937738, 1400424, 7534263, 1976782 | |||
}; | |||
/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM2_CLEAN_ntt */ | |||
static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[N] = { | |||
6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416, | |||
3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036, | |||
3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683, | |||
7908339, 6662682, 975884, 6167306, 8110657, 4513516, 4856520, 3038916, | |||
1799107, 3694233, 6727783, 7570268, 5366416, 6764025, 8217573, 3183426, | |||
1207385, 8194886, 5011305, 6423145, 164721, 5925962, 5948022, 2013608, | |||
3776993, 7786281, 3724270, 2584293, 1846953, 1671176, 2831860, 542412, | |||
4974386, 6144537, 7603226, 6880252, 1374803, 2546312, 6463336, 1279661, | |||
1962642, 5074302, 7067962, 451100, 1430225, 3318210, 7143142, 1333058, | |||
1050970, 6476982, 6511298, 2994039, 3548272, 5744496, 7129923, 3767016, | |||
6784443, 5894064, 7132797, 4325093, 7115408, 2590150, 5688936, 5538076, | |||
8177373, 6644538, 3342277, 4943130, 4272102, 2437823, 8093429, 8038120, | |||
3595838, 768622, 525098, 3556995, 5173371, 6348669, 3122442, 655327, | |||
522500, 43260, 1613174, 7884926, 7561383, 7470875, 6521319, 7479715, | |||
3193378, 1197226, 3759364, 3520352, 4867236, 1235728, 5945978, 8113420, | |||
3562462, 2446433, 6136326, 3342478, 4562441, 6063917, 4972711, 6288750, | |||
4540456, 3628969, 3881060, 3019102, 1439742, 812732, 1584928, 7094748, | |||
7039087, 7064828, 177440, 2409325, 1851402, 5220671, 3553272, 8190869, | |||
1316856, 7620448, 210977, 5991061, 3249728, 6727353, 8578, 3724342, | |||
4421799, 7475901, 1100098, 8336129, 5282425, 7871466, 8115473, 3343383, | |||
1430430, 6527646, 7031341, 381987, 1308169, 22981, 1228525, 671102, | |||
2477047, 411027, 3693493, 2967645, 5665122, 6232521, 983419, 4968207, | |||
8253495, 3632928, 3157330, 3190144, 1000202, 4083598, 6441103, 1257611, | |||
1585221, 6203962, 4904467, 1452451, 3041255, 3677745, 1528703, 3930395, | |||
2797779, 6308525, 2556880, 4479693, 4499374, 7426187, 7849063, 7568473, | |||
4680821, 1600420, 2140649, 4873154, 3821735, 4874723, 1643818, 1699267, | |||
539299, 6031717, 300467, 4840449, 2867647, 4805995, 3043716, 3861115, | |||
4464978, 2537516, 3592148, 1661693, 4849980, 5303092, 8284641, 5674394, | |||
8100412, 4369920, 19422, 6623180, 3277672, 1399561, 3859737, 2118186, | |||
2108549, 5760665, 1119584, 549488, 4794489, 1079900, 7356305, 5654953, | |||
5700314, 5268920, 2884855, 5260684, 2091905, 359251, 6026966, 6554070, | |||
7913949, 876248, 777960, 8143293, 518909, 2608894, 8354570 | |||
static const int32_t zetas[N] = { | |||
0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, | |||
1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103, | |||
2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549, | |||
-2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005, | |||
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, | |||
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, | |||
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, | |||
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, | |||
-3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221, | |||
-1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922, | |||
3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047, | |||
-671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430, | |||
-3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618, | |||
-3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, | |||
189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330, | |||
1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961, | |||
2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462, | |||
266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378, | |||
900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500, | |||
-655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838, | |||
342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, | |||
2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974, | |||
-3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970, | |||
-1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642, | |||
-1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031, | |||
-542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993, | |||
-2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385, | |||
-3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, | |||
-3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078, | |||
-426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893, | |||
-2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687, | |||
-554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782 | |||
}; | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_ntt | |||
* | |||
* Description: Forward NTT, in-place. No modular reduction is performed after | |||
* additions or subtractions. Hence output coefficients can be up | |||
* to 16*Q larger than the coefficients of the input polynomial. | |||
* Output vector is in bitreversed order. | |||
* additions or subtractions. Output vector is in bitreversed order. | |||
* | |||
* Arguments: - int32_t a[N]: input/output coefficient array | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) { | |||
size_t k, j; | |||
uint32_t zeta, t; | |||
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]) { | |||
unsigned int len, start, j, k; | |||
int32_t zeta, t; | |||
k = 1; | |||
for (size_t len = 128; len > 0; len >>= 1) { | |||
for (size_t start = 0; start < N; start = j + len) { | |||
zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas[k++]; | |||
k = 0; | |||
for (len = 128; len > 0; len >>= 1) { | |||
for (start = 0; start < N; start = j + len) { | |||
zeta = zetas[++k]; | |||
for (j = start; j < start + len; ++j) { | |||
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); | |||
p[j + len] = p[j] + 2 * Q - t; | |||
p[j] = p[j] + t; | |||
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); | |||
a[j + len] = a[j] - t; | |||
a[j] = a[j] + t; | |||
} | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by Montgomery factor 2^32. | |||
* In-place. No modular reductions after additions or | |||
* subtractions. Input coefficient need to be smaller than 2*Q. | |||
* Output coefficient are smaller than 2*Q. | |||
* subtractions; input coefficients need to be smaller than | |||
* Q in absolute value. Output coefficients are smaller than Q in | |||
* absolute value. | |||
* | |||
* Arguments: - int32_t a[N]: input/output coefficient array | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]) { | |||
size_t start, len, j, k; | |||
uint32_t t, zeta; | |||
const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q; | |||
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]) { | |||
unsigned int start, len, j, k; | |||
int32_t t, zeta; | |||
const int32_t f = 41978; // mont^2/256 | |||
k = 0; | |||
k = 256; | |||
for (len = 1; len < N; len <<= 1) { | |||
for (start = 0; start < N; start = j + len) { | |||
zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[k++]; | |||
zeta = -zetas[--k]; | |||
for (j = start; j < start + len; ++j) { | |||
t = p[j]; | |||
p[j] = t + p[j + len]; | |||
p[j + len] = t + 256 * Q - p[j + len]; | |||
p[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); | |||
t = a[j]; | |||
a[j] = t + a[j + len]; | |||
a[j + len] = t - a[j + len]; | |||
a[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); | |||
} | |||
} | |||
} | |||
for (j = 0; j < N; ++j) { | |||
p[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) f * p[j]); | |||
a[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)f * a[j]); | |||
} | |||
} |
@@ -1,11 +1,10 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_NTT_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_NTT_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
#include "params.h" | |||
void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]); | |||
#endif |
@@ -3,6 +3,7 @@ | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_pk | |||
* | |||
@@ -12,17 +13,18 @@ | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const polyveck *t1: pointer to vector t1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( | |||
uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
pk[i] = rho[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); | |||
} | |||
} | |||
@@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( | |||
* - polyveck *t1: pointer to output vector t1 | |||
* - const uint8_t pk[]: byte array containing bit-packed pk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk( | |||
uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = pk[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sk | |||
* | |||
* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). | |||
* Description: Bit-pack secret key sk; the byte layout written is (rho, key, tr, s1, s2, t0). | |||
* | |||
* Arguments: - uint8_t sk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const uint8_t key[]: byte array containing key | |||
* - const uint8_t tr[]: byte array containing tr | |||
* - const uint8_t key[]: byte array containing key | |||
* - const polyveck *t0: pointer to vector t0 | |||
* - const polyvecl *s1: pointer to vector s1 | |||
* - const polyveck *s2: pointer to vector s2 | |||
* - const polyveck *t0: pointer to vector t0 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk( | |||
uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const polyvecl *s1, | |||
const polyveck *s2, | |||
const polyveck *t0) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = rho[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = key[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < CRHBYTES; ++i) { | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
sk[i] = tr[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); | |||
} | |||
sk += L * POLETA_SIZE_PACKED; | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); | |||
} | |||
sk += K * POLETA_SIZE_PACKED; | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sk | |||
* | |||
* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). | |||
* Description: Unpack secret key sk; the byte layout read is (rho, key, tr, s1, s2, t0). | |||
* | |||
* Arguments: - uint8_t rho[]: output byte array for rho | |||
* - uint8_t key[]: output byte array for key | |||
* - uint8_t tr[]: output byte array for tr | |||
* - uint8_t key[]: output byte array for key | |||
* - polyveck *t0: pointer to output vector t0 | |||
* - polyvecl *s1: pointer to output vector s1 | |||
* - polyveck *s2: pointer to output vector s2 | |||
* - polyveck *t0: pointer to output vector t0 | |||
* - const uint8_t sk[]: byte array containing bit-packed sk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk( | |||
uint8_t rho[SEEDBYTES], | |||
uint8_t key[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
polyvecl *s1, | |||
polyveck *s2, | |||
polyveck *t0, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) { | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < SEEDBYTES; ++i) { | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
key[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (size_t i = 0; i < CRHBYTES; ++i) { | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
tr[i] = sk[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += L * POLETA_SIZE_PACKED; | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += K * POLETA_SIZE_PACKED; | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sig | |||
* | |||
* Description: Bit-pack signature sig = (z, h, c). | |||
* Description: Bit-pack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t sig[]: output byte array | |||
* - const uint8_t *c: pointer to challenge hash of length SEEDBYTES | |||
* - const polyvecl *z: pointer to vector z | |||
* - const polyveck *h: pointer to hint vector h | |||
* - const poly *c: pointer to challenge polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig( | |||
uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const polyvecl *z, | |||
const polyveck *h, | |||
const poly *c) { | |||
size_t k; | |||
uint64_t signs, mask; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES], | |||
const uint8_t c[SEEDBYTES], | |||
const polyvecl *z, | |||
const polyveck *h) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sig[i] = c[i]; | |||
} | |||
sig += L * POLZ_SIZE_PACKED; | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Encode h */ | |||
for (i = 0; i < OMEGA + K; ++i) { | |||
sig[i] = 0; | |||
} | |||
k = 0; | |||
for (size_t i = 0; i < K; ++i) { | |||
for (size_t j = 0; j < N; ++j) { | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
if (h->vec[i].coeffs[j] != 0) { | |||
sig[k++] = (uint8_t)j; | |||
sig[k++] = (uint8_t) j; | |||
} | |||
} | |||
sig[OMEGA + i] = (uint8_t)k; | |||
} | |||
while (k < OMEGA) { | |||
sig[k++] = 0; | |||
} | |||
sig += OMEGA + K; | |||
/* Encode c */ | |||
signs = 0; | |||
mask = 1; | |||
for (size_t i = 0; i < N / 8; ++i) { | |||
sig[i] = 0; | |||
for (size_t j = 0; j < 8; ++j) { | |||
if (c->coeffs[8 * i + j] != 0) { | |||
sig[i] |= (uint8_t)(1u << j); | |||
if (c->coeffs[8 * i + j] == (Q - 1)) { | |||
signs |= mask; | |||
} | |||
mask <<= 1; | |||
} | |||
} | |||
} | |||
sig += N / 8; | |||
for (size_t i = 0; i < 8; ++i) { | |||
sig[i] = (uint8_t)(signs >> 8u * i); | |||
sig[OMEGA + i] = (uint8_t) k; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sig | |||
* | |||
* Description: Unpack signature sig = (z, h, c). | |||
* Description: Unpack signature sig = (c, z, h). | |||
* | |||
* Arguments: - polyvecl *z: pointer to output vector z | |||
* Arguments: - uint8_t *c: pointer to output challenge hash of length SEEDBYTES | |||
* - polyvecl *z: pointer to output vector z | |||
* - polyveck *h: pointer to output hint vector h | |||
* - poly *c: pointer to output challenge polynomial | |||
* - const uint8_t sig[]: byte array containing | |||
* bit-packed signature | |||
* | |||
* Returns 1 in case of malformed signature; otherwise 0. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( | |||
polyvecl *z, | |||
polyveck *h, | |||
poly *c, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) { | |||
size_t k; | |||
uint64_t signs; | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); | |||
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], | |||
polyvecl *z, | |||
polyveck *h, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
c[i] = sig[i]; | |||
} | |||
sig += L * POLZ_SIZE_PACKED; | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Decode h */ | |||
k = 0; | |||
for (size_t i = 0; i < K; ++i) { | |||
for (size_t j = 0; j < N; ++j) { | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
h->vec[i].coeffs[j] = 0; | |||
} | |||
@@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( | |||
return 1; | |||
} | |||
for (size_t j = k; j < sig[OMEGA + i]; ++j) { | |||
for (j = k; j < sig[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > k && sig[j] <= sig[j - 1]) { | |||
return 1; | |||
@@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (size_t j = k; j < OMEGA; ++j) { | |||
for (j = k; j < OMEGA; ++j) { | |||
if (sig[j]) { | |||
return 1; | |||
} | |||
} | |||
sig += OMEGA + K; | |||
/* Decode c */ | |||
for (size_t i = 0; i < N; ++i) { | |||
c->coeffs[i] = 0; | |||
} | |||
signs = 0; | |||
for (size_t i = 0; i < 8; ++i) { | |||
signs |= (uint64_t)sig[N / 8 + i] << 8 * i; | |||
} | |||
/* Extra sign bits are zero for strong unforgeability */ | |||
if (signs >> 60) { | |||
return 1; | |||
} | |||
for (size_t i = 0; i < N / 8; ++i) { | |||
for (size_t j = 0; j < 8; ++j) { | |||
if ((sig[i] >> j) & 0x01) { | |||
c->coeffs[8 * i + j] = 1; | |||
c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); | |||
signs >>= 1; | |||
} | |||
} | |||
} | |||
return 0; | |||
} |
@@ -1,42 +1,31 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PACKING_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_PACKING_H | |||
#include "api.h" | |||
#include "params.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2); | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( | |||
uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1); | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk( | |||
uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const uint8_t tr[SEEDBYTES], | |||
const polyvecl *s1, | |||
const polyveck *s2, | |||
const polyveck *t0); | |||
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig( | |||
uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], | |||
const polyvecl *z, const polyveck *h, const poly *c); | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk( | |||
uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk( | |||
uint8_t rho[SEEDBYTES], | |||
uint8_t key[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
polyvecl *s1, | |||
polyveck *s2, | |||
polyveck *t0, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( | |||
polyvecl *z, | |||
polyveck *h, | |||
poly *c, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]); | |||
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]); | |||
#endif |
@@ -2,28 +2,40 @@ | |||
#define PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H | |||
#define SEEDBYTES 32 | |||
#define CRHBYTES 48 | |||
#define N 256 | |||
#define Q 8380417 | |||
#define QBITS 23 | |||
#define D 14 | |||
#define GAMMA1 ((Q - 1)/16) | |||
#define GAMMA2 (GAMMA1/2) | |||
#define ALPHA (2*GAMMA2) | |||
#define D 13 | |||
#define ROOT_OF_UNITY 1753 | |||
#define K 4 | |||
#define L 3 | |||
#define ETA 6 | |||
#define SETABITS 4 | |||
#define BETA 325 | |||
#define L 4 | |||
#define ETA 2 | |||
#define TAU 39 | |||
#define BETA 78 | |||
#define GAMMA1 (1 << 17) | |||
#define GAMMA2 ((Q-1)/88) | |||
#define OMEGA 80 | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2" | |||
#define POLYT1_PACKEDBYTES 320 | |||
#define POLYT0_PACKEDBYTES 416 | |||
#define POLYVECH_PACKEDBYTES (OMEGA + K) | |||
#define POLYZ_PACKEDBYTES 576 | |||
#define POLYW1_PACKEDBYTES 192 | |||
#define POLYETA_PACKEDBYTES 96 | |||
#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) | |||
#define POLT0_SIZE_PACKED ((N*D)/8) | |||
#define POLETA_SIZE_PACKED ((N*SETABITS)/8) | |||
#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) | |||
#define POLW1_SIZE_PACKED ((N*4)/8) | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ | |||
+ L*POLYETA_PACKEDBYTES \ | |||
+ K*POLYETA_PACKEDBYTES \ | |||
+ K*POLYT0_PACKEDBYTES) | |||
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) | |||
#endif |
@@ -1,53 +1,40 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLY_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_POLY_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include <stdint.h> | |||
typedef struct { | |||
uint32_t coeffs[N]; | |||
int32_t coeffs[N]; | |||
} poly; | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_add( | |||
poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub( | |||
poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery( | |||
poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round( | |||
poly *a1, poly *a0, const poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose( | |||
poly *a1, poly *a0, const poly *a); | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint( | |||
poly *h, const poly *a0, const poly *a1); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint( | |||
poly *a, const poly *b, const poly *h); | |||
int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm( | |||
const poly *a, uint32_t B); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform( | |||
poly *a, | |||
const uint8_t *seed, | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta( | |||
poly *a, | |||
const uint8_t *seed, | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1( | |||
poly *a, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a); | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h); | |||
int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); | |||
@@ -1,14 +1,65 @@ | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|j|i) | |||
* or AES256CTR(rho,j|i). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { | |||
unsigned int i, j; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < L; ++j) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); | |||
} | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); | |||
} | |||
} | |||
/**************************************************************/ | |||
/************ Vectors of polynomials of length L **************/ | |||
/**************************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t) (L * nonce + i)); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze | |||
* | |||
@@ -18,7 +69,9 @@ | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) { | |||
for (size_t i = 0; i < L; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
@@ -33,9 +86,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) { | |||
* - const polyvecl *u: pointer to first summand | |||
* - const polyvecl *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add( | |||
polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
for (size_t i = 0; i < L; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -49,32 +103,49 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add( | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v) { | |||
for (size_t i = 0; i < L; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont
*
* Description: Apply PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont in place to
* each polynomial of a vector of length L.
*
* Arguments: - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-L vector by the single
* polynomial a, using
* PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery for each
* entry: r->vec[i] is computed from a and v->vec[i].
*
* Arguments: - polyvecl *r: pointer to output vector
* - const poly *a: pointer to the common polynomial factor
* - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply vectors of polynomials of length L, multiply | |||
* resulting vector by 2^{-32} and add (accumulate) polynomials | |||
* in it. Input/output vectors are in NTT domain representation. | |||
* Input coefficients are assumed to be less than 22*Q. Output | |||
* coefficients are less than 2*L*Q. | |||
* | |||
* Arguments: - poly *w: output polynomial | |||
* - const polyvecl *u: pointer to first input vector | |||
* - const polyvecl *v: pointer to second input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery( | |||
poly *w, const polyvecl *u, const polyvecl *v) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v) { | |||
unsigned int i; | |||
poly t; | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(w, &u->vec[0], &v->vec[0]); | |||
for (size_t i = 1; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&t, &u->vec[i], &v->vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); | |||
for (i = 1; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_add(w, w, &t); | |||
} | |||
} | |||
@@ -83,17 +154,19 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery( | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length L. | |||
* Assumes input coefficients to be standard representatives. | |||
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(). | |||
* | |||
* Arguments: - const polyvecl *v: pointer to vector | |||
* - uint32_t B: norm bound | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials is strictly smaller than B and 1 | |||
* otherwise. | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { | |||
for (size_t i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) { | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
@@ -105,32 +178,43 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { | |||
/************ Vectors of polynomials of length K **************/ | |||
/**************************************************************/ | |||
/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta
*
* Description: Fill every polynomial of a length-K vector by calling
* PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta with the same seed and
* consecutive nonces nonce, nonce+1, ..., nonce+K-1.
*
* Arguments: - polyveck *v: pointer to output vector
* - const uint8_t seed[]: byte array containing the seed
* - uint16_t nonce: nonce used for the first polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
/* nonce is a by-value uint16_t: the post-increment only affects this call and wraps modulo 2^16. */
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to representatives in [0,2*Q[. | |||
* to representatives in [-6283009,6283007]. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq | |||
* | |||
* Description: For all coefficients of polynomials in vector of length K | |||
* subtract Q if coefficient is bigger than Q. | |||
* add Q if coefficient is negative. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(&v->vec[i]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(&v->vec[i]); | |||
} | |||
} | |||
@@ -143,7 +227,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) { | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
@@ -158,9 +244,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) { | |||
* - const polyveck *u: pointer to first summand | |||
* - const polyveck *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add( | |||
polyveck *w, const polyveck *u, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -169,17 +256,17 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add( | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub | |||
* | |||
* Description: Subtract vectors of polynomials of length K. | |||
* Assumes coefficients of polynomials in second input vector | |||
* to be less than 2*Q. No modular reduction is performed. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first input vector | |||
* - const polyveck *v: pointer to second input vector to be | |||
* subtracted from first input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub( | |||
polyveck *w, const polyveck *u, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -188,12 +275,14 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub( | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl | |||
* | |||
* Description: Multiply vector of polynomials of Length K by 2^D without modular | |||
* reduction. Assumes input coefficients to be less than 2^{32-D}. | |||
* reduction. Assumes input coefficients to be less than 2^{31-D}. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(&v->vec[i]); | |||
} | |||
} | |||
@@ -207,13 +296,15 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) { | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by 2^{32} of polynomials | |||
* in vector of length K. Input coefficients need to be less | |||
@@ -221,27 +312,40 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) { | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&v->vec[i]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-K vector by the single
* polynomial a, using
* PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery for each
* entry: r->vec[i] is computed from a and v->vec[i].
*
* Arguments: - polyveck *r: pointer to output vector
* - const poly *a: pointer to the common polynomial factor
* - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length K. | |||
* Assumes input coefficients to be standard representatives. | |||
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(). | |||
* | |||
* Arguments: - const polyveck *v: pointer to vector | |||
* - uint32_t B: norm bound | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials are strictly smaller than B and 1 | |||
* otherwise. | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { | |||
for (size_t i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) { | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
@@ -253,19 +357,20 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute a0, a1 such that a mod Q = a1*2^D + a0 | |||
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 | |||
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients Q + a0 | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( | |||
polyveck *v1, polyveck *v0, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -274,7 +379,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 | |||
* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 | |||
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we | |||
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
@@ -282,12 +387,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients Q + a0 | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose( | |||
polyveck *v1, polyveck *v0, const polyveck *v) { | |||
for (size_t i = 0; i < K; ++i) { | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
@@ -303,15 +409,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose( | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint( | |||
polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1) { | |||
uint32_t s = 0; | |||
for (size_t i = 0; i < K; ++i) { | |||
s += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint( | |||
&h->vec[i], &v0->vec[i], &v1->vec[i]); | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1) { | |||
unsigned int i, s = 0; | |||
for (i = 0; i < K; ++i) { | |||
s += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); | |||
} | |||
return s; | |||
@@ -324,13 +428,21 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint( | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector of polynomials with | |||
* corrected high bits | |||
* - const polyveck *v: pointer to input vector | |||
* - const polyveck *u: pointer to input vector | |||
* - const polyveck *h: pointer to input hint vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint( | |||
polyveck *w, const polyveck *v, const polyveck *h) { | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint( | |||
&w->vec[i], &v->vec[i], &h->vec[i]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1
*
* Description: Pack each polynomial of a length-K vector with
* PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack. Polynomial i is written
* at byte offset i * POLYW1_PACKEDBYTES of the output buffer.
*
* Arguments: - uint8_t r[]: output byte array of K * POLYW1_PACKEDBYTES bytes
* - const polyveck *w1: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); | |||
} | |||
} |
@@ -1,25 +1,33 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
/* Vectors of polynomials of length L */ | |||
typedef struct { | |||
poly vec[L]; | |||
} polyvecl; | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery( | |||
poly *w, const polyvecl *u, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v); | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B); | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B); | |||
@@ -28,31 +36,33 @@ typedef struct { | |||
poly vec[K]; | |||
} polyveck; | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add( | |||
polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub( | |||
polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v); | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm( | |||
const polyveck *v, uint32_t B); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( | |||
polyveck *v1, polyveck *v0, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose( | |||
polyveck *v1, polyveck *v0, const polyveck *v); | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint( | |||
polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint( | |||
polyveck *w, const polyveck *v, const polyveck *h); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); | |||
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h, | |||
const polyveck *v0, | |||
const polyveck *v1); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); | |||
#endif |
@@ -1,60 +1,54 @@ | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce | |||
* | |||
* Description: For finite field element a with 0 <= a <= Q*2^32, | |||
* compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q. | |||
* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, | |||
* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. | |||
* | |||
* Arguments: - uint64_t: finite field element a | |||
* Arguments: - int64_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a) { | |||
uint64_t t; | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a) { | |||
int32_t t; | |||
t = a * QINV; | |||
t &= (1ULL << 32) - 1; | |||
t *= Q; | |||
t = a + t; | |||
t >>= 32; | |||
return (uint32_t)t; | |||
t = (int32_t)((uint64_t)a * (uint64_t)QINV); | |||
t = (a - (int64_t)t * Q) >> 32; | |||
return t; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_reduce32 | |||
* | |||
* Description: For finite field element a, compute r \equiv a (mod Q) | |||
* such that 0 <= r < 2*Q. | |||
* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, | |||
* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. | |||
* | |||
* Arguments: - uint32_t: finite field element a | |||
* Arguments: - int32_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a) { | |||
uint32_t t; | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a) { | |||
int32_t t; | |||
t = a & 0x7FFFFF; | |||
a >>= 23; | |||
t += (a << 13) - a; | |||
t = (a + (1 << 22)) >> 23; | |||
t = a - t * Q; | |||
return t; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_csubq | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_caddq | |||
* | |||
* Description: Subtract Q if input coefficient is bigger than Q. | |||
* Description: Add Q if input coefficient is negative. | |||
* | |||
* Arguments: - uint32_t: finite field element a | |||
* Arguments: - int32_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) { | |||
a -= Q; | |||
a += ((int32_t)a >> 31) & Q; | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a) { | |||
a += (a >> 31) & Q; | |||
return a; | |||
} | |||
@@ -62,14 +56,14 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) { | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_freeze | |||
* | |||
* Description: For finite field element a, compute standard | |||
* representative r = a mod Q. | |||
* representative r = a mod^+ Q. | |||
* | |||
* Arguments: - uint32_t: finite field element a | |||
* Arguments: - int32_t: finite field element a | |||
* | |||
* Returns r. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(uint32_t a) { | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a) { | |||
a = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a); | |||
a = PQCLEAN_DILITHIUM2_CLEAN_csubq(a); | |||
a = PQCLEAN_DILITHIUM2_CLEAN_caddq(a); | |||
return a; | |||
} |
@@ -1,21 +1,17 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
#define MONT 4193792U // 2^32 % Q | |||
#define QINV 4236238847U // -q^(-1) mod 2^32 | |||
#define MONT (-4186625) // 2^32 % Q | |||
#define QINV 58728449 // q^(-1) mod 2^32 | |||
/* -2^31*Q <= a <= 2^31*Q => -Q < r < Q */ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a); | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a); | |||
/* a <= 2^31 - 2^22 - 1 => -6283009 <= r <= 6283007 */ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a); | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a); | |||
/* a < 0 => r = a + Q; otherwise r = a */ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a); | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a); | |||
/* r < Q */ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(uint32_t a); | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a); | |||
#endif |
@@ -1,86 +1,70 @@ | |||
#include "params.h" | |||
#include "rounding.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_power2round | |||
* | |||
* Description: For finite field element a, compute a0, a1 such that | |||
* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be standard representative. | |||
* | |||
* Arguments: - uint32_t a: input element | |||
* - uint32_t *a0: pointer to output element Q + a0 | |||
* Arguments: - int32_t a: input element | |||
* - int32_t *a0: pointer to output element a0 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0) { | |||
uint32_t t; | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a) { | |||
int32_t a1; | |||
/* Centralized remainder mod 2^D */ | |||
t = a & ((1U << D) - 1); | |||
t -= (1U << (D - 1)) + 1; | |||
t += ((uint32_t)((int32_t)t >> 31) & (1 << D)); | |||
t -= (1U << (D - 1)) - 1; | |||
*a0 = Q + t; | |||
a = (a - t) >> D; | |||
return a; | |||
a1 = (a + (1 << (D - 1)) - 1) >> D; | |||
*a0 = a - (a1 << D); | |||
return a1; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_decompose | |||
* | |||
* Description: For finite field element a, compute high and low bits a0, a1 such | |||
* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and | |||
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard | |||
* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard | |||
* representative. | |||
* | |||
* Arguments: - uint32_t a: input element | |||
* - uint32_t *a0: pointer to output element Q + a0 | |||
* Arguments: - int32_t a: input element | |||
* - int32_t *a0: pointer to output element a0 | |||
* | |||
* Returns a1. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0) { | |||
int32_t t, u; | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a) { | |||
int32_t a1; | |||
/* Centralized remainder mod ALPHA */ | |||
t = a & 0x7FFFFu; | |||
t += (int32_t)((a >> 19u) << 9u); | |||
t -= ALPHA / 2 + 1; | |||
t += (t >> 31) & ALPHA; | |||
t -= ALPHA / 2 - 1; | |||
a -= (uint32_t)t; | |||
a1 = (a + 127) >> 7; | |||
a1 = (a1 * 11275 + (1 << 23)) >> 24; | |||
a1 ^= ((43 - a1) >> 31) & a1; | |||
/* Divide by ALPHA (possible to avoid) */ | |||
u = (int32_t)(a - 1); | |||
u >>= 31; | |||
a = (a >> 19) + 1; | |||
a -= u & 1; | |||
/* Border case */ | |||
*a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u)); | |||
a &= 0xFu; | |||
return a; | |||
*a0 = a - a1 * 2 * GAMMA2; | |||
*a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; | |||
return a1; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_make_hint | |||
* | |||
* Description: Compute hint bit indicating whether the low bits of the | |||
* input element overflow into the high bits. Inputs assumed to be | |||
* standard representatives. | |||
* input element overflow into the high bits. | |||
* | |||
* Arguments: - uint32_t a0: low bits of input element | |||
* - uint32_t a1: high bits of input element | |||
* Arguments: - int32_t a0: low bits of input element | |||
* - int32_t a1: high bits of input element | |||
* | |||
* Returns 1 if high bits of a and b differ and 0 otherwise. | |||
* Returns 1 if overflow. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) { | |||
if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { | |||
return 0; | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1) { | |||
if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) { | |||
return 1; | |||
} | |||
return 1; | |||
return 0; | |||
} | |||
/************************************************* | |||
@@ -88,30 +72,27 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) { | |||
* | |||
* Description: Correct high bits according to hint. | |||
* | |||
* Arguments: - uint32_t a: input element | |||
* Arguments: - int32_t a: input element | |||
* - unsigned int hint: hint bit | |||
* | |||
* Returns corrected high bits. | |||
**************************************************/ | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint) { | |||
uint32_t a0, a1; | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint) { | |||
int32_t a0, a1; | |||
a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(a, &a0); | |||
a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(&a0, a); | |||
if (hint == 0) { | |||
return a1; | |||
} | |||
if (a0 > Q) { | |||
return (a1 + 1) & 0xF; | |||
} | |||
return (a1 - 1) & 0xF; | |||
/* If PQCLEAN_DILITHIUM2_CLEAN_decompose does not divide out ALPHA: | |||
if(hint == 0) | |||
return a1; | |||
else if(a0 > Q) | |||
return (a1 + ALPHA) % (Q - 1); | |||
else | |||
return (a1 - ALPHA) % (Q - 1); | |||
*/ | |||
if (a0 > 0) { | |||
if (a1 == 43) { | |||
return 0; | |||
} | |||
return a1 + 1; | |||
} | |||
if (a1 == 0) { | |||
return 43; | |||
} | |||
return a1 - 1; | |||
} |
@@ -1,11 +1,14 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0); | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0); | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1); | |||
uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint); | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a); | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a); | |||
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1); | |||
int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint); | |||
#endif |
@@ -1,6 +1,3 @@ | |||
#include <stdint.h> | |||
#include <string.h> | |||
#include "fips202.h" | |||
#include "packing.h" | |||
#include "params.h" | |||
@@ -9,84 +6,7 @@ | |||
#include "randombytes.h" | |||
#include "sign.h" | |||
#include "symmetric.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_expand_mat | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|i|j). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { | |||
for (size_t i = 0; i < K; ++i) { | |||
for (size_t j = 0; j < L; ++j) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t)((i << 8) + j)); | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_challenge | |||
* | |||
* Description: Implementation of H. Samples polynomial with 60 nonzero | |||
* coefficients in {-1,1} using the output stream of | |||
* SHAKE256(mu|w1). | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const uint8_t mu[]: byte array containing mu | |||
* - const polyveck *w1: pointer to vector w1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, | |||
const uint8_t mu[CRHBYTES], | |||
const polyveck *w1) { | |||
uint64_t signs; | |||
uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; | |||
uint8_t outbuf[SHAKE256_RATE]; | |||
shake256ctx state; | |||
uint8_t b; | |||
size_t pos; | |||
for (size_t i = 0; i < CRHBYTES; ++i) { | |||
inbuf[i] = mu[i]; | |||
} | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); | |||
} | |||
shake256_absorb(&state, inbuf, sizeof(inbuf)); | |||
shake256_squeezeblocks(outbuf, 1, &state); | |||
signs = 0; | |||
for (size_t i = 0; i < 8; ++i) { | |||
signs |= (uint64_t)outbuf[i] << 8 * i; | |||
} | |||
pos = 8; | |||
for (size_t i = 0; i < N; ++i) { | |||
c->coeffs[i] = 0; | |||
} | |||
for (size_t i = 196; i < 256; ++i) { | |||
do { | |||
if (pos >= SHAKE256_RATE) { | |||
shake256_squeezeblocks(outbuf, 1, &state); | |||
pos = 0; | |||
} | |||
b = outbuf[pos++]; | |||
} while (b > i); | |||
c->coeffs[i] = c->coeffs[b]; | |||
c->coeffs[b] = 1; | |||
c->coeffs[b] ^= -((int32_t)signs & 1) & (1 ^ (Q - 1)); | |||
signs >>= 1; | |||
} | |||
shake256_ctx_release(&state); | |||
} | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair | |||
@@ -94,9 +14,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, | |||
* Description: Generates public and private key. | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (allocated | |||
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) | |||
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (allocated | |||
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes) | |||
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
@@ -104,48 +24,42 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
uint8_t seedbuf[3 * SEEDBYTES]; | |||
uint8_t tr[CRHBYTES]; | |||
const uint8_t *rho, *rhoprime, *key; | |||
uint16_t nonce = 0; | |||
polyvecl mat[K]; | |||
polyvecl s1, s1hat; | |||
polyveck s2, t, t1, t0; | |||
polyveck s2, t1, t0; | |||
/* Expand 32 bytes of randomness into rho, rhoprime and key */ | |||
randombytes(seedbuf, 3 * SEEDBYTES); | |||
/* Get randomness for rho, rhoprime and key */ | |||
randombytes(seedbuf, SEEDBYTES); | |||
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); | |||
rho = seedbuf; | |||
rhoprime = seedbuf + SEEDBYTES; | |||
key = seedbuf + 2 * SEEDBYTES; | |||
/* Expand matrix */ | |||
PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); | |||
/* Sample short vectors s1 and s2 */ | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&s1.vec[i], rhoprime, nonce++); | |||
} | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&s2.vec[i], rhoprime, nonce++); | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L); | |||
/* Matrix-vector multiplication */ | |||
s1hat = s1; | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1hat); | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&t.vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&t.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&t1); | |||
/* Add error vector s2 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t, &t, &s2); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t1, &t1, &s2); | |||
/* Extract t1 and write public key */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(&t); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_pk(pk, rho, &t1); | |||
/* Compute CRH(rho, t1) and write secret key */ | |||
crh(tr, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2); | |||
return 0; | |||
} | |||
@@ -153,44 +67,41 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature | |||
* | |||
* Description: Compute signed message. | |||
* Description: Computes signature. | |||
* | |||
* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES | |||
* of len) | |||
* - size_t *smlen: pointer to output length of signed message | |||
* (should be PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) | |||
* - uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - uint8_t *sk: pointer to bit-packed secret key | |||
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) | |||
* - size_t *siglen: pointer to output length of signature | |||
* - uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *msg, size_t mlen, | |||
const uint8_t *sk) { | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, | |||
size_t *siglen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *sk) { | |||
unsigned int n; | |||
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; | |||
uint8_t *rho, *tr, *key, *mu, *rhoprime; | |||
uint32_t n; | |||
uint16_t nonce = 0; | |||
poly c, chat; | |||
polyvecl mat[K], s1, y, yhat, z; | |||
polyveck t0, s2, w, w1, w0; | |||
polyveck h, cs2, ct0; | |||
polyvecl mat[K], s1, y, z; | |||
polyveck t0, s2, w1, w0, h; | |||
poly cp; | |||
shake256incctx state; | |||
rho = seedbuf; | |||
tr = rho + SEEDBYTES; | |||
key = tr + CRHBYTES; | |||
mu = key + SEEDBYTES; | |||
rhoprime = mu + CRHBYTES; | |||
PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); | |||
PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); | |||
// use incremental hash API instead of copying around buffers | |||
/* Compute CRH(tr, msg) */ | |||
shake256incctx state; | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, tr, CRHBYTES); | |||
shake256_inc_absorb(&state, msg, mlen); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
@@ -198,76 +109,71 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( | |||
crh(rhoprime, key, SEEDBYTES + CRHBYTES); | |||
/* Expand matrix and transform vectors */ | |||
PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&s2); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t0); | |||
rej: | |||
/* Sample intermediate vector y */ | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(&y.vec[i], rhoprime, nonce++); | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); | |||
/* Matrix-vector multiplication */ | |||
yhat = y; | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&yhat); | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&w.vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&w.vec[i]); | |||
} | |||
z = y; | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1); | |||
/* Decompose w and call the random oracle */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&w); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w); | |||
PQCLEAN_DILITHIUM2_CLEAN_challenge(&c, mu, &w1); | |||
chat = c; | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&chat); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(sig, &w1); | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&cs2.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &cs2); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(&w0); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(sig, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, sig); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp); | |||
/* Compute z, reject if it reveals secret */ | |||
for (size_t i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&z.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(&z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(&z, &z, &y); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(&z); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(&z); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
goto rej; | |||
} | |||
/* Compute hints for w1 */ | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&ct0.vec[i]); | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w0); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&ct0); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&ct0, GAMMA2)) { | |||
/* Compute hints for w1 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&h); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&h, GAMMA2)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &ct0); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&w0); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &h); | |||
n = PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(&h, &w0, &w1); | |||
if (n > OMEGA) { | |||
goto rej; | |||
} | |||
/* Write signature */ | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, &z, &h, &c); | |||
PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, sig, &z, &h); | |||
*siglen = PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; | |||
return 0; | |||
} | |||
@@ -281,53 +187,63 @@ rej: | |||
* array with PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen bytes), | |||
* can be equal to m | |||
* - size_t *smlen: pointer to output length of signed | |||
* message | |||
* message | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk) { | |||
int rc; | |||
memmove(sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, m, mlen); | |||
rc = PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, | |||
size_t *smlen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *sk) { | |||
size_t i; | |||
for (i = 0; i < mlen; ++i) { | |||
sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, mlen, sk); | |||
*smlen += mlen; | |||
return rc; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify | |||
* | |||
* Description: Verify signed message. | |||
* Description: Verifies signature. | |||
* | |||
* Arguments: - uint8_t *sig: signature | |||
* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) | |||
* - uint8_t *m: pointer to message | |||
* - size_t *mlen: pointer to output length of message | |||
* - uint8_t *pk: pointer to bit-packed public key | |||
* Arguments: - uint8_t *sig: pointer to input signature | |||
* - size_t siglen: length of signature | |||
* - const uint8_t *m: pointer to message | |||
* - size_t mlen: length of message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
* Returns 0 if signature could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, | |||
size_t siglen, | |||
const uint8_t *m, | |||
size_t mlen, | |||
const uint8_t *pk) { | |||
unsigned int i; | |||
uint8_t buf[K * POLYW1_PACKEDBYTES]; | |||
uint8_t rho[SEEDBYTES]; | |||
uint8_t mu[CRHBYTES]; | |||
poly c, chat, cp; | |||
uint8_t c[SEEDBYTES]; | |||
uint8_t c2[SEEDBYTES]; | |||
poly cp; | |||
polyvecl mat[K], z; | |||
polyveck t1, w1, h, tmp1, tmp2; | |||
polyveck t1, w1, h; | |||
shake256incctx state; | |||
if (siglen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { | |||
if (siglen != PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(rho, &t1, pk); | |||
if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(&z, &h, &c, sig)) { | |||
if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(c, &z, &h, sig)) { | |||
return -1; | |||
} | |||
if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { | |||
@@ -336,8 +252,6 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( | |||
/* Compute CRH(CRH(rho, t1), msg) */ | |||
crh(mu, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); | |||
shake256incctx state; | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
@@ -346,38 +260,39 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( | |||
shake256_inc_ctx_release(&state); | |||
/* Matrix-vector multiplication; compute Az - c * 2^d * t1 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, c); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z); | |||
for (size_t i = 0; i < K ; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); | |||
chat = c; | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&chat); | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(&t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t1); | |||
for (size_t i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); | |||
} | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&tmp1, &tmp1, &tmp2); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&tmp1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(&tmp1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w1, &w1, &t1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1); | |||
/* Reconstruct w1 */ | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&tmp1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &tmp1, &h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(buf, &w1); | |||
/* Call random oracle and verify challenge */ | |||
PQCLEAN_DILITHIUM2_CLEAN_challenge(&cp, mu, &w1); | |||
for (size_t i = 0; i < N; ++i) { | |||
if (c.coeffs[i] != cp.coeffs[i]) { | |||
/* Call random oracle and verify challenge */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(c2, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
if (c[i] != c2[i]) { | |||
return -1; | |||
} | |||
} | |||
// All good | |||
return 0; | |||
} | |||
@@ -387,7 +302,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( | |||
* Description: Verify signed message. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output message (allocated | |||
* array with smlen bytes), can be equal to sm | |||
* array with smlen bytes), can be equal to sm | |||
* - size_t *mlen: pointer to output length of message | |||
* - const uint8_t *sm: pointer to signed message | |||
* - size_t smlen: length of signed message | |||
@@ -395,33 +310,34 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, | |||
const uint8_t *pk) { | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, | |||
size_t *mlen, | |||
const uint8_t *sm, | |||
size_t smlen, | |||
const uint8_t *pk) { | |||
size_t i; | |||
if (smlen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { | |||
goto badsig; | |||
} | |||
*mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, | |||
sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) { | |||
*mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) { | |||
goto badsig; | |||
} else { | |||
/* All good, copy msg, return 0 */ | |||
for (size_t i = 0; i < *mlen; ++i) { | |||
for (i = 0; i < *mlen; ++i) { | |||
m[i] = sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + i]; | |||
} | |||
return 0; | |||
} | |||
/* Signature verification failed */ | |||
badsig: | |||
/* Signature verification failed */ | |||
*mlen = (size_t) -1; | |||
for (size_t i = 0; i < smlen; ++i) { | |||
for (i = 0; i < smlen; ++i) { | |||
m[i] = 0; | |||
} | |||
return -1; | |||
} | |||
@@ -1,12 +1,29 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SIGN_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_SIGN_H | |||
#include "api.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *sk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, | |||
const uint8_t *pk); | |||
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, | |||
const uint8_t *pk); | |||
void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES], | |||
const polyveck *w1); | |||
#endif |
@@ -1,26 +0,0 @@ | |||
#include "stream.h" | |||
#include <string.h> | |||
void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init( | |||
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { | |||
uint8_t buf[SEEDBYTES + 2]; | |||
memcpy(buf, seed, SEEDBYTES); | |||
buf[SEEDBYTES] = (uint8_t)nonce; | |||
buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); | |||
shake128_absorb(state, buf, SEEDBYTES + 2); | |||
} | |||
void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init( | |||
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { | |||
uint8_t buf[CRHBYTES + 2]; | |||
memcpy(buf, seed, CRHBYTES); | |||
buf[CRHBYTES] = (uint8_t)nonce; | |||
buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); | |||
shake256_absorb(state, buf, CRHBYTES + 2); | |||
} |
@@ -1,15 +0,0 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_STREAM_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_STREAM_H | |||
#include <stdint.h> | |||
#include "fips202.h" | |||
#include "params.h" | |||
void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init( | |||
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init( | |||
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); | |||
#endif |
@@ -0,0 +1,26 @@ | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/* Initialises an incremental SHAKE128 state, absorbs the seed and then the
 * 16-bit nonce (low byte first), and finalizes the state. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = {
        (uint8_t)(nonce & 0xFF),
        (uint8_t)(nonce >> 8)
    };

    shake128_inc_init(state);
    shake128_inc_absorb(state, seed, SEEDBYTES);
    shake128_inc_absorb(state, nonce_le, sizeof(nonce_le));
    shake128_inc_finalize(state);
}
/* Initialises an incremental SHAKE256 state, absorbs the seed and then the
 * 16-bit nonce (low byte first), and finalizes the state. */
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    const uint8_t nonce_le[2] = {
        (uint8_t)(nonce & 0xFF),
        (uint8_t)(nonce >> 8)
    };

    shake256_inc_init(state);
    shake256_inc_absorb(state, seed, CRHBYTES);
    shake256_inc_absorb(state, nonce_le, sizeof(nonce_le));
    shake256_inc_finalize(state);
}
@@ -1,25 +1,36 @@ | |||
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H | |||
#define PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "stream.h" | |||
#include <stdint.h> | |||
#include "fips202.h" | |||
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) | |||
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(STATE, SEED, NONCE) | |||
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) | |||
#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(STATE, SEED, NONCE) | |||
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) | |||
typedef shake128incctx stream128_state; | |||
typedef shake256incctx stream256_state; | |||
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, | |||
const uint8_t seed[SEEDBYTES], | |||
uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, | |||
const uint8_t seed[CRHBYTES], | |||
uint16_t nonce); | |||
#define STREAM128_BLOCKBYTES SHAKE128_RATE | |||
#define STREAM256_BLOCKBYTES SHAKE256_RATE | |||
typedef shake128ctx stream128_state; | |||
typedef shake256ctx stream256_state; | |||
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) | |||
#define stream128_init(STATE, SEED, NONCE) \ | |||
PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(STATE, SEED, NONCE) | |||
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) | |||
#define stream128_release(STATE) shake128_inc_ctx_release(STATE) | |||
#define stream256_init(STATE, SEED, NONCE) \ | |||
PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(STATE, SEED, NONCE) | |||
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) | |||
#define stream256_release(STATE) shake256_inc_ctx_release(STATE) | |||
#endif |
@@ -0,0 +1,31 @@ | |||
name: Dilithium2-AES | |||
type: signature | |||
claimed-nist-level: 2 | |||
length-public-key: 1312 | |||
length-secret-key: 2544 | |||
length-signature: 2420 | |||
nistkat-sha256: 23972a0a5f1f32781aa11fa57d9994ddd53c1bbcc732967f61d9d9aaef01c492 | |||
testvectors-sha256: 22e68fe8bf781dee949a4297f9ba44d1c350a1d88bae03117cfb2ca494c6e604 | |||
principal-submitters: | |||
- Vadim Lyubashevsky | |||
auxiliary-submitters: | |||
- Léo Ducas | |||
- Eike Kiltz | |||
- Tancrède Lepoint | |||
- Peter Schwabe | |||
- Gregor Seiler | |||
- Damien Stehlé | |||
implementations: | |||
- name: clean | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
- name: avx2 | |||
version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- aes | |||
- avx2 | |||
- popcnt |
@@ -0,0 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -0,0 +1,23 @@ | |||
# This Makefile can be used with GNU Make or BSD Make

LIB=libdilithium2aes_avx2.a
HEADERS=aes256ctr.h align.h api.h cdecl.h consts.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc
OBJECTS=aes256ctr.o consts.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o invntt.o ntt.o pointwise.o shuffle.o

# EXTRAFLAGS lets the caller append flags without replacing the defaults.
CFLAGS=-mavx2 -maes -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \
       -Wmissing-prototypes -Wredundant-decls -std=c99 \
       -I../../../common $(EXTRAFLAGS)

# all and clean are commands, not files; keep them working even if a file
# with one of these names appears in the directory.
.PHONY: all clean

all: $(LIB)

# Every object is rebuilt when any header changes.
%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.S $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
	$(AR) -r $@ $(OBJECTS)

clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)
@@ -0,0 +1,142 @@ | |||
#include "aes256ctr.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* Based heavily on public-domain code by Romain Dolbeau | |||
* Different handling of nonce+counter than original version using | |||
* separated 64-bit nonce and internal 64-bit counter, starting from zero | |||
* Public Domain */ | |||
static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { | |||
__m128i f, f0, f1, f2, f3; | |||
const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); | |||
/* Load current counter value */ | |||
f = _mm_load_si128(n); | |||
/* Increase counter in 4 consecutive blocks */ | |||
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); | |||
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); | |||
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); | |||
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); | |||
/* Write counter for next iteration, increased by 4 */ | |||
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); | |||
/* Actual AES encryption, 4x interleaved */ | |||
f = _mm_load_si128(&rkeys[0]); | |||
f0 = _mm_xor_si128(f0, f); | |||
f1 = _mm_xor_si128(f1, f); | |||
f2 = _mm_xor_si128(f2, f); | |||
f3 = _mm_xor_si128(f3, f); | |||
for (int i = 1; i < 14; i++) { | |||
f = _mm_load_si128(&rkeys[i]); | |||
f0 = _mm_aesenc_si128(f0, f); | |||
f1 = _mm_aesenc_si128(f1, f); | |||
f2 = _mm_aesenc_si128(f2, f); | |||
f3 = _mm_aesenc_si128(f3, f); | |||
} | |||
f = _mm_load_si128(&rkeys[14]); | |||
f0 = _mm_aesenclast_si128(f0, f); | |||
f1 = _mm_aesenclast_si128(f1, f); | |||
f2 = _mm_aesenclast_si128(f2, f); | |||
f3 = _mm_aesenclast_si128(f3, f); | |||
/* Write results */ | |||
_mm_storeu_si128((__m128i *)(out + 0), f0); | |||
_mm_storeu_si128((__m128i *)(out + 16), f1); | |||
_mm_storeu_si128((__m128i *)(out + 32), f2); | |||
_mm_storeu_si128((__m128i *)(out + 48), f3); | |||
} | |||
/* Initialises an AES-256-CTR state: expands the 32-byte key into the round
 * keys state->rkeys[0..14] and loads the nonce into the low 64 bits of
 * state->n (the upper 64 bits are zero).
 *
 * Conversions between __m128i and __m128 go through _mm_castsi128_ps /
 * _mm_castps_si128 rather than C-style vector casts, which are a compiler
 * extension. The cast intrinsics only reinterpret the bits. */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) {
    __m128i key0, key1, temp0, temp1, temp2, temp4;
    int idx = 0;

    /* key may be unaligned; keep the const qualifier on the source pointers */
    key0 = _mm_loadu_si128((const __m128i *)(key + 0));
    key1 = _mm_loadu_si128((const __m128i *)(key + 16));

    state->n = _mm_loadl_epi64((const __m128i *)&nonce);

    state->rkeys[idx++] = key0;
    temp0 = key0;
    temp2 = key1;
    temp4 = _mm_setzero_si128();

    /* BLOCK1 stores temp2 as the next round key and updates temp0 from it */
#define BLOCK1(IMM)                                                                                    \
    temp1 = _mm_aeskeygenassist_si128(temp2, IMM);                                                     \
    state->rkeys[idx++] = temp2;                                                                       \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp0), 0x10));  \
    temp0 = _mm_xor_si128(temp0, temp4);                                                               \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp0), 0x8c));  \
    temp0 = _mm_xor_si128(temp0, temp4);                                                               \
    temp1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp1), _mm_castsi128_ps(temp1), 0xff));  \
    temp0 = _mm_xor_si128(temp0, temp1)

    /* BLOCK2 stores temp0 as the next round key and updates temp2 from it */
#define BLOCK2(IMM)                                                                                    \
    temp1 = _mm_aeskeygenassist_si128(temp0, IMM);                                                     \
    state->rkeys[idx++] = temp0;                                                                       \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp2), 0x10));  \
    temp2 = _mm_xor_si128(temp2, temp4);                                                               \
    temp4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp4), _mm_castsi128_ps(temp2), 0x8c));  \
    temp2 = _mm_xor_si128(temp2, temp4);                                                               \
    temp1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(temp1), _mm_castsi128_ps(temp1), 0xaa));  \
    temp2 = _mm_xor_si128(temp2, temp1)

    /* 13 alternating steps store rkeys[1..13] */
    BLOCK1(0x01);
    BLOCK2(0x01);

    BLOCK1(0x02);
    BLOCK2(0x02);

    BLOCK1(0x04);
    BLOCK2(0x04);

    BLOCK1(0x08);
    BLOCK2(0x08);

    BLOCK1(0x10);
    BLOCK2(0x10);

    BLOCK1(0x20);
    BLOCK2(0x20);

    BLOCK1(0x40);
    /* Last round key, rkeys[14] */
    state->rkeys[idx++] = temp0;
}
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, | |||
size_t nblocks, | |||
aes256ctr_ctx *state) { | |||
size_t i; | |||
for (i = 0; i < nblocks; i++) { | |||
aesni_encrypt4(out, &state->n, state->rkeys); | |||
out += 64; | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t seed[32], | |||
uint64_t nonce) { | |||
unsigned int i; | |||
uint8_t buf[64]; | |||
aes256ctr_ctx state; | |||
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&state, seed, nonce); | |||
while (outlen >= 64) { | |||
aesni_encrypt4(out, &state.n, state.rkeys); | |||
outlen -= 64; | |||
out += 64; | |||
} | |||
if (outlen) { | |||
aesni_encrypt4(buf, &state.n, state.rkeys); | |||
for (i = 0; i < outlen; i++) { | |||
out[i] = buf[i]; | |||
} | |||
} | |||
} |
@@ -0,0 +1,29 @@ | |||
/* AES-256 in counter mode built on AES-NI intrinsics (Dilithium2-AES, AVX2). */
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_AES256CTR_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_AES256CTR_H | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* Bytes produced per squeezed block: four 16-byte AES blocks. */
#define AES256CTR_BLOCKBYTES 64 | |||
/* CTR-mode state: expanded key schedule plus the running counter block. */
typedef struct { | |||
/* Round keys; the init and encrypt routines use indices 0..14. */
__m128i rkeys[16]; | |||
/* Counter block: nonce in the low 64 bits, block counter in the high 64 bits. */
__m128i n; | |||
} aes256ctr_ctx; | |||
/* Expand the 32-byte key into state->rkeys and load nonce into state->n. */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, | |||
const uint8_t key[32], | |||
uint64_t nonce); | |||
/* Write nblocks * 64 bytes of keystream to out and advance the counter. */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, | |||
size_t nblocks, | |||
aes256ctr_ctx *state); | |||
/* One-shot variant: initialise from seed/nonce and write outlen bytes to out. */
void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t seed[32], | |||
uint64_t nonce); | |||
#endif |
@@ -0,0 +1,19 @@ | |||
/*
 * Helper macros that declare anonymous union types overlaying a scalar
 * array with an array of __m256i, so the scalar view inherits the
 * alignment of __m256i and can be accessed with aligned vector loads.
 */
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_ALIGN_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_ALIGN_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/* N bytes, also viewable as ceil(N/32) 256-bit vectors. */
#define ALIGNED_UINT8(N) \ | |||
union { \ | |||
uint8_t coeffs[N]; \ | |||
__m256i vec[((N)+31)/32]; \ | |||
} | |||
/* N 32-bit integers, also viewable as ceil(N/8) 256-bit vectors. */
#define ALIGNED_INT32(N) \ | |||
union { \ | |||
int32_t coeffs[N]; \ | |||
__m256i vec[((N)+7)/8]; \ | |||
} | |||
#endif |
@@ -0,0 +1,31 @@ | |||
/* Public signature API of the Dilithium2-AES AVX2 implementation. */
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_API_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_API_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* Sizes in bytes of the packed public key, secret key and signature. */
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES 1312 | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES 2544 | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES 2420 | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_ALGNAME "Dilithium2-AES" | |||
/* Generate a key pair into pk and sk. */
/* NOTE(review): return-value conventions are not visible in this header - confirm against sign.c. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); | |||
/* Detached signature of m (mlen bytes) under sk; signature goes to sig, its length to *siglen. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature( | |||
uint8_t *sig, size_t *siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* Verify detached signature sig (siglen bytes) on m (mlen bytes) under pk. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify( | |||
const uint8_t *sig, size_t siglen, | |||
const uint8_t *m, size_t mlen, const uint8_t *pk); | |||
/* Produce a signed message sm (length in *smlen) from m under sk. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign( | |||
uint8_t *sm, size_t *smlen, | |||
const uint8_t *m, size_t mlen, const uint8_t *sk); | |||
/* Check signed message sm under pk and recover the message into m / *mlen. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open( | |||
uint8_t *m, size_t *mlen, | |||
const uint8_t *sm, size_t smlen, const uint8_t *pk); | |||
#endif |
@@ -0,0 +1,24 @@ | |||
/* Definitions shared between the C sources and the preprocessed assembly files. */
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_CDECL_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_CDECL_H | |||
/*
 * Offsets of the sections of the qdata constant table, counted in 32-bit
 * words; the assembly multiplies them by 4 to obtain byte offsets. They
 * must stay in sync with the table layout in consts.c.
 */
#define _8XQ 0 | |||
#define _8XQINV 8 | |||
#define _8XDIV_QINV 16 | |||
#define _8XDIV 24 | |||
#define _ZETAS_QINV 32 | |||
#define _ZETAS 328 | |||
/* The C ABI on MacOS exports all symbols with a leading | |||
* underscore. This means that any symbols we refer to from | |||
* C files (functions) can't be found, and all symbols we | |||
* refer to from ASM also can't be found (nttconsts.c). | |||
* | |||
* This define helps us get around this | |||
*/ | |||
/* _cdecl(s) yields the underscore-prefixed name, cdecl(s) the plain name; */
/* the assembly files emit a label for both. */
#define _cdecl(s) _##s | |||
#define cdecl(s) s | |||
#endif |
@@ -0,0 +1,101 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
/*
 * Constants for Montgomery arithmetic with R = 2^32. Q itself is not
 * defined in this file (presumably it comes from params.h).
 */
#define QINV 58728449 // q^(-1) mod 2^32 | |||
#define MONT (-4186625) // 2^32 mod q | |||
#define DIV 41978 // mont^2/256 | |||
/* DIV * QINV mod 2^32, as a signed 32-bit value */
#define DIV_QINV (-8395782) | |||
const qdata_t PQCLEAN_DILITHIUM2AES_AVX2_qdata = {{ | |||
//#define _8XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, | |||
//#define _8XQINV 8 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
//#define _8XDIV_QINV 16 | |||
DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, | |||
//#define _8XDIV 24 | |||
DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, | |||
//#define _ZETAS_QINV 32 | |||
-151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, | |||
308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, | |||
-1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, | |||
-1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, | |||
-285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, | |||
1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904, | |||
1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, | |||
1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, | |||
329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, | |||
-1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, | |||
-202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, | |||
-1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, | |||
1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, | |||
-1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, | |||
-783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, | |||
1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, | |||
-695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, | |||
-654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, | |||
-247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, | |||
-916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, | |||
1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, | |||
-898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, | |||
2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, | |||
831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, | |||
-2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, | |||
991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722, | |||
908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, | |||
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, | |||
6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, | |||
1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, | |||
-1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, | |||
1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, | |||
702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, | |||
746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, | |||
885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, | |||
1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, | |||
-1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, | |||
//#define _ZETAS 328 | |||
-3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, | |||
1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, | |||
-359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, | |||
3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, | |||
3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, | |||
2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, | |||
-549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, | |||
-2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, | |||
1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, | |||
2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, | |||
-3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, | |||
-1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, | |||
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, | |||
-3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, | |||
3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, | |||
-3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, | |||
189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, | |||
-1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, | |||
-983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, | |||
264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, | |||
-3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, | |||
2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, | |||
342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, | |||
-1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, | |||
-3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, | |||
3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, | |||
286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, | |||
1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, | |||
3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, | |||
2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, | |||
-2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, | |||
-2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, | |||
3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, | |||
3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, | |||
4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, | |||
-1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, | |||
269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, | |||
} | |||
}; |
@@ -0,0 +1,10 @@ | |||
/* Declaration of the constant table consumed by the AVX2 assembly routines. */
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_CONSTS_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_CONSTS_H | |||
#include "align.h" | |||
#include "cdecl.h" | |||
/* 624 = 32 broadcast constants + 296 zeta*qinv entries + 296 zeta entries (see the offsets in cdecl.h). */
typedef ALIGNED_INT32(624) qdata_t; | |||
extern const qdata_t PQCLEAN_DILITHIUM2AES_AVX2_qdata; | |||
#endif |
@@ -0,0 +1,240 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
/*
 * Inverse-NTT butterfly on two vectors of eight 32-bit coefficients.
 *
 *   l,h      register numbers of the two inputs, both overwritten
 *   zl0,zl1  registers holding zeta*qinv for the even / odd lanes
 *   zh0,zh1  registers holding zeta for the even / odd lanes
 *
 * Result: l <- l + h, h <- Montgomery product of (h - l) and zeta.
 * Expects 8 x q in ymm0; clobbers ymm12, ymm13 and ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
/* t = h - l (kept in ymm12), l = l + h */
vpsubd %ymm\l,%ymm\h,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
/* even lanes: t * (zeta*qinv); the odd lanes of t are moved into h first */
vpmuldq %ymm\zl0,%ymm12,%ymm13 | |||
vmovshdup %ymm12,%ymm\h | |||
vpmuldq %ymm\zl1,%ymm\h,%ymm14 | |||
/* full 64-bit products t * zeta */
vpmuldq %ymm\zh0,%ymm12,%ymm12 | |||
vpmuldq %ymm\zh1,%ymm\h,%ymm\h | |||
/* multiply the low 32 bits of t*zeta*qinv by q */
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
/* subtract; the reduced value is the high dword of each 64-bit lane */
vpsubd %ymm13,%ymm12,%ymm12 | |||
vpsubd %ymm14,%ymm\h,%ymm\h | |||
/* merge the high dwords of the even-lane and odd-lane results into h */
vmovshdup %ymm12,%ymm12 | |||
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h | |||
.endm | |||
/*
 * Inverse NTT, levels 0 to 5, on one 256-byte chunk (64 coefficients) of
 * the polynomial at rdi; off (0..3) selects the chunk. rsi points to the
 * qdata table. Zeta vectors are fetched with vpermq $0x1B, which reverses
 * the order of the four quadwords of each table row.
 * NOTE(review): shuffle2/shuffle4/shuffle8 come from shuffle.inc, which is
 * not visible here; they presumably interleave lanes between levels.
 */
.macro levels0t5 off | |||
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
/* level 0 */ | |||
/* ymm3/ymm15 feed the odd lanes, ymm1/ymm2 (their odd dwords moved down) the even lanes */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,5,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 6,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,9,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 10,11,1,3,2,15 | |||
/* level 1 */ | |||
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,6,1,3,2,15 | |||
butterfly 5,7,1,3,2,15 | |||
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 8,10,1,3,2,15 | |||
butterfly 9,11,1,3,2,15 | |||
/* level 2 */ | |||
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 | |||
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 | |||
vmovshdup %ymm3,%ymm1 | |||
vmovshdup %ymm15,%ymm2 | |||
butterfly 4,8,1,3,2,15 | |||
butterfly 5,9,1,3,2,15 | |||
butterfly 6,10,1,3,2,15 | |||
butterfly 7,11,1,3,2,15 | |||
/* level 3 */ | |||
shuffle2 4,5,3,5 | |||
shuffle2 6,7,4,7 | |||
shuffle2 8,9,6,9 | |||
shuffle2 10,11,8,11 | |||
/* from here on the same zeta registers (ymm1/ymm2) serve even and odd lanes */
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 3,5 | |||
butterfly 4,7 | |||
butterfly 6,9 | |||
butterfly 8,11 | |||
/* level 4 */ | |||
shuffle4 3,4,10,4 | |||
shuffle4 6,8,3,8 | |||
shuffle4 5,7,6,7 | |||
shuffle4 9,11,5,11 | |||
vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 | |||
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 | |||
butterfly 10,4 | |||
butterfly 3,8 | |||
butterfly 6,7 | |||
butterfly 5,11 | |||
/* level 5 */ | |||
shuffle8 10,3,9,3 | |||
shuffle8 6,5,10,5 | |||
shuffle8 4,8,6,8 | |||
shuffle8 7,11,4,11 | |||
/* a single zeta, broadcast to all lanes, is used for the whole chunk */
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 | |||
butterfly 9,3 | |||
butterfly 10,5 | |||
butterfly 6,8 | |||
butterfly 4,11 | |||
/* write the chunk back; register order follows the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm10,256*\off+ 32(%rdi) | |||
vmovdqa %ymm6,256*\off+ 64(%rdi) | |||
vmovdqa %ymm4,256*\off+ 96(%rdi) | |||
vmovdqa %ymm3,256*\off+128(%rdi) | |||
vmovdqa %ymm5,256*\off+160(%rdi) | |||
vmovdqa %ymm8,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
/*
 * Inverse NTT, levels 6 and 7. Loads eight vectors spaced 128 bytes apart
 * starting at byte offset 32*off (off = 0..3) of the polynomial at rdi.
 * After level 7 the four upper outputs (ymm8-ymm11) are stored as they are,
 * while the four lower outputs (ymm4-ymm7) are first Montgomery-multiplied
 * by the DIV constant from the qdata table.
 * NOTE(review): the upper half presumably gets its scaling through the
 * level-7 zeta entry - confirm against the table generator.
 */
.macro levels6t7 off | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
/* level 6 */ | |||
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
/* level 7 */ | |||
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* upper half is finished; storing it frees ymm8/ymm9 as scratch below */
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
/* ymm1 = 8 x DIV_QINV, ymm2 = 8 x DIV */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 | |||
vmovdqa (_8XDIV)*4(%rsi),%ymm2 | |||
/* Montgomery-multiply ymm4 and ymm5 by DIV (even and odd lanes handled separately) */
vpmuldq %ymm1,%ymm4,%ymm12 | |||
vpmuldq %ymm1,%ymm5,%ymm13 | |||
vmovshdup %ymm4,%ymm8 | |||
vmovshdup %ymm5,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm4,%ymm4 | |||
vpmuldq %ymm2,%ymm5,%ymm5 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm4,%ymm4 | |||
vpsubd %ymm13,%ymm5,%ymm5 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm4,%ymm4 | |||
vmovshdup %ymm5,%ymm5 | |||
vpblendd $0xAA,%ymm8,%ymm4,%ymm4 | |||
vpblendd $0xAA,%ymm9,%ymm5,%ymm5 | |||
/* same scaling for ymm6 and ymm7 */
vpmuldq %ymm1,%ymm6,%ymm12 | |||
vpmuldq %ymm1,%ymm7,%ymm13 | |||
vmovshdup %ymm6,%ymm8 | |||
vmovshdup %ymm7,%ymm9 | |||
vpmuldq %ymm1,%ymm8,%ymm14 | |||
vpmuldq %ymm1,%ymm9,%ymm15 | |||
vpmuldq %ymm2,%ymm6,%ymm6 | |||
vpmuldq %ymm2,%ymm7,%ymm7 | |||
vpmuldq %ymm2,%ymm8,%ymm8 | |||
vpmuldq %ymm2,%ymm9,%ymm9 | |||
vpmuldq %ymm0,%ymm12,%ymm12 | |||
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
vpmuldq %ymm0,%ymm15,%ymm15 | |||
vpsubd %ymm12,%ymm6,%ymm6 | |||
vpsubd %ymm13,%ymm7,%ymm7 | |||
vpsubd %ymm14,%ymm8,%ymm8 | |||
vpsubd %ymm15,%ymm9,%ymm9 | |||
vmovshdup %ymm6,%ymm6 | |||
vmovshdup %ymm7,%ymm7 | |||
vpblendd $0xAA,%ymm8,%ymm6,%ymm6 | |||
vpblendd $0xAA,%ymm9,%ymm7,%ymm7 | |||
/* store the scaled lower half */
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
.endm | |||
/*
 * void invntt_avx(__m256i *a, const __m256i *qdata)
 *
 * In-place inverse NTT of the 256-coefficient polynomial at rdi using the
 * constant table at rsi (argument registers assume the System V AMD64
 * calling convention). Both pointers are accessed with vmovdqa and must
 * therefore be 32-byte aligned. The label is emitted with and without a
 * leading underscore (see cdecl.h).
 */
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx): | |||
/* ymm0 = 8 x q, kept for every butterfly */
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
/* levels 0-5, one 64-coefficient chunk per call */
levels0t5 0 | |||
levels0t5 1 | |||
levels0t5 2 | |||
levels0t5 3 | |||
/* levels 6-7 plus final scaling, one 32-byte column per call */
levels6t7 0 | |||
levels6t7 1 | |||
levels6t7 2 | |||
levels6t7 3 | |||
ret |
@@ -0,0 +1,199 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
/*
 * Forward-NTT butterfly on two vectors of eight 32-bit coefficients.
 *
 *   l,h      register numbers of the two inputs, both overwritten
 *   zl0,zl1  registers holding zeta*qinv for the even / odd lanes
 *   zh0,zh1  registers holding zeta for the even / odd lanes
 *
 * With t = Montgomery product of h and zeta: l <- l + t, h <- l - t.
 * Expects 8 x q in ymm0; clobbers ymm12, ymm13 and ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 | |||
/* h * (zeta*qinv) for even lanes (ymm13) and odd lanes (ymm14) */
vpmuldq %ymm\zl0,%ymm\h,%ymm13 | |||
vmovshdup %ymm\h,%ymm12 | |||
vpmuldq %ymm\zl1,%ymm12,%ymm14 | |||
/* full 64-bit products h * zeta */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h | |||
vpmuldq %ymm\zh1,%ymm12,%ymm12 | |||
/* multiply the low 32 bits of h*zeta*qinv by q */
vpmuldq %ymm0,%ymm13,%ymm13 | |||
vpmuldq %ymm0,%ymm14,%ymm14 | |||
/* h = high dwords of h*zeta for all eight lanes */
vmovshdup %ymm\h,%ymm\h | |||
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h | |||
/* ymm12 = l - h, l = l + h */
vpsubd %ymm\h,%ymm\l,%ymm12 | |||
vpaddd %ymm\h,%ymm\l,%ymm\l | |||
/* ymm13 = high dwords of the q-multiples; apply the correction to both outputs */
vmovshdup %ymm13,%ymm13 | |||
vpblendd $0xAA,%ymm14,%ymm13,%ymm13 | |||
vpaddd %ymm13,%ymm12,%ymm\h | |||
vpsubd %ymm13,%ymm\l,%ymm\l | |||
.endm | |||
/*
 * Forward NTT, levels 0 and 1. Works on eight vectors spaced 128 bytes
 * apart, starting at byte offset 32*off (off = 0..3) of the polynomial at
 * rdi; rsi points to the qdata table. Each zeta is broadcast to all lanes.
 */
.macro levels0t1 off | |||
/* level 0 */ | |||
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 | |||
vmovdqa 0+32*\off(%rdi),%ymm4 | |||
vmovdqa 128+32*\off(%rdi),%ymm5 | |||
vmovdqa 256+32*\off(%rdi),%ymm6 | |||
vmovdqa 384+32*\off(%rdi),%ymm7 | |||
vmovdqa 512+32*\off(%rdi),%ymm8 | |||
vmovdqa 640+32*\off(%rdi),%ymm9 | |||
vmovdqa 768+32*\off(%rdi),%ymm10 | |||
vmovdqa 896+32*\off(%rdi),%ymm11 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
/* level 1 */ | |||
/* one zeta for the lower half (ymm4-ymm7), another for the upper half (ymm8-ymm11) */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 | |||
butterfly 4,6 | |||
butterfly 5,7 | |||
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 | |||
butterfly 8,10 | |||
butterfly 9,11 | |||
vmovdqa %ymm4, 0+32*\off(%rdi) | |||
vmovdqa %ymm5,128+32*\off(%rdi) | |||
vmovdqa %ymm6,256+32*\off(%rdi) | |||
vmovdqa %ymm7,384+32*\off(%rdi) | |||
vmovdqa %ymm8,512+32*\off(%rdi) | |||
vmovdqa %ymm9,640+32*\off(%rdi) | |||
vmovdqa %ymm10,768+32*\off(%rdi) | |||
vmovdqa %ymm11,896+32*\off(%rdi) | |||
.endm | |||
/*
 * Forward NTT, levels 2 to 7, on one 256-byte chunk (64 coefficients) of
 * the polynomial at rdi; off (0..3) selects the chunk. rsi points to the
 * qdata table. From level 5 on, even and odd lanes use different zetas:
 * the odd-lane operands are derived into ymm10 (vpsrlq) and ymm15
 * (vmovshdup) from the loaded table rows.
 * NOTE(review): shuffle8/shuffle4/shuffle2 come from shuffle.inc, which is
 * not visible here; they presumably interleave lanes between levels.
 */
.macro levels2t7 off | |||
/* level 2 */ | |||
vmovdqa 256*\off+ 0(%rdi),%ymm4 | |||
vmovdqa 256*\off+ 32(%rdi),%ymm5 | |||
vmovdqa 256*\off+ 64(%rdi),%ymm6 | |||
vmovdqa 256*\off+ 96(%rdi),%ymm7 | |||
vmovdqa 256*\off+128(%rdi),%ymm8 | |||
vmovdqa 256*\off+160(%rdi),%ymm9 | |||
vmovdqa 256*\off+192(%rdi),%ymm10 | |||
vmovdqa 256*\off+224(%rdi),%ymm11 | |||
/* a single zeta, broadcast to all lanes, is used for the whole chunk */
vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 | |||
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 | |||
butterfly 4,8 | |||
butterfly 5,9 | |||
butterfly 6,10 | |||
butterfly 7,11 | |||
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
/* level 3 */ | |||
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 | |||
butterfly 3,5 | |||
butterfly 8,10 | |||
butterfly 4,6 | |||
butterfly 9,11 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
/* level 4 */ | |||
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 | |||
butterfly 7,8 | |||
butterfly 5,6 | |||
butterfly 3,4 | |||
butterfly 10,11 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
/* level 5 */ | |||
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,5,1,10,2,15 | |||
butterfly 8,4,1,10,2,15 | |||
butterfly 7,3,1,10,2,15 | |||
butterfly 6,11,1,10,2,15 | |||
/* level 6 */ | |||
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,7,1,10,2,15 | |||
butterfly 8,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,3,1,10,2,15 | |||
butterfly 4,11,1,10,2,15 | |||
/* level 7 */ | |||
/* every butterfly of this level has its own table row */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 9,8,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 7,6,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 5,4,1,10,2,15 | |||
vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 | |||
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 | |||
vpsrlq $32,%ymm1,%ymm10 | |||
vmovshdup %ymm2,%ymm15 | |||
butterfly 3,11,1,10,2,15 | |||
/* write the chunk back; register order follows the shuffles above */
vmovdqa %ymm9,256*\off+ 0(%rdi) | |||
vmovdqa %ymm8,256*\off+ 32(%rdi) | |||
vmovdqa %ymm7,256*\off+ 64(%rdi) | |||
vmovdqa %ymm6,256*\off+ 96(%rdi) | |||
vmovdqa %ymm5,256*\off+128(%rdi) | |||
vmovdqa %ymm4,256*\off+160(%rdi) | |||
vmovdqa %ymm3,256*\off+192(%rdi) | |||
vmovdqa %ymm11,256*\off+224(%rdi) | |||
.endm | |||
/*
 * void ntt_avx(__m256i *a, const __m256i *qdata)
 *
 * In-place forward NTT of the 256-coefficient polynomial at rdi using the
 * constant table at rsi (argument registers assume the System V AMD64
 * calling convention). Both pointers are accessed with vmovdqa and must
 * therefore be 32-byte aligned. The label is emitted with and without a
 * leading underscore (see cdecl.h).
 */
.text | |||
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx): | |||
/* ymm0 = 8 x q, kept for every butterfly */
vmovdqa _8XQ*4(%rsi),%ymm0 | |||
/* levels 0-1, one 32-byte column per call */
levels0t1 0 | |||
levels0t1 1 | |||
levels0t1 2 | |||
levels0t1 3 | |||
/* levels 2-7, one 64-coefficient chunk per call */
levels2t7 0 | |||
levels2t7 1 | |||
levels2t7 2 | |||
levels2t7 3 | |||
ret | |||
@@ -0,0 +1,14 @@ | |||
/* C prototypes of the AVX2 assembly routines for NTT-domain arithmetic. */
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_NTT_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_NTT_H | |||
#include <immintrin.h> | |||
/* In-place forward / inverse NTT of the polynomial a; the second argument is the qdata constant table. */
void PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); | |||
/* NOTE(review): implementations of the three routines below are not visible here; */
/* judging by their names they reorder coefficients and multiply (and accumulate) pointwise - confirm. */
void PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx(__m256i *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); | |||
#endif |
@@ -0,0 +1,261 @@ | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_pack_pk | |||
* | |||
* Description: Bit-pack public key pk = (rho, t1). | |||
* | |||
* Arguments: - uint8_t pk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const polyveck *t1: pointer to vector t1 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const polyveck *t1) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
pk[i] = rho[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk | |||
* | |||
* Description: Unpack public key pk = (rho, t1). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const polyveck *t1: pointer to output vector t1 | |||
* - uint8_t pk[]: byte array containing bit-packed pk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], | |||
polyveck *t1, | |||
const uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = pk[i]; | |||
} | |||
pk += SEEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_pack_sk | |||
* | |||
* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - uint8_t sk[]: output byte array | |||
* - const uint8_t rho[]: byte array containing rho | |||
* - const uint8_t tr[]: byte array containing tr | |||
* - const uint8_t key[]: byte array containing key | |||
* - const polyveck *t0: pointer to vector t0 | |||
* - const polyvecl *s1: pointer to vector s1 | |||
* - const polyveck *s2: pointer to vector s2 | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES], | |||
const uint8_t rho[SEEDBYTES], | |||
const uint8_t tr[CRHBYTES], | |||
const uint8_t key[SEEDBYTES], | |||
const polyveck *t0, | |||
const polyvecl *s1, | |||
const polyveck *s2) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = rho[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sk[i] = key[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
sk[i] = tr[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk | |||
* | |||
* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). | |||
* | |||
* Arguments: - const uint8_t rho[]: output byte array for rho | |||
* - const uint8_t tr[]: output byte array for tr | |||
* - const uint8_t key[]: output byte array for key | |||
* - const polyveck *t0: pointer to output vector t0 | |||
* - const polyvecl *s1: pointer to output vector s1 | |||
* - const polyveck *s2: pointer to output vector s2 | |||
* - uint8_t sk[]: byte array containing bit-packed sk | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], | |||
uint8_t tr[CRHBYTES], | |||
uint8_t key[SEEDBYTES], | |||
polyveck *t0, | |||
polyvecl *s1, | |||
polyveck *s2, | |||
const uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
rho[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
key[i] = sk[i]; | |||
} | |||
sk += SEEDBYTES; | |||
for (i = 0; i < CRHBYTES; ++i) { | |||
tr[i] = sk[i]; | |||
} | |||
sk += CRHBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += L * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); | |||
} | |||
sk += K * POLYETA_PACKEDBYTES; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_pack_sig | |||
* | |||
* Description: Bit-pack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t sig[]: output byte array | |||
* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM2AES_AVX2_challenge hash length SEEDBYTES | |||
* - const polyvecl *z: pointer to vector z | |||
* - const polyveck *h: pointer to hint vector h | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES], | |||
const uint8_t c[SEEDBYTES], | |||
const polyvecl *z, | |||
const polyveck *h) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
sig[i] = c[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Encode h */ | |||
for (i = 0; i < OMEGA + K; ++i) { | |||
sig[i] = 0; | |||
} | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
if (h->vec[i].coeffs[j] != 0) { | |||
sig[k++] = (uint8_t) j; | |||
} | |||
} | |||
sig[OMEGA + i] = (uint8_t) k; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig | |||
* | |||
* Description: Unpack signature sig = (c, z, h). | |||
* | |||
* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM2AES_AVX2_challenge hash | |||
* - polyvecl *z: pointer to output vector z | |||
* - polyveck *h: pointer to output hint vector h | |||
* - const uint8_t sig[]: byte array containing | |||
* bit-packed signature | |||
* | |||
* Returns 1 in case of malformed signature; otherwise 0. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig(uint8_t c[SEEDBYTES], | |||
polyvecl *z, | |||
polyveck *h, | |||
const uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES]) { | |||
unsigned int i, j, k; | |||
for (i = 0; i < SEEDBYTES; ++i) { | |||
c[i] = sig[i]; | |||
} | |||
sig += SEEDBYTES; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); | |||
} | |||
sig += L * POLYZ_PACKEDBYTES; | |||
/* Decode h */ | |||
k = 0; | |||
for (i = 0; i < K; ++i) { | |||
for (j = 0; j < N; ++j) { | |||
h->vec[i].coeffs[j] = 0; | |||
} | |||
if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { | |||
return 1; | |||
} | |||
for (j = k; j < sig[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
if (j > k && sig[j] <= sig[j - 1]) { | |||
return 1; | |||
} | |||
h->vec[i].coeffs[sig[j]] = 1; | |||
} | |||
k = sig[OMEGA + i]; | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = k; j < OMEGA; ++j) { | |||
if (sig[j]) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,31 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM2AES_AVX2_PACKING_H
#include "params.h"
#include "polyvec.h"
#include <stdint.h>

/* ---- serialization (structures -> bytes) ---- */

/* Public key from rho and t1. */
void PQCLEAN_DILITHIUM2AES_AVX2_pack_pk(
    uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES],
    const uint8_t rho[SEEDBYTES],
    const polyveck *t1);

/* Secret key from rho, tr, key, t0, s1 and s2. */
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sk(
    uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES],
    const uint8_t rho[SEEDBYTES],
    const uint8_t tr[CRHBYTES],
    const uint8_t key[SEEDBYTES],
    const polyveck *t0,
    const polyvecl *s1,
    const polyveck *s2);

/* Signature from challenge hash c, vector z and hint vector h. */
void PQCLEAN_DILITHIUM2AES_AVX2_pack_sig(
    uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES],
    const uint8_t c[SEEDBYTES],
    const polyvecl *z,
    const polyveck *h);

/* ---- deserialization (bytes -> structures) ---- */

/* Public key into rho and t1. */
void PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk(
    uint8_t rho[SEEDBYTES],
    polyveck *t1,
    const uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES]);

/* Secret key into rho, tr, key, t0, s1 and s2. */
void PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(
    uint8_t rho[SEEDBYTES],
    uint8_t tr[CRHBYTES],
    uint8_t key[SEEDBYTES],
    polyveck *t0,
    polyvecl *s1,
    polyveck *s2,
    const uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES]);

/* Signature into c, z and h; returns 1 if the signature is malformed, else 0. */
int PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig(
    uint8_t c[SEEDBYTES],
    polyvecl *z,
    polyveck *h,
    const uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES]);

#endif
@@ -0,0 +1,41 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM2AES_AVX2_PARAMS_H

/* Byte lengths of the seed-sized values (rho, key, challenge hash) and of tr */
#define SEEDBYTES 32
#define CRHBYTES 48

/* Polynomial shape: N coefficients, uniform samples are drawn from [0, Q-1] */
#define N 256
#define Q 8380417
/* poly_shiftl multiplies by 2^D; power2round splits coefficients at 2^D */
#define D 13
#define ROOT_OF_UNITY 1753

/* Vector lengths: polyveck holds K polynomials, polyvecl holds L */
#define K 4
#define L 4
/* Secret coefficients lie in [-ETA, ETA] */
#define ETA 2
/* Number of nonzero (+-1) coefficients in the challenge polynomial */
#define TAU 39
#define BETA 78
/* poly_uniform_gamma1 samples from [-(GAMMA1 - 1), GAMMA1] */
#define GAMMA1 (1 << 17)
#define GAMMA2 ((Q-1)/88)
/* Number of hint position slots in a packed signature */
#define OMEGA 80

#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_ALGNAME "Dilithium2-AES"

/* Packed sizes in bytes of single polynomials and of the hint vector */
#define POLYT1_PACKEDBYTES  320
#define POLYT0_PACKEDBYTES  416
#define POLYVECH_PACKEDBYTES (OMEGA + K)
#define POLYZ_PACKEDBYTES   576
#define POLYW1_PACKEDBYTES  192
#define POLYETA_PACKEDBYTES  96

/* Public key: rho | t1 */
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
/* Secret key: rho | key | tr | s1 | s2 | t0 */
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
        + L*POLYETA_PACKEDBYTES \
        + K*POLYETA_PACKEDBYTES \
        + K*POLYT0_PACKEDBYTES)
/* Signature: c | z | h */
#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif
@@ -0,0 +1,199 @@ | |||
#include "params.h" | |||
#include "cdecl.h" | |||
.text | |||
#pointwise_avx: coefficient-wise 32x32-bit signed products of two inputs, each | |||
#64-bit product brought back to 32 bits by a Montgomery-style reduction | |||
#(t - ((t*QINV) low 32 bits)*Q, then keep the upper 32 bits). | |||
#Registers as used below: %rdi = output, %rsi and %rdx = the two inputs, | |||
#%rcx = constants table holding the 8XQINV and 8XQ vectors. | |||
#The main loop covers 10 x 24 coefficients, the tail the remaining 16 (256 total). | |||
#All memory accesses are aligned 32-byte vmovdqa loads and stores. | |||
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx) | |||
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx): | |||
#consts | |||
#ymm0 = QINV vector, ymm1 = Q vector, eax = loop counter | |||
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
xor %eax,%eax | |||
_looptop1: | |||
#load | |||
#three vectors (24 coefficients) from each input | |||
vmovdqa (%rsi),%ymm2 | |||
vmovdqa 32(%rsi),%ymm4 | |||
vmovdqa 64(%rsi),%ymm6 | |||
vmovdqa (%rdx),%ymm10 | |||
vmovdqa 32(%rdx),%ymm12 | |||
vmovdqa 64(%rdx),%ymm14 | |||
#vpmuldq only reads the low dword of each qword, so bring the odd-indexed | |||
#coefficients down into the low dwords of a second register | |||
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vmovshdup %ymm6,%ymm7 | |||
vpsrlq $32,%ymm10,%ymm11 | |||
vpsrlq $32,%ymm12,%ymm13 | |||
vmovshdup %ymm14,%ymm15 | |||
#mul | |||
#64-bit products: even coefficients in ymm2/4/6, odd coefficients in ymm3/5/7 | |||
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
vpmuldq %ymm6,%ymm14,%ymm6 | |||
vpmuldq %ymm7,%ymm15,%ymm7 | |||
#reduce | |||
#m = low32(t) * QINV | |||
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm0,%ymm6,%ymm14 | |||
vpmuldq %ymm0,%ymm7,%ymm15 | |||
#m = low32(m) * Q | |||
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpmuldq %ymm1,%ymm14,%ymm14 | |||
vpmuldq %ymm1,%ymm15,%ymm15 | |||
#t = t - m, the result is the upper dword of every qword | |||
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsubq %ymm14,%ymm6,%ymm6 | |||
vpsubq %ymm15,%ymm7,%ymm7 | |||
#move the even results down into the even dwords; the odd results already | |||
#sit in the odd dwords of ymm3/5/7 | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vpsrlq $32,%ymm4,%ymm4 | |||
vmovshdup %ymm6,%ymm6 | |||
#store | |||
#mask 0xAA takes the odd dwords from the odd-result register | |||
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
vpblendd $0xAA,%ymm5,%ymm4,%ymm4 | |||
vpblendd $0xAA,%ymm7,%ymm6,%ymm6 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
#advance all three pointers by 96 bytes and loop 10 times | |||
add $96,%rdi | |||
add $96,%rsi | |||
add $96,%rdx | |||
add $1,%eax | |||
cmp $10,%eax | |||
jb _looptop1 | |||
#tail: the last two vectors (16 coefficients), same steps as the loop body | |||
vmovdqa (%rsi),%ymm2 | |||
vmovdqa 32(%rsi),%ymm4 | |||
vmovdqa (%rdx),%ymm10 | |||
vmovdqa 32(%rdx),%ymm12 | |||
vpsrlq $32,%ymm2,%ymm3 | |||
vpsrlq $32,%ymm4,%ymm5 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuldq %ymm2,%ymm10,%ymm2 | |||
vpmuldq %ymm3,%ymm11,%ymm3 | |||
vpmuldq %ymm4,%ymm12,%ymm4 | |||
vpmuldq %ymm5,%ymm13,%ymm5 | |||
#reduce | |||
vpmuldq %ymm0,%ymm2,%ymm10 | |||
vpmuldq %ymm0,%ymm3,%ymm11 | |||
vpmuldq %ymm0,%ymm4,%ymm12 | |||
vpmuldq %ymm0,%ymm5,%ymm13 | |||
vpmuldq %ymm1,%ymm10,%ymm10 | |||
vpmuldq %ymm1,%ymm11,%ymm11 | |||
vpmuldq %ymm1,%ymm12,%ymm12 | |||
vpmuldq %ymm1,%ymm13,%ymm13 | |||
vpsubq %ymm10,%ymm2,%ymm2 | |||
vpsubq %ymm11,%ymm3,%ymm3 | |||
vpsubq %ymm12,%ymm4,%ymm4 | |||
vpsubq %ymm13,%ymm5,%ymm5 | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
#operands are swapped relative to the loop, so mask 0x55 takes the even dwords | |||
#from the even-result register; the merged vector is the same | |||
vpblendd $0x55,%ymm2,%ymm3,%ymm2 | |||
vpblendd $0x55,%ymm4,%ymm5,%ymm4 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
ret | |||
#macro pointwise: multiply 16 coefficients found at byte offset \off from | |||
#%rsi with the 16 coefficients at the same offset from %rdx. | |||
#Leaves unreduced 64-bit products in ymm6/ymm8 (even coefficients) and | |||
#ymm7/ymm9 (odd coefficients); overwrites ymm10-ymm13. | |||
.macro pointwise off | |||
#load | |||
vmovdqa \off(%rsi),%ymm6 | |||
vmovdqa \off+32(%rsi),%ymm8 | |||
vmovdqa \off(%rdx),%ymm10 | |||
vmovdqa \off+32(%rdx),%ymm12 | |||
#odd-indexed coefficients are moved into the low dword of each qword, | |||
#because vpmuldq only multiplies the low dwords | |||
vpsrlq $32,%ymm6,%ymm7 | |||
vpsrlq $32,%ymm8,%ymm9 | |||
vmovshdup %ymm10,%ymm11 | |||
vmovshdup %ymm12,%ymm13 | |||
#mul | |||
vpmuldq %ymm6,%ymm10,%ymm6 | |||
vpmuldq %ymm7,%ymm11,%ymm7 | |||
vpmuldq %ymm8,%ymm12,%ymm8 | |||
vpmuldq %ymm9,%ymm13,%ymm9 | |||
.endm | |||
#macro acc: add the 64-bit products left in ymm6-ymm9 by the pointwise macro | |||
#onto the running 64-bit sums kept in ymm2-ymm5 | |||
.macro acc | |||
vpaddq %ymm6,%ymm2,%ymm2 | |||
vpaddq %ymm7,%ymm3,%ymm3 | |||
vpaddq %ymm8,%ymm4,%ymm4 | |||
vpaddq %ymm9,%ymm5,%ymm5 | |||
.endm | |||
#pointwise_acc_avx: for every coefficient position, sum four coefficient-wise | |||
#products taken at byte offsets 0, 1024, 2048 and 3072 from %rsi and %rdx | |||
#(1024 bytes = 256 dwords apart), then apply one Montgomery-style reduction | |||
#to the 64-bit sum and store the 32-bit result at %rdi. | |||
#%rcx = constants table holding the 8XQINV and 8XQ vectors. | |||
#16 iterations x 16 coefficients = 256 output coefficients. | |||
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx) | |||
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx): | |||
#consts | |||
#ymm0 = QINV vector, ymm1 = Q vector, eax = loop counter | |||
vmovdqa _8XQINV*4(%rcx),%ymm0 | |||
vmovdqa _8XQ*4(%rcx),%ymm1 | |||
xor %eax,%eax | |||
_looptop2: | |||
pointwise 0 | |||
#mov | |||
#the first product initialises the accumulators ymm2-ymm5 | |||
vmovdqa %ymm6,%ymm2 | |||
vmovdqa %ymm7,%ymm3 | |||
vmovdqa %ymm8,%ymm4 | |||
vmovdqa %ymm9,%ymm5 | |||
#remaining three products are added on top | |||
pointwise 1024 | |||
acc | |||
pointwise 2048 | |||
acc | |||
pointwise 3072 | |||
acc | |||
#reduce | |||
#m = low32(sum) * QINV | |||
vpmuldq %ymm0,%ymm2,%ymm6 | |||
vpmuldq %ymm0,%ymm3,%ymm7 | |||
vpmuldq %ymm0,%ymm4,%ymm8 | |||
vpmuldq %ymm0,%ymm5,%ymm9 | |||
#m = low32(m) * Q | |||
vpmuldq %ymm1,%ymm6,%ymm6 | |||
vpmuldq %ymm1,%ymm7,%ymm7 | |||
vpmuldq %ymm1,%ymm8,%ymm8 | |||
vpmuldq %ymm1,%ymm9,%ymm9 | |||
#sum - m, the result is the upper dword of every qword | |||
vpsubq %ymm6,%ymm2,%ymm2 | |||
vpsubq %ymm7,%ymm3,%ymm3 | |||
vpsubq %ymm8,%ymm4,%ymm4 | |||
vpsubq %ymm9,%ymm5,%ymm5 | |||
#even results are moved down into the even dwords | |||
vpsrlq $32,%ymm2,%ymm2 | |||
vmovshdup %ymm4,%ymm4 | |||
#store | |||
#mask 0xAA takes the odd dwords from the odd-result registers ymm3/ymm5 | |||
vpblendd $0xAA,%ymm3,%ymm2,%ymm2 | |||
vpblendd $0xAA,%ymm5,%ymm4,%ymm4 | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
#advance all three pointers by 64 bytes and loop 16 times | |||
add $64,%rsi | |||
add $64,%rdx | |||
add $64,%rdi | |||
add $1,%eax | |||
cmp $16,%eax | |||
jb _looptop2 | |||
ret | |||
@@ -0,0 +1,891 @@ | |||
#include "align.h" | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "rejsample.h" | |||
#include "rounding.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* Benchmark hooks; both expand to nothing in this build. */
#define DBENCH_START()
#define DBENCH_STOP(t)

/*
 * Select between a and b per 32-bit lane, implemented by reinterpreting the
 * integer vectors as floats and using _mm256_blendv_ps: a lane comes from b
 * when the top (sign) bit of the matching mask lane is set, otherwise from a.
 */
#define _mm256_blendv_epi32(a,b,mask) \
    _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
                                         _mm256_castsi256_ps(b), \
                                         _mm256_castsi256_ps(mask)))
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce | |||
* | |||
* Description: Inplace reduction of all coefficients of polynomial to | |||
* representative in [-6283009,6283007]. Assumes input | |||
* coefficients to be at most 2^31 - 2^22 - 1 in absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(poly *a) { | |||
unsigned int i; | |||
__m256i f, g; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]); | |||
const __m256i off = _mm256_set1_epi32(1 << 22); | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a->vec[i]); | |||
g = _mm256_add_epi32(f, off); | |||
g = _mm256_srai_epi32(g, 23); | |||
g = _mm256_mullo_epi32(g, q); | |||
f = _mm256_sub_epi32(f, g); | |||
_mm256_store_si256(&a->vec[i], f); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: poly_addq | |||
* | |||
* Description: For all coefficients of in/out polynomial add Q if | |||
* coefficient is negative. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(poly *a) { | |||
unsigned int i; | |||
__m256i f, g; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]); | |||
const __m256i zero = _mm256_setzero_si256(); | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a->vec[i]); | |||
g = _mm256_blendv_epi32(zero, q, f); | |||
f = _mm256_add_epi32(f, g); | |||
_mm256_store_si256(&a->vec[i], f); | |||
} | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze | |||
* | |||
* Description: Inplace reduction of all coefficients of polynomial to | |||
* positive standard representatives. Assumes input | |||
* coefficients to be at most 2^31 - 2^22 + 1 in | |||
* absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(a); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(a); | |||
DBENCH_STOP(*tred); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_add | |||
* | |||
* Description: Add polynomials. No modular reduction is performed. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first summand | |||
* - const poly *b: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_add(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
__m256i f, g; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a->vec[i]); | |||
g = _mm256_load_si256(&b->vec[i]); | |||
f = _mm256_add_epi32(f, g); | |||
_mm256_store_si256(&c->vec[i], f); | |||
} | |||
DBENCH_STOP(*tadd); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_sub | |||
* | |||
* Description: Subtract polynomials. No modular reduction is | |||
* performed. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial to be | |||
* subtraced from first input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { | |||
unsigned int i; | |||
__m256i f, g; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a->vec[i]); | |||
g = _mm256_load_si256(&b->vec[i]); | |||
f = _mm256_sub_epi32(f, g); | |||
_mm256_store_si256(&c->vec[i], f); | |||
} | |||
DBENCH_STOP(*tadd); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl | |||
* | |||
* Description: Multiply polynomial by 2^D without modular reduction. Assumes | |||
* input coefficients to be less than 2^{31-D} in absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(poly *a) { | |||
unsigned int i; | |||
__m256i f; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a->vec[i]); | |||
f = _mm256_slli_epi32(f, D); | |||
_mm256_store_si256(&a->vec[i], f); | |||
} | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt | |||
* | |||
* Description: Inplace forward NTT. Coefficients can grow by up to | |||
* 8*Q in absolute value. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont | |||
* | |||
* Description: Inplace inverse NTT and multiplication by 2^{32}. | |||
* Input coefficients need to be less than Q in absolute | |||
* value and output coefficients are again bounded by Q. | |||
* | |||
* Arguments: - poly *a: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); | |||
DBENCH_STOP(*tmul); | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack
*
* Description: In-place wrapper that hands the coefficient vectors of a to
*              the nttunpack_avx routine.
*              NOTE(review): presumably this permutes coefficients into the
*              order expected by the AVX2 NTT-domain code; confirm against
*              nttunpack_avx.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(poly *a) {
    DBENCH_START();

    PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx(a->vec);

    DBENCH_STOP(*tmul);
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery | |||
* | |||
* Description: Pointwise multiplication of polynomials in NTT domain | |||
* representation and multiplication of resulting polynomial | |||
* by 2^{-32}. | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); | |||
DBENCH_STOP(*tmul); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round | |||
* | |||
* Description: For all coefficients c of the input polynomial, | |||
* compute c0, c1 such that c mod^+ Q = c1*2^D + c0 | |||
* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be | |||
* positive standard representatives. | |||
* | |||
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 | |||
* - poly *a0: pointer to output polynomial with coefficients c0 | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(a1->vec, a0->vec, a->vec); | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose | |||
* | |||
* Description: For all coefficients c of the input polynomial, | |||
* compute high and low bits c0, c1 such c mod^+ Q = c1*ALPHA + c0 | |||
* with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we | |||
* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. | |||
* Assumes coefficients to be positive standard representatives. | |||
* | |||
* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 | |||
* - poly *a0: pointer to output polynomial with coefficients c0 | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(a1->vec, a0->vec, a->vec); | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint | |||
* | |||
* Description: Compute hint array. The coefficients of which are the | |||
* indices of the coefficients of the input polynomial | |||
* whose low bits overflow into the high bits. | |||
* | |||
* Arguments: - uint8_t *h: pointer to output hint array (preallocated of length N) | |||
* - const poly *a0: pointer to low part of input polynomial | |||
* - const poly *a1: pointer to high part of input polynomial | |||
* | |||
* Returns number of hints, i.e. length of hint array. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) { | |||
unsigned int r; | |||
DBENCH_START(); | |||
r = PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(hint, a0->vec, a1->vec); | |||
DBENCH_STOP(*tround); | |||
return r; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint | |||
* | |||
* Description: Use hint polynomial to correct the high bits of a polynomial. | |||
* | |||
* Arguments: - poly *b: pointer to output polynomial with corrected high bits | |||
* - const poly *a: pointer to input polynomial | |||
* - const poly *h: pointer to input hint polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) { | |||
DBENCH_START(); | |||
PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(b->vec, a->vec, h->vec); | |||
DBENCH_STOP(*tround); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm | |||
* | |||
* Description: Check infinity norm of polynomial against given bound. | |||
* Assumes input polynomial to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(). | |||
* | |||
* Arguments: - const poly *a: pointer to polynomial | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(const poly *a, int32_t B) { | |||
unsigned int i; | |||
int r; | |||
__m256i f, t; | |||
const __m256i bound = _mm256_set1_epi32(B - 1); | |||
DBENCH_START(); | |||
if (B > (Q - 1) / 8) { | |||
return 1; | |||
} | |||
t = _mm256_setzero_si256(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a->vec[i]); | |||
f = _mm256_abs_epi32(f); | |||
f = _mm256_cmpgt_epi32(f, bound); | |||
t = _mm256_or_si256(t, f); | |||
} | |||
r = 1 - _mm256_testz_si256(t, t); | |||
DBENCH_STOP(*tsample); | |||
return r; | |||
} | |||
/************************************************* | |||
* Name: rej_uniform | |||
* | |||
* Description: Sample uniformly random coefficients in [0, Q-1] by | |||
* performing rejection sampling on array of random bytes. | |||
* | |||
* Arguments: - int32_t *a: pointer to output array (allocated) | |||
* - unsigned int len: number of coefficients to be sampled | |||
* - const uint8_t *buf: array of random bytes | |||
* - unsigned int buflen: length of array of random bytes | |||
* | |||
* Returns number of sampled coefficients. Can be smaller than len if not enough | |||
* random bytes were given. | |||
**************************************************/ | |||
static unsigned int rej_uniform(int32_t *a, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint32_t t; | |||
DBENCH_START(); | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 3 <= buflen) { | |||
t = buf[pos++]; | |||
t |= (uint32_t)buf[pos++] << 8; | |||
t |= (uint32_t)buf[pos++] << 16; | |||
t &= 0x7FFFFF; | |||
if (t < Q) { | |||
a[ctr++] = t; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return ctr; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [0,Q-1] by performing rejection sampling on the | |||
* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES | |||
* - uint16_t nonce: 2-byte nonce | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) { | |||
unsigned int ctr; | |||
/* PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx reads up to 8 additional bytes */ | |||
ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf; | |||
stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state); | |||
ctr = PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs); | |||
while (ctr < N) { | |||
/* length of buf is always divisible by 3; hence, no bytes left */ | |||
stream128_squeezeblocks(buf.coeffs, 1, state); | |||
ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); | |||
} | |||
} | |||
/*
 * Sample a polynomial with uniformly random coefficients in [0,Q-1] from the
 * stream derived from (seed, nonce): initialise a stream128 state, run
 * poly_uniform_preinit on it and release the state again.
 *
 *   a:     output polynomial
 *   seed:  byte array with seed of length SEEDBYTES
 *   nonce: 2-byte nonce
 */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    stream128_state st;

    stream128_init(&st, seed, nonce);
    PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(a, &st);
    stream128_release(&st);
}
/************************************************* | |||
* Name: rej_eta | |||
* | |||
* Description: Sample uniformly random coefficients in [-ETA, ETA] by | |||
* performing rejection sampling on array of random bytes. | |||
* | |||
* Arguments: - int32_t *a: pointer to output array (allocated) | |||
* - unsigned int len: number of coefficients to be sampled | |||
* - const uint8_t *buf: array of random bytes | |||
* - unsigned int buflen: length of array of random bytes | |||
* | |||
* Returns number of sampled coefficients. Can be smaller than len if not enough | |||
* random bytes were given. | |||
**************************************************/ | |||
static unsigned int rej_eta(int32_t *a, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint32_t t0, t1; | |||
DBENCH_START(); | |||
ctr = pos = 0; | |||
while (ctr < len && pos < buflen) { | |||
t0 = buf[pos] & 0x0F; | |||
t1 = buf[pos++] >> 4; | |||
if (t0 < 15) { | |||
t0 = t0 - (205 * t0 >> 10) * 5; | |||
a[ctr++] = 2 - t0; | |||
} | |||
if (t1 < 15 && ctr < len) { | |||
t1 = t1 - (205 * t1 >> 10) * 5; | |||
a[ctr++] = 2 - t1; | |||
} | |||
} | |||
DBENCH_STOP(*tsample); | |||
return ctr; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [-ETA,ETA] by performing rejection sampling using the | |||
* output stream of SHAKE256(seed|nonce) | |||
* or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length SEEDBYTES | |||
* - uint16_t nonce: 2-byte nonce | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) { | |||
unsigned int ctr; | |||
ALIGNED_UINT8(REJ_UNIFORM_BUFLEN * STREAM128_BLOCKBYTES) buf; | |||
stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state); | |||
ctr = PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(a->coeffs, buf.coeffs); | |||
while (ctr < N) { | |||
stream128_squeezeblocks(buf.coeffs, 1, state); | |||
ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); | |||
} | |||
} | |||
/*
 * Sample a polynomial with uniformly random coefficients in [-ETA,ETA] from
 * the stream derived from (seed, nonce): initialise a stream128 state, run
 * poly_uniform_eta_preinit on it and release the state again.
 *
 *   a:     output polynomial
 *   seed:  byte array with seed of length SEEDBYTES
 *   nonce: 2-byte nonce
 */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    stream128_state st;

    stream128_init(&st, seed, nonce);
    PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(a, &st);
    stream128_release(&st);
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1 | |||
* | |||
* Description: Sample polynomial with uniformly random coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream | |||
* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). | |||
* | |||
* Arguments: - poly *a: pointer to output polynomial | |||
* - const uint8_t seed[]: byte array with seed of length CRHBYTES | |||
* - uint16_t nonce: 16-bit nonce | |||
**************************************************/ | |||
#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES) | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) { | |||
/* PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack reads 14 additional bytes */ | |||
ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf; | |||
stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(a, buf.coeffs); | |||
} | |||
/*
 * Sample a polynomial with coefficients in [-(GAMMA1 - 1), GAMMA1] from the
 * stream derived from (seed, nonce): initialise a stream256 state, run
 * poly_uniform_gamma1_preinit on it and release the state again.
 *
 *   a:     output polynomial
 *   seed:  byte array with seed of length CRHBYTES
 *   nonce: 16-bit nonce
 */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    stream256_state st;

    stream256_init(&st, seed, nonce);
    PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(a, &st);
    stream256_release(&st);
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_challenge | |||
* | |||
* Description: Implementation of H. Samples polynomial with TAU nonzero | |||
* coefficients in {-1,1} using the output stream of | |||
* SHAKE256(seed). | |||
* | |||
* Arguments: - poly *c: pointer to output polynomial | |||
* - const uint8_t mu[]: byte array containing seed of length SEEDBYTES | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) { | |||
unsigned int i, b, pos; | |||
uint64_t signs; | |||
ALIGNED_UINT8(SHAKE256_RATE) buf; | |||
shake256incctx state; | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, seed, SEEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); | |||
memcpy(&signs, buf.coeffs, 8); | |||
pos = 8; | |||
memset(c->vec, 0, sizeof(poly)); | |||
for (i = N - TAU; i < N; ++i) { | |||
do { | |||
if (pos >= SHAKE256_RATE) { | |||
shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); | |||
pos = 0; | |||
} | |||
b = buf.coeffs[pos++]; | |||
} while (b > i); | |||
c->coeffs[i] = c->coeffs[b]; | |||
c->coeffs[b] = 1 - 2 * (signs & 1); | |||
signs >>= 1; | |||
} | |||
shake256_inc_ctx_release(&state); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack | |||
* | |||
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYETA_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) { | |||
unsigned int i; | |||
uint8_t t[8]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
t[0] = ETA - a->coeffs[8 * i + 0]; | |||
t[1] = ETA - a->coeffs[8 * i + 1]; | |||
t[2] = ETA - a->coeffs[8 * i + 2]; | |||
t[3] = ETA - a->coeffs[8 * i + 3]; | |||
t[4] = ETA - a->coeffs[8 * i + 4]; | |||
t[5] = ETA - a->coeffs[8 * i + 5]; | |||
t[6] = ETA - a->coeffs[8 * i + 6]; | |||
t[7] = ETA - a->coeffs[8 * i + 7]; | |||
r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); | |||
r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); | |||
r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack | |||
* | |||
* Description: Unpack polynomial with coefficients in [-ETA,ETA]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; | |||
r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; | |||
r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; | |||
r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; | |||
r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; | |||
r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; | |||
r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; | |||
r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; | |||
r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; | |||
r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; | |||
r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; | |||
r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; | |||
r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; | |||
r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; | |||
r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; | |||
r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack | |||
* | |||
* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. | |||
* Input coefficients are assumed to be positive standard representatives. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYT1_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0); | |||
r[5 * i + 1] = (a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2); | |||
r[5 * i + 2] = (a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4); | |||
r[5 * i + 3] = (a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6); | |||
r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack | |||
* | |||
* Description: Unpack polynomial t1 with 10-bit coefficients. | |||
* Output coefficients are positive standard representatives. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; | |||
r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; | |||
r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; | |||
r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack | |||
* | |||
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYT0_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) { | |||
unsigned int i; | |||
uint32_t t[8]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; | |||
t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; | |||
t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; | |||
t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; | |||
t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; | |||
t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; | |||
t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; | |||
t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; | |||
r[13 * i + 0] = t[0]; | |||
r[13 * i + 1] = t[0] >> 8; | |||
r[13 * i + 1] |= t[1] << 5; | |||
r[13 * i + 2] = t[1] >> 3; | |||
r[13 * i + 3] = t[1] >> 11; | |||
r[13 * i + 3] |= t[2] << 2; | |||
r[13 * i + 4] = t[2] >> 6; | |||
r[13 * i + 4] |= t[3] << 7; | |||
r[13 * i + 5] = t[3] >> 1; | |||
r[13 * i + 6] = t[3] >> 9; | |||
r[13 * i + 6] |= t[4] << 4; | |||
r[13 * i + 7] = t[4] >> 4; | |||
r[13 * i + 8] = t[4] >> 12; | |||
r[13 * i + 8] |= t[5] << 1; | |||
r[13 * i + 9] = t[5] >> 7; | |||
r[13 * i + 9] |= t[6] << 6; | |||
r[13 * i + 10] = t[6] >> 2; | |||
r[13 * i + 11] = t[6] >> 10; | |||
r[13 * i + 11] |= t[7] << 3; | |||
r[13 * i + 12] = t[7] >> 5; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack | |||
* | |||
* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) { | |||
unsigned int i; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; ++i) { | |||
r->coeffs[8 * i + 0] = a[13 * i + 0]; | |||
r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; | |||
r->coeffs[8 * i + 0] &= 0x1FFF; | |||
r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; | |||
r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; | |||
r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; | |||
r->coeffs[8 * i + 1] &= 0x1FFF; | |||
r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; | |||
r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; | |||
r->coeffs[8 * i + 2] &= 0x1FFF; | |||
r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; | |||
r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; | |||
r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; | |||
r->coeffs[8 * i + 3] &= 0x1FFF; | |||
r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; | |||
r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; | |||
r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; | |||
r->coeffs[8 * i + 4] &= 0x1FFF; | |||
r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; | |||
r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; | |||
r->coeffs[8 * i + 5] &= 0x1FFF; | |||
r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; | |||
r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; | |||
r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; | |||
r->coeffs[8 * i + 6] &= 0x1FFF; | |||
r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; | |||
r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; | |||
r->coeffs[8 * i + 7] &= 0x1FFF; | |||
r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; | |||
r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; | |||
r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; | |||
r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; | |||
r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; | |||
r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; | |||
r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; | |||
r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack | |||
* | |||
* Description: Bit-pack polynomial with coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1]. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYZ_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) { | |||
unsigned int i; | |||
uint32_t t[4]; | |||
DBENCH_START(); | |||
for (i = 0; i < N / 4; ++i) { | |||
t[0] = GAMMA1 - a->coeffs[4 * i + 0]; | |||
t[1] = GAMMA1 - a->coeffs[4 * i + 1]; | |||
t[2] = GAMMA1 - a->coeffs[4 * i + 2]; | |||
t[3] = GAMMA1 - a->coeffs[4 * i + 3]; | |||
r[9 * i + 0] = t[0]; | |||
r[9 * i + 1] = t[0] >> 8; | |||
r[9 * i + 2] = t[0] >> 16; | |||
r[9 * i + 2] |= t[1] << 2; | |||
r[9 * i + 3] = t[1] >> 6; | |||
r[9 * i + 4] = t[1] >> 14; | |||
r[9 * i + 4] |= t[2] << 4; | |||
r[9 * i + 5] = t[2] >> 4; | |||
r[9 * i + 6] = t[2] >> 12; | |||
r[9 * i + 6] |= t[3] << 6; | |||
r[9 * i + 7] = t[3] >> 2; | |||
r[9 * i + 8] = t[3] >> 10; | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack | |||
* | |||
* Description: Unpack polynomial z with coefficients | |||
* in [-(GAMMA1 - 1), GAMMA1]. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: byte array with bit-packed polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 14]) { | |||
unsigned int i; | |||
__m256i f; | |||
const __m256i shufbidx = _mm256_set_epi8(-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1, | |||
-1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0); | |||
const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0); | |||
const __m256i mask = _mm256_set1_epi32(0x3FFFF); | |||
const __m256i gamma1 = _mm256_set1_epi32(GAMMA1); | |||
DBENCH_START(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_loadu_si256((__m256i *)&a[18 * i]); | |||
f = _mm256_permute4x64_epi64(f, 0x94); | |||
f = _mm256_shuffle_epi8(f, shufbidx); | |||
f = _mm256_srlv_epi32(f, srlvdidx); | |||
f = _mm256_and_si256(f, mask); | |||
f = _mm256_sub_epi32(gamma1, f); | |||
_mm256_store_si256(&r->vec[i], f); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack | |||
* | |||
* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. | |||
* Input coefficients are assumed to be positive standard representatives. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array with at least | |||
* POLYW1_PACKEDBYTES bytes | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *restrict a) { | |||
unsigned int i; | |||
__m256i f0, f1, f2, f3; | |||
const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1); | |||
const __m256i shift2 = _mm256_set1_epi32((4096 << 16) + 1); | |||
const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); | |||
const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0); | |||
const __m256i shufbidx = _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, | |||
-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); | |||
DBENCH_START(); | |||
for (i = 0; i < N / 32; i++) { | |||
f0 = _mm256_load_si256(&a->vec[4 * i + 0]); | |||
f1 = _mm256_load_si256(&a->vec[4 * i + 1]); | |||
f2 = _mm256_load_si256(&a->vec[4 * i + 2]); | |||
f3 = _mm256_load_si256(&a->vec[4 * i + 3]); | |||
f0 = _mm256_packus_epi32(f0, f1); | |||
f1 = _mm256_packus_epi32(f2, f3); | |||
f0 = _mm256_packus_epi16(f0, f1); | |||
f0 = _mm256_maddubs_epi16(f0, shift1); | |||
f0 = _mm256_madd_epi16(f0, shift2); | |||
f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1); | |||
f0 = _mm256_shuffle_epi8(f0, shufbidx); | |||
f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2); | |||
_mm256_storeu_si256((__m256i *)&r[24 * i], f0); | |||
} | |||
DBENCH_STOP(*tpack); | |||
} |
@@ -0,0 +1,52 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_POLY_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_POLY_H | |||
/* Single-polynomial type and operations of the Dilithium2-AES AVX2 code. */
#include "align.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/* Polynomial with N coefficients. The implementation accesses it both
 * element-wise (->coeffs[]) and as AVX2 vectors (->vec[]). */
typedef ALIGNED_INT32(N) poly; | |||
/* Coefficient reduction / normalisation. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(poly *a); | |||
/* Coefficient-wise arithmetic. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_add(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(poly *a); | |||
/* NTT-domain transforms and multiplication. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); | |||
/* Rounding, decomposition and hints. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); | |||
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); | |||
/* Infinity-norm check against bound B. */
int PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(const poly *a, int32_t B); | |||
/* Sampling. The *_preinit variants take an already set-up stream state
 * instead of a seed/nonce pair. */
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); | |||
/* Bit-packing to / from byte arrays. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); | |||
/* The next two use whole 32-byte vector loads/stores on the byte array, so
 * their buffers are declared with extra bytes beyond the packed size. */
void PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a); | |||
#endif | |||
@@ -0,0 +1,449 @@ | |||
#include "aes256ctr.h" | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
#define UNUSED(x) (void)x | |||
/************************************************* | |||
* Name: expand_mat | |||
* | |||
* Description: Implementation of ExpandA. Generates matrix A with uniformly | |||
* random coefficients a_{i,j} by performing rejection | |||
* sampling on the output stream of SHAKE128(rho|j|i) | |||
* or AES256CTR(rho,j|i). | |||
* | |||
* Arguments: - polyvecl mat[K]: output matrix | |||
* - const uint8_t rho[]: byte array containing seed rho | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { | |||
unsigned int i, j; | |||
uint64_t nonce; | |||
aes256ctr_ctx state; | |||
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&state, rho, 0); | |||
for (i = 0; i < K; i++) { | |||
for (j = 0; j < L; j++) { | |||
nonce = (i << 8) + j; | |||
state.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&mat[i].vec[j], &state); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&mat[i].vec[j]); | |||
} | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery
*
* Description: Matrix-vector product in the NTT domain: entry i of t is the
*              accumulated pointwise product of row i of mat with v.
*
* Arguments:   - polyveck *t: pointer to output vector
*              - const polyvecl mat[K]: input matrix (K rows)
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
    unsigned int row = 0;

    while (row < K) {
        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[row], &mat[row], v);
        ++row;
    }
}
/**************************************************************/ | |||
/************ Vectors of polynomials of length L **************/ | |||
/**************************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1
*
* Description: Sample every polynomial of a length-L vector with
*              poly_uniform_gamma1. Entry i uses L*nonce + i (reduced to
*              16 bits) as its nonce.
*
* Arguments:   - polyvecl *v: pointer to output vector
*              - const uint8_t seed[]: byte array containing the seed
*              - uint16_t nonce: nonce of the whole vector
*
* NOTE(review): seed is declared with SEEDBYTES here, while
*              poly_uniform_gamma1 declares its seed with CRHBYTES.
*              Confirm that callers pass a buffer of the length the
*              callee expects.
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    unsigned int k = 0;

    while (k < L) {
        const uint16_t entry_nonce = (uint16_t)(L * nonce + k);

        PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(&v->vec[k], seed, entry_nonce);
        ++k;
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce
*
* Description: Apply poly_reduce to every polynomial in a vector of
*              length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(polyvecl *v) {
    poly *const end = v->vec + L;

    for (poly *p = v->vec; p != end; ++p) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(p);
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length L | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add | |||
* | |||
* Description: Add vectors of polynomials of length L. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyvecl *w: pointer to output vector | |||
* - const polyvecl *u: pointer to first summand | |||
* - const polyvecl *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length L. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyvecl *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(polyvecl *v) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont
*
* Description: Apply poly_invntt_tomont to every polynomial in a vector of
*              length L.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont(polyvecl *v) {
    for (unsigned int k = 0; k < L; ++k) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&v->vec[k]);
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-L vector by the same
*              polynomial a using poly_pointwise_montgomery.
*
* Arguments:   - polyvecl *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyvecl *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&r->vec[k], a, &v->vec[k]);
        ++k;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply vectors of polynomials of length L, multiply | |||
* resulting vector by 2^{-32} and add (accumulate) polynomials | |||
* in it. Input/output vectors are in NTT domain representation. | |||
* | |||
* Arguments: - poly *w: output polynomial | |||
* - const polyvecl *u: pointer to first input vector | |||
* - const polyvecl *v: pointer to second input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length L. | |||
* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(). | |||
* | |||
* Arguments: - const polyvecl *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < L; ++i) { | |||
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/**************************************************************/ | |||
/************ Vectors of polynomials of length K **************/ | |||
/**************************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to representatives in [-6283009,6283007]. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq | |||
* | |||
* Description: For all coefficients of polynomials in vector of length K | |||
* add Q if coefficient is negative. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze | |||
* | |||
* Description: Reduce coefficients of polynomials in vector of length K | |||
* to standard representatives. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add | |||
* | |||
* Description: Add vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first summand | |||
* - const polyveck *v: pointer to second summand | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub | |||
* | |||
* Description: Subtract vectors of polynomials of length K. | |||
* No modular reduction is performed. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector | |||
* - const polyveck *u: pointer to first input vector | |||
* - const polyveck *v: pointer to second input vector to be | |||
* subtracted from first input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl | |||
* | |||
* Description: Multiply vector of polynomials of Length K by 2^D without modular | |||
* reduction. Assumes input coefficients to be less than 2^{31-D}. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt | |||
* | |||
* Description: Forward NTT of all polynomials in vector of length K. Output | |||
* coefficients can be up to 16*Q larger than input coefficients. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont | |||
* | |||
* Description: Inverse NTT and multiplication by 2^{32} of polynomials | |||
* in vector of length K. Input coefficients need to be less | |||
* than 2*Q. | |||
* | |||
* Arguments: - polyveck *v: pointer to input/output vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&v->vec[i]); | |||
} | |||
} | |||
/*************************************************
* Name:        PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery
*
* Description: Multiply every polynomial of a length-K vector by the same
*              polynomial a using poly_pointwise_montgomery.
*
* Arguments:   - polyveck *r: pointer to output vector
*              - const poly *a: pointer to the common factor
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&r->vec[k], a, &v->vec[k]);
        ++k;
    }
}
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm | |||
* | |||
* Description: Check infinity norm of polynomials in vector of length K. | |||
* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(). | |||
* | |||
* Arguments: - const polyveck *v: pointer to vector | |||
* - int32_t B: norm bound | |||
* | |||
* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 | |||
* and 1 otherwise. | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&v->vec[i], bound)) { | |||
return 1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 | |||
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be | |||
* standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose | |||
* | |||
* Description: For all coefficients a of polynomials in vector of length K, | |||
* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 | |||
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we | |||
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. | |||
* Assumes coefficients to be standard representatives. | |||
* | |||
* Arguments: - polyveck *v1: pointer to output vector of polynomials with | |||
* coefficients a1 | |||
* - polyveck *v0: pointer to output vector of polynomials with | |||
* coefficients a0 | |||
* - const polyveck *v: pointer to input vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint | |||
* | |||
* Description: Compute hint vector. | |||
* | |||
* Arguments: - uint8_t *hint: pointer to output hint array | |||
* - const polyveck *v0: pointer to low part of input vector | |||
* - const polyveck *v1: pointer to high part of input vector | |||
* | |||
* Returns number of 1 bits. | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { | |||
unsigned int i, n = 0; | |||
for (i = 0; i < K; ++i) { | |||
n += PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); | |||
} | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint | |||
* | |||
* Description: Use hint vector to correct the high bits of input vector. | |||
* | |||
* Arguments: - polyveck *w: pointer to output vector of polynomials with | |||
* corrected high bits | |||
* - const polyveck *u: pointer to input vector | |||
* - const polyveck *h: pointer to input hint vector | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); | |||
} | |||
} | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { | |||
unsigned int i; | |||
for (i = 0; i < K; ++i) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); | |||
} | |||
} |
@@ -0,0 +1,64 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_POLYVEC_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_POLYVEC_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
/* Vectors of polynomials of length L */ | |||
/* polyvecl wraps a fixed-size array `vec` of L `poly` elements (L and poly
 * come from the params.h / poly.h includes above). */
typedef struct { | |||
poly vec[L]; | |||
} polyvecl; | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont(polyvecl *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, | |||
const polyvecl *u, | |||
const polyvecl *v); | |||
int PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); | |||
/* Vectors of polynomials of length K */ | |||
/* polyveck wraps a fixed-size array `vec` of K `poly` elements (K and poly
 * come from the params.h / poly.h includes above). */
typedef struct { | |||
poly vec[K]; | |||
} polyveck; | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze(polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl(polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); | |||
int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); | |||
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); | |||
#endif |
@@ -0,0 +1,394 @@ | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/*
 * Lookup table indexed by an 8-bit mask. Row m lists, in increasing order,
 * the positions (0..7) of the bits that are set in m; the remaining entries
 * of the row are padded with 0. The rejection samplers in this file load a
 * row and use it as a permute/shuffle control so that the lanes selected by
 * the mask are moved to the front of the vector.
 */
const uint8_t PQCLEAN_DILITHIUM2AES_AVX2_idxlut[256][8] = { | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 2, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 0, 0, 0, 0, 0}, | |||
{ 3, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 3, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 0, 0, 0, 0, 0}, | |||
{ 2, 3, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 0, 0, 0, 0, 0}, | |||
{ 1, 2, 3, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 0, 0, 0, 0}, | |||
{ 4, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 4, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 0, 0, 0, 0, 0}, | |||
{ 1, 2, 4, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 4, 0, 0, 0, 0}, | |||
{ 3, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 4, 0, 0, 0, 0, 0}, | |||
{ 1, 3, 4, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 4, 0, 0, 0, 0}, | |||
{ 2, 3, 4, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 4, 0, 0, 0, 0}, | |||
{ 1, 2, 3, 4, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 4, 0, 0, 0}, | |||
{ 5, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 5, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 5, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 5, 0, 0, 0, 0, 0}, | |||
{ 2, 5, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 5, 0, 0, 0, 0, 0}, | |||
{ 1, 2, 5, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 5, 0, 0, 0, 0}, | |||
{ 3, 5, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 5, 0, 0, 0, 0, 0}, | |||
{ 1, 3, 5, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 5, 0, 0, 0, 0}, | |||
{ 2, 3, 5, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 5, 0, 0, 0, 0}, | |||
{ 1, 2, 3, 5, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 5, 0, 0, 0}, | |||
{ 4, 5, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 5, 0, 0, 0, 0, 0}, | |||
{ 1, 4, 5, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 4, 5, 0, 0, 0, 0}, | |||
{ 2, 4, 5, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 5, 0, 0, 0, 0}, | |||
{ 1, 2, 4, 5, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 4, 5, 0, 0, 0}, | |||
{ 3, 4, 5, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 4, 5, 0, 0, 0, 0}, | |||
{ 1, 3, 4, 5, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 4, 5, 0, 0, 0}, | |||
{ 2, 3, 4, 5, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 4, 5, 0, 0, 0}, | |||
{ 1, 2, 3, 4, 5, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 4, 5, 0, 0}, | |||
{ 6, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 6, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 0, 0, 0, 0, 0}, | |||
{ 1, 2, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 6, 0, 0, 0, 0}, | |||
{ 3, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 6, 0, 0, 0, 0, 0}, | |||
{ 1, 3, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 6, 0, 0, 0, 0}, | |||
{ 2, 3, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 6, 0, 0, 0, 0}, | |||
{ 1, 2, 3, 6, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 6, 0, 0, 0}, | |||
{ 4, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 1, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 4, 6, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 0, 0, 0, 0}, | |||
{ 1, 2, 4, 6, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 4, 6, 0, 0, 0}, | |||
{ 3, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 4, 6, 0, 0, 0, 0}, | |||
{ 1, 3, 4, 6, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 4, 6, 0, 0, 0}, | |||
{ 2, 3, 4, 6, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 4, 6, 0, 0, 0}, | |||
{ 1, 2, 3, 4, 6, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 4, 6, 0, 0}, | |||
{ 5, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 5, 6, 0, 0, 0, 0, 0}, | |||
{ 1, 5, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 5, 6, 0, 0, 0, 0}, | |||
{ 2, 5, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 5, 6, 0, 0, 0, 0}, | |||
{ 1, 2, 5, 6, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 5, 6, 0, 0, 0}, | |||
{ 3, 5, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 5, 6, 0, 0, 0, 0}, | |||
{ 1, 3, 5, 6, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 5, 6, 0, 0, 0}, | |||
{ 2, 3, 5, 6, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 5, 6, 0, 0, 0}, | |||
{ 1, 2, 3, 5, 6, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 5, 6, 0, 0}, | |||
{ 4, 5, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 5, 6, 0, 0, 0, 0}, | |||
{ 1, 4, 5, 6, 0, 0, 0, 0}, | |||
{ 0, 1, 4, 5, 6, 0, 0, 0}, | |||
{ 2, 4, 5, 6, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 5, 6, 0, 0, 0}, | |||
{ 1, 2, 4, 5, 6, 0, 0, 0}, | |||
{ 0, 1, 2, 4, 5, 6, 0, 0}, | |||
{ 3, 4, 5, 6, 0, 0, 0, 0}, | |||
{ 0, 3, 4, 5, 6, 0, 0, 0}, | |||
{ 1, 3, 4, 5, 6, 0, 0, 0}, | |||
{ 0, 1, 3, 4, 5, 6, 0, 0}, | |||
{ 2, 3, 4, 5, 6, 0, 0, 0}, | |||
{ 0, 2, 3, 4, 5, 6, 0, 0}, | |||
{ 1, 2, 3, 4, 5, 6, 0, 0}, | |||
{ 0, 1, 2, 3, 4, 5, 6, 0}, | |||
{ 7, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 7, 0, 0, 0, 0, 0, 0}, | |||
{ 1, 7, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 7, 0, 0, 0, 0, 0}, | |||
{ 2, 7, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 7, 0, 0, 0, 0, 0}, | |||
{ 1, 2, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 7, 0, 0, 0, 0}, | |||
{ 3, 7, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 7, 0, 0, 0, 0, 0}, | |||
{ 1, 3, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 7, 0, 0, 0, 0}, | |||
{ 2, 3, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 7, 0, 0, 0, 0}, | |||
{ 1, 2, 3, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 7, 0, 0, 0}, | |||
{ 4, 7, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 7, 0, 0, 0, 0, 0}, | |||
{ 1, 4, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 4, 7, 0, 0, 0, 0}, | |||
{ 2, 4, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 7, 0, 0, 0, 0}, | |||
{ 1, 2, 4, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 4, 7, 0, 0, 0}, | |||
{ 3, 4, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 4, 7, 0, 0, 0, 0}, | |||
{ 1, 3, 4, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 4, 7, 0, 0, 0}, | |||
{ 2, 3, 4, 7, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 4, 7, 0, 0, 0}, | |||
{ 1, 2, 3, 4, 7, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 4, 7, 0, 0}, | |||
{ 5, 7, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 5, 7, 0, 0, 0, 0, 0}, | |||
{ 1, 5, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 5, 7, 0, 0, 0, 0}, | |||
{ 2, 5, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 5, 7, 0, 0, 0, 0}, | |||
{ 1, 2, 5, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 5, 7, 0, 0, 0}, | |||
{ 3, 5, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 5, 7, 0, 0, 0, 0}, | |||
{ 1, 3, 5, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 5, 7, 0, 0, 0}, | |||
{ 2, 3, 5, 7, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 5, 7, 0, 0, 0}, | |||
{ 1, 2, 3, 5, 7, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 5, 7, 0, 0}, | |||
{ 4, 5, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 5, 7, 0, 0, 0, 0}, | |||
{ 1, 4, 5, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 4, 5, 7, 0, 0, 0}, | |||
{ 2, 4, 5, 7, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 5, 7, 0, 0, 0}, | |||
{ 1, 2, 4, 5, 7, 0, 0, 0}, | |||
{ 0, 1, 2, 4, 5, 7, 0, 0}, | |||
{ 3, 4, 5, 7, 0, 0, 0, 0}, | |||
{ 0, 3, 4, 5, 7, 0, 0, 0}, | |||
{ 1, 3, 4, 5, 7, 0, 0, 0}, | |||
{ 0, 1, 3, 4, 5, 7, 0, 0}, | |||
{ 2, 3, 4, 5, 7, 0, 0, 0}, | |||
{ 0, 2, 3, 4, 5, 7, 0, 0}, | |||
{ 1, 2, 3, 4, 5, 7, 0, 0}, | |||
{ 0, 1, 2, 3, 4, 5, 7, 0}, | |||
{ 6, 7, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 7, 0, 0, 0, 0, 0}, | |||
{ 1, 6, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 1, 6, 7, 0, 0, 0, 0}, | |||
{ 2, 6, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 7, 0, 0, 0, 0}, | |||
{ 1, 2, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 2, 6, 7, 0, 0, 0}, | |||
{ 3, 6, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 3, 6, 7, 0, 0, 0, 0}, | |||
{ 1, 3, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 3, 6, 7, 0, 0, 0}, | |||
{ 2, 3, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 2, 3, 6, 7, 0, 0, 0}, | |||
{ 1, 2, 3, 6, 7, 0, 0, 0}, | |||
{ 0, 1, 2, 3, 6, 7, 0, 0}, | |||
{ 4, 6, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 7, 0, 0, 0, 0}, | |||
{ 1, 4, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 4, 6, 7, 0, 0, 0}, | |||
{ 2, 4, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 7, 0, 0, 0}, | |||
{ 1, 2, 4, 6, 7, 0, 0, 0}, | |||
{ 0, 1, 2, 4, 6, 7, 0, 0}, | |||
{ 3, 4, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 3, 4, 6, 7, 0, 0, 0}, | |||
{ 1, 3, 4, 6, 7, 0, 0, 0}, | |||
{ 0, 1, 3, 4, 6, 7, 0, 0}, | |||
{ 2, 3, 4, 6, 7, 0, 0, 0}, | |||
{ 0, 2, 3, 4, 6, 7, 0, 0}, | |||
{ 1, 2, 3, 4, 6, 7, 0, 0}, | |||
{ 0, 1, 2, 3, 4, 6, 7, 0}, | |||
{ 5, 6, 7, 0, 0, 0, 0, 0}, | |||
{ 0, 5, 6, 7, 0, 0, 0, 0}, | |||
{ 1, 5, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 1, 5, 6, 7, 0, 0, 0}, | |||
{ 2, 5, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 2, 5, 6, 7, 0, 0, 0}, | |||
{ 1, 2, 5, 6, 7, 0, 0, 0}, | |||
{ 0, 1, 2, 5, 6, 7, 0, 0}, | |||
{ 3, 5, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 3, 5, 6, 7, 0, 0, 0}, | |||
{ 1, 3, 5, 6, 7, 0, 0, 0}, | |||
{ 0, 1, 3, 5, 6, 7, 0, 0}, | |||
{ 2, 3, 5, 6, 7, 0, 0, 0}, | |||
{ 0, 2, 3, 5, 6, 7, 0, 0}, | |||
{ 1, 2, 3, 5, 6, 7, 0, 0}, | |||
{ 0, 1, 2, 3, 5, 6, 7, 0}, | |||
{ 4, 5, 6, 7, 0, 0, 0, 0}, | |||
{ 0, 4, 5, 6, 7, 0, 0, 0}, | |||
{ 1, 4, 5, 6, 7, 0, 0, 0}, | |||
{ 0, 1, 4, 5, 6, 7, 0, 0}, | |||
{ 2, 4, 5, 6, 7, 0, 0, 0}, | |||
{ 0, 2, 4, 5, 6, 7, 0, 0}, | |||
{ 1, 2, 4, 5, 6, 7, 0, 0}, | |||
{ 0, 1, 2, 4, 5, 6, 7, 0}, | |||
{ 3, 4, 5, 6, 7, 0, 0, 0}, | |||
{ 0, 3, 4, 5, 6, 7, 0, 0}, | |||
{ 1, 3, 4, 5, 6, 7, 0, 0}, | |||
{ 0, 1, 3, 4, 5, 6, 7, 0}, | |||
{ 2, 3, 4, 5, 6, 7, 0, 0}, | |||
{ 0, 2, 3, 4, 5, 6, 7, 0}, | |||
{ 1, 2, 3, 4, 5, 6, 7, 0}, | |||
{ 0, 1, 2, 3, 4, 5, 6, 7} | |||
}; | |||
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
__m256i d, tmp; | |||
const __m256i bound = _mm256_set1_epi32(Q); | |||
const __m256i mask = _mm256_set1_epi32(0x7FFFFF); | |||
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, | |||
-1, 9, 8, 7, -1, 6, 5, 4, | |||
-1, 11, 10, 9, -1, 8, 7, 6, | |||
-1, 5, 4, 3, -1, 2, 1, 0); | |||
ctr = pos = 0; | |||
while (pos <= REJ_UNIFORM_BUFLEN - 24) { | |||
d = _mm256_loadu_si256((__m256i *)&buf[pos]); | |||
d = _mm256_permute4x64_epi64(d, 0x94); | |||
d = _mm256_shuffle_epi8(d, idx8); | |||
d = _mm256_and_si256(d, mask); | |||
pos += 24; | |||
tmp = _mm256_sub_epi32(d, bound); | |||
good = _mm256_movemask_ps((__m256)tmp); | |||
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good])); | |||
d = _mm256_permutevar8x32_epi32(d, tmp); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], d); | |||
ctr += _mm_popcnt_u32(good); | |||
} | |||
return ctr; | |||
} | |||
/*
 * Rejection sampling of small coefficients from 4-bit values.
 * Every input byte provides two nibbles; a nibble t is accepted when t < 15
 * and is turned into an output value derived from ETA - t (see the reduction
 * steps below). At most N values are written to r; the function returns the
 * number of values written. A vectorized loop handles 16 input bytes per
 * iteration; a scalar loop finishes the remaining bytes / coefficients.
 */
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { | |||
unsigned int ctr, pos; | |||
uint32_t good; | |||
__m256i f0, f1, f2; | |||
__m128i g0, g1; | |||
const __m256i mask = _mm256_set1_epi8(15); | |||
const __m256i eta = _mm256_set1_epi8(ETA); | |||
const __m256i bound = mask; | |||
/* v and p drive the multiply-based reduction applied to every chunk below. */
const __m256i v = _mm256_set1_epi32(-6560); | |||
const __m256i p = _mm256_set1_epi32(5); | |||
ctr = pos = 0; | |||
/* Each chunk below stores a full 8-lane vector at r[ctr], hence the ctr <= N - 8 guards. */
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { | |||
/* Widen 16 bytes to 16-bit lanes, then split each byte into its low nibble
 * (low byte of the lane) and its high nibble (high byte of the lane). */
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); | |||
f1 = _mm256_slli_epi16(f0, 4); | |||
f0 = _mm256_or_si256(f0, f1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
/* f1 = nibble - 15: the sign bit is set exactly for accepted nibbles (< 15). */
f1 = _mm256_sub_epi8(f0, bound); | |||
/* f0 = ETA - nibble, per byte. */
f0 = _mm256_sub_epi8(eta, f0); | |||
/* 32-bit acceptance mask, one bit per nibble; consumed 8 bits at a time below. */
good = _mm256_movemask_epi8(f1); | |||
/* Chunk 1 of 4: nibbles 0..7 (input bytes pos..pos+3). */
g0 = _mm256_castsi256_si128(f0); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]); | |||
/* Compact the accepted bytes to the front and sign-extend them to 32 bits. */
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
/* NOTE(review): mulhrs by -6560 followed by a multiply by 5 and an add looks like
 * the vector counterpart of the scalar `t - (205 * t >> 10) * 5` reduction in the
 * tail loop below -- confirm. */
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
/* Chunk 2 of 4: upper 8 bytes of the low 128-bit half. */
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
/* Chunk 3 of 4: lower 8 bytes of the high 128-bit half. */
g0 = _mm256_extracti128_si256(f0, 1); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good & 0xFF); | |||
good >>= 8; | |||
pos += 4; | |||
if (ctr > N - 8) { | |||
break; | |||
} | |||
/* Chunk 4 of 4: upper 8 bytes of the high half. After three shifts by 8 only
 * the last 8 mask bits remain in good, so no & 0xFF is applied here. */
g0 = _mm_bsrli_si128(g0, 8); | |||
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good]); | |||
g1 = _mm_shuffle_epi8(g0, g1); | |||
f1 = _mm256_cvtepi8_epi32(g1); | |||
f2 = _mm256_mulhrs_epi16(f1, v); | |||
f2 = _mm256_mullo_epi16(f2, p); | |||
f1 = _mm256_add_epi32(f1, f2); | |||
_mm256_storeu_si256((__m256i *)&r[ctr], f1); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 4; | |||
} | |||
/* Scalar tail: one input byte (two nibbles) per iteration until r is full or the buffer ends. */
uint32_t t0, t1; | |||
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { | |||
t0 = buf[pos] & 0x0F; | |||
t1 = buf[pos++] >> 4; | |||
if (t0 < 15) { | |||
/* (205 * t >> 10) equals t / 5 for t in 0..14, so this is t mod 5. */
t0 = t0 - (205 * t0 >> 10) * 5; | |||
r[ctr++] = 2 - t0; | |||
} | |||
/* The second nibble is only used while there is still room in r. */
if (t1 < 15 && ctr < N) { | |||
t1 = t1 - (205 * t1 >> 10) * 5; | |||
r[ctr++] = 2 - t1; | |||
} | |||
} | |||
return ctr; | |||
} | |||
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_REJSAMPLE_H
#define PQCLEAN_DILITHIUM2AES_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

/* Buffer sizes for the rejection samplers, rounded up to a whole number of
 * STREAM128 blocks: at least 768 bytes for the uniform sampler and at least
 * 137 bytes for the eta sampler. */
#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)

#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)

/* Row m holds the positions of the set bits of the 8-bit mask m (see rejsample.c). */
extern const uint8_t PQCLEAN_DILITHIUM2AES_AVX2_idxlut[256][8];

/* buf needs 8 spare bytes behind REJ_UNIFORM_BUFLEN for the 32-byte loads. */
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);
/* The array bound matches the definition in rejsample.c (REJ_UNIFORM_ETA_BUFLEN). */
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]);
#endif
@@ -0,0 +1,157 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include "rounding.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* Per-32-bit-lane blend built on _mm256_blendv_ps: the integer vectors are
 * reinterpreted as float vectors, blended, and reinterpreted back. A lane is
 * taken from b where the sign bit of the corresponding lane of mask is set,
 * otherwise from a. */
#define _mm256_blendv_epi32(a,b,mask) \ | |||
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ | |||
_mm256_castsi256_ps(b), \ | |||
_mm256_castsi256_ps(mask))) | |||
/************************************************* | |||
* Name: power2round | |||
* | |||
* Description: For finite field elements a, compute a0, a1 such that | |||
* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. | |||
* Assumes a to be positive standard representative. | |||
* | |||
* Arguments: - __m256i *a1: output array of length N/8 with high bits | |||
* - __m256i *a0: output array of length N/8 with low bits a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1; | |||
const __m256i mask = _mm256_set1_epi32(-(1 << D)); | |||
const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); | |||
for (i = 0; i < N / 8; ++i) { | |||
f = _mm256_load_si256(&a[i]); | |||
f1 = _mm256_add_epi32(f, half); | |||
f0 = _mm256_and_si256(f1, mask); | |||
f1 = _mm256_srli_epi32(f1, D); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: decompose | |||
* | |||
* Description: For finite field element a, compute high and low parts a0, a1 such | |||
* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except | |||
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and | |||
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard | |||
* representative. | |||
* | |||
* Arguments: - __m256i *a1: output array of length N/8 with high parts | |||
* - __m256i *a0: output array of length N/8 with low parts a0 | |||
* - const __m256i *a: input array of length N/8 | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { | |||
unsigned int i; | |||
__m256i f, f0, f1, t; | |||
/* q is read from the shared constant table at offset _8XQ; presumably Q replicated
 * in all eight lanes -- confirm against consts.h. */
const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]); | |||
/* hq = q >> 1 per lane; threshold used to center the low part. */
const __m256i hq = _mm256_srli_epi32(q, 1); | |||
const __m256i v = _mm256_set1_epi32(11275); | |||
/* alpha = 2 * GAMMA2, the step between consecutive high parts. */
const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); | |||
const __m256i off = _mm256_set1_epi32(127); | |||
const __m256i shift = _mm256_set1_epi32(128); | |||
/* Largest high part that is kept; larger estimates are replaced by 0 below. */
const __m256i max = _mm256_set1_epi32(43); | |||
const __m256i zero = _mm256_setzero_si256(); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a[i]); | |||
/* Estimate the high part with fixed-point arithmetic: (f + 127) >> 7, then the
 * high 16 bits of the product with 11275, then a rounding shift right by 8
 * (mulhrs with 128). The 16-bit multiplies assume the intermediate fits the low
 * 16 bits of each 32-bit lane -- TODO confirm for this Q. */
f1 = _mm256_add_epi32(f, off); | |||
f1 = _mm256_srli_epi32(f1, 7); | |||
f1 = _mm256_mulhi_epu16(f1, v); | |||
f1 = _mm256_mulhrs_epi16(f1, shift); | |||
/* t = 43 - f1 is negative when f1 > 43; those lanes are replaced by 0. */
t = _mm256_sub_epi32(max, f1); | |||
f1 = _mm256_blendv_epi32(f1, zero, t); | |||
/* Low part: f0 = f - f1 * alpha. */
f0 = _mm256_mullo_epi32(f1, alpha); | |||
f0 = _mm256_sub_epi32(f, f0); | |||
/* Where f0 > q >> 1, subtract q from f0. */
f = _mm256_cmpgt_epi32(f0, hq); | |||
f = _mm256_and_si256(f, q); | |||
f0 = _mm256_sub_epi32(f0, f); | |||
_mm256_store_si256(&a1[i], f1); | |||
_mm256_store_si256(&a0[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: make_hint | |||
* | |||
* Description: Compute indices of polynomial coefficients whose low bits | |||
* overflow into the high bits. | |||
* | |||
* Arguments: - uint8_t *hint: hint array | |||
* - const __m256i *a0: low bits of input elements | |||
* - const __m256i *a1: high bits of input elements | |||
* | |||
* Returns number of overflowing low bits | |||
**************************************************/ | |||
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { | |||
unsigned int i, n = 0; | |||
__m256i f0, f1, g0, g1; | |||
uint32_t bad; | |||
uint64_t idx; | |||
const __m256i low = _mm256_set1_epi32(-GAMMA2); | |||
const __m256i high = _mm256_set1_epi32(GAMMA2); | |||
for (i = 0; i < N / 8; ++i) { | |||
f0 = _mm256_load_si256(&a0[i]); | |||
f1 = _mm256_load_si256(&a1[i]); | |||
g0 = _mm256_abs_epi32(f0); | |||
g0 = _mm256_cmpgt_epi32(g0, high); | |||
g1 = _mm256_cmpeq_epi32(f0, low); | |||
g1 = _mm256_sign_epi32(g1, f1); | |||
g0 = _mm256_or_si256(g0, g1); | |||
bad = _mm256_movemask_ps((__m256)g0); | |||
memcpy(&idx, PQCLEAN_DILITHIUM2AES_AVX2_idxlut[bad], 8); | |||
idx += (uint64_t)0x0808080808080808 * i; | |||
memcpy(&hint[n], &idx, 8); | |||
n += _mm_popcnt_u32(bad); | |||
} | |||
return n; | |||
} | |||
/************************************************* | |||
* Name: use_hint | |||
* | |||
* Description: Correct high parts according to hint. | |||
* | |||
* Arguments: - __m256i *b: output array of length N/8 with corrected high parts | |||
* - const __m256i *a: input array of length N/8 | |||
* - const __m256i *hint: input array of length N/8 with hint bits | |||
* | |||
**************************************************/ | |||
void PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { | |||
unsigned int i; | |||
/* Scratch buffer for the low parts produced by the decomposition. */
__m256i a0[N / 8]; | |||
__m256i f, g, h, t; | |||
const __m256i zero = _mm256_setzero_si256(); | |||
const __m256i max = _mm256_set1_epi32(43); | |||
/* b receives the uncorrected high parts, a0 the low parts. */
PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(b, a0, a); | |||
for (i = 0; i < N / 8; i++) { | |||
f = _mm256_load_si256(&a0[i]); | |||
g = _mm256_load_si256(&b[i]); | |||
h = _mm256_load_si256(&hint[i]); | |||
/* t = h where the low part is negative, else 0; h - 2*t is then +h for
 * non-negative low parts and -h for negative ones. */
t = _mm256_blendv_epi32(zero, h, f); | |||
t = _mm256_slli_epi32(t, 1); | |||
h = _mm256_sub_epi32(h, t); | |||
g = _mm256_add_epi32(g, h); | |||
/* Wrap around: a negative result becomes 43 ... */
g = _mm256_blendv_epi32(g, max, g); | |||
/* ... and a result above 43 becomes 0. */
f = _mm256_cmpgt_epi32(g, max); | |||
g = _mm256_blendv_epi32(g, zero, f); | |||
_mm256_store_si256(&b[i], g); | |||
} | |||
} | |||
@@ -0,0 +1,12 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_ROUNDING_H | |||
#define PQCLEAN_DILITHIUM2AES_AVX2_ROUNDING_H | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
void PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); | |||
unsigned int PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); | |||
void PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); | |||
#endif |
@@ -0,0 +1,54 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
.text | |||
# Local helper. Permutes in place the eight 32-byte vectors stored at
# offsets 0..224 from the address in rdi, using the shuffle8, shuffle4 and
# shuffle2 macros from shuffle.inc. Uses ymm3-ymm11 and leaves rdi unchanged.
# The aligned vmovdqa accesses require the address to be 32-byte aligned.
nttunpack128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
# Stage A: regroup 128-bit halves of vector pairs (k, k+4).
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
# Stage B: interleave 64-bit elements.
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
# Stage C: interleave 32-bit elements.
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
#store | |||
# After the last stage the results live in ymm9, ymm8, ..., ymm3, ymm11, in that output order.
vmovdqa %ymm9,(%rdi) | |||
vmovdqa %ymm8,32(%rdi) | |||
vmovdqa %ymm7,64(%rdi) | |||
vmovdqa %ymm6,96(%rdi) | |||
vmovdqa %ymm5,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm3,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
# Exported entry point, published under both the cdecl() and _cdecl() label
# spellings (the macros come from cdecl.h; presumably plain and
# underscore-prefixed symbol names -- confirm there).
# Runs nttunpack128_avx on four consecutive 256-byte blocks starting at the
# address in rdi, 1024 bytes in total. On return rdi has been advanced by 768.
.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx) | |||
.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx) | |||
cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx): | |||
_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx): | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
ret | |||
@@ -0,0 +1,25 @@ | |||
# shuffle8 r0,r1,r2,r3 (arguments are ymm register numbers)
# ymm r2 = { low 128 bits of r0, low 128 bits of r1 }
# ymm r3 = { high 128 bits of r0, high 128 bits of r1 }
# r2 is written before r3 is computed, so r2 must differ from r0 and r1.
.macro shuffle8 r0,r1,r2,r3 | |||
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 | |||
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
# shuffle4 r0,r1,r2,r3 (arguments are ymm register numbers)
# Within each 128-bit lane:
# ymm r2 = { low 64 bits of r0, low 64 bits of r1 }
# ymm r3 = { high 64 bits of r0, high 64 bits of r1 }
# r2 is written before r3 is computed, so r2 must differ from r0 and r1.
.macro shuffle4 r0,r1,r2,r3 | |||
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 | |||
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
# shuffle2 r0,r1,r2,r3 (arguments are ymm register numbers)
# Interleaves 32-bit elements:
# ymm r2 = { r0[0], r1[0], r0[2], r1[2], ... } (even elements of both inputs)
# ymm r3 = { r0[1], r1[1], r0[3], r1[3], ... } (odd elements of both inputs)
# ymm r0 is overwritten (shifted right by 32 bits per 64-bit element).
# The two commented-out instructions are alternatives kept by the original author.
.macro shuffle2 r0,r1,r2,r3 | |||
#vpsllq $32,%ymm\r1,%ymm\r2 | |||
vmovsldup %ymm\r1,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrlq $32,%ymm\r0,%ymm\r0 | |||
#vmovshdup %ymm\r0,%ymm\r0 | |||
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
# shuffle1 r0,r1,r2,r3 (arguments are ymm register numbers)
# Interleaves 16-bit elements:
# ymm r2 = { r0[0], r1[0], r0[2], r1[2], ... } (even elements of both inputs)
# ymm r3 = { r0[1], r1[1], r0[3], r1[3], ... } (odd elements of both inputs)
# ymm r0 is overwritten (shifted right by 16 bits per 32-bit element).
.macro shuffle1 r0,r1,r2,r3 | |||
vpslld $16,%ymm\r1,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrld $16,%ymm\r0,%ymm\r0 | |||
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
@@ -0,0 +1,425 @@ | |||
#include "aes256ctr.h" | |||
#include "align.h" | |||
#include "fips202.h" | |||
#include "packing.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "sign.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair | |||
* | |||
* Description: Generates public and private key. | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (allocated | |||
* array of PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (allocated | |||
* array of PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { | |||
unsigned int i; | |||
uint8_t seedbuf[3 * SEEDBYTES]; | |||
const uint8_t *rho, *rhoprime, *key; | |||
uint64_t nonce; | |||
aes256ctr_ctx aesctx; | |||
polyvecl rowbuf[1]; | |||
polyvecl s1, *row = rowbuf; | |||
polyveck s2; | |||
poly t1, t0; | |||
/* Get randomness for rho, rhoprime and key */ | |||
randombytes(seedbuf, SEEDBYTES); | |||
shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); | |||
rho = seedbuf; | |||
rhoprime = seedbuf + SEEDBYTES; | |||
key = seedbuf + 2 * SEEDBYTES; | |||
/* Store rho, key */ | |||
memcpy(pk, rho, SEEDBYTES); | |||
memcpy(sk, rho, SEEDBYTES); | |||
memcpy(sk + SEEDBYTES, key, SEEDBYTES); | |||
/* Sample short vectors s1 and s2 */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); | |||
for (i = 0; i < L; ++i) { | |||
nonce = i; | |||
aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(&s1.vec[i], &aesctx); | |||
} | |||
for (i = 0; i < K; ++i) { | |||
nonce = L + i; | |||
aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(&s2.vec[i], &aesctx); | |||
} | |||
/* Pack secret vectors */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); | |||
} | |||
for (i = 0; i < K; i++) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); | |||
} | |||
/* Transform s1 */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, rho, 0); | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
for (unsigned int j = 0; j < L; j++) { | |||
nonce = (i << 8) + j; | |||
aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&row->vec[j]); | |||
} | |||
/* Compute inner-product */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&t1); | |||
/* Add error polynomial */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&t1, &t1, &s2.vec[i]); | |||
/* Round t and pack t1, t0 */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&t1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(&t1, &t0, &t1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); | |||
} | |||
/* Compute CRH(rho, t1) and store in secret key */ | |||
crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature | |||
* | |||
* Description: Computes signature. | |||
*              The body is a rejection loop: a candidate is built, checked
*              against several norm/hint bounds, and on any failed check
*              control jumps back to the rej label to try again.
*              The output buffer sig is also used as scratch space while
*              the challenge seed is being derived.
* | |||
* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) | |||
* - size_t *siglen: pointer to output length of signature | |||
*              (always set to PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES)
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
unsigned int i, n, pos; | |||
/* One buffer holding five consecutive fields (see the pointer setup below):
 * rho (SEEDBYTES), tr (CRHBYTES), key (SEEDBYTES), mu (CRHBYTES),
 * rhoprime (the remaining CRHBYTES). */
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; | |||
uint8_t *rho, *tr, *key, *mu, *rhoprime; | |||
/* Per-polynomial hint output, copied into the signature once accepted */
uint8_t hintbuf[N]; | |||
/* Hint section of sig: it follows the SEEDBYTES challenge seed and the
 * L packed z polynomials written at the end of this function. */
uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
/* Running counter; it is never reset, so it keeps increasing across
 * rejected attempts. */
uint64_t nonce = 0; | |||
polyvecl mat[K], s1, z; | |||
polyveck t0, s2, w1; | |||
poly c, tmp; | |||
/* y and w0 share storage: y is only read by the matrix-vector product,
 * after which the decompose call writes w0 into the same memory. */
union { | |||
polyvecl y; | |||
polyveck w0; | |||
} tmpv; | |||
shake256incctx state; | |||
rho = seedbuf; | |||
tr = rho + SEEDBYTES; | |||
key = tr + CRHBYTES; | |||
mu = key + SEEDBYTES; | |||
rhoprime = mu + CRHBYTES; | |||
PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); | |||
/* Compute CRH(tr, msg) */ | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, tr, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* key and mu are adjacent in seedbuf, so this single call hashes
 * key followed by mu into rhoprime. */
crh(rhoprime, key, SEEDBYTES + CRHBYTES); | |||
/* Expand matrix and transform vectors */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(mat, rho); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&s1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(&s2); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(&t0); | |||
/* Context is initialised once from rhoprime; the per-polynomial nonce is
 * loaded into aesctx.n inside the sampling loop. */
aes256ctr_ctx aesctx; | |||
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); | |||
/* Restart point for every rejected candidate */
rej: | |||
/* Sample intermediate vector y */ | |||
for (i = 0; i < L; ++i) { | |||
aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
nonce++; | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(&z.vec[i], &aesctx); | |||
} | |||
/* Matrix-vector product */ | |||
/* z keeps the untransformed sample; the NTT is applied to a copy */
tmpv.y = z; | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&tmpv.y); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(&w1); | |||
/* Decompose w and call the random oracle */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(&w1); | |||
/* From here on tmpv.y is dead: tmpv.w0 overwrites it.
 * NOTE(review): presumably w1 receives the high part and w0 the low part;
 * confirm against polyveck_decompose. */
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); | |||
/* sig is used as scratch: packed w1 goes to its start, is hashed together
 * with mu, and the first SEEDBYTES are then overwritten by the hash
 * output, which is the seed passed to poly_challenge. */
PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(sig, &w1); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(sig, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&c); | |||
/* Compute z, reject if it reveals secret */ | |||
for (i = 0; i < L; i++) { | |||
/* z[i] currently holds the sample y[i]; add the product of c and s1[i] */
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&z.vec[i]); | |||
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { | |||
goto rej; | |||
} | |||
} | |||
/* Zero hint vector in signature */ | |||
pos = 0; | |||
/* Only the first OMEGA bytes are cleared here; the K count bytes at
 * hint[OMEGA + i] are each assigned inside the loop. A rejection taken
 * part-way through the loop returns here on the next attempt, so any
 * partially written hints are cleared again. */
memset(hint, 0, OMEGA); | |||
for (i = 0; i < K; i++) { | |||
/* Check that subtracting cs2 does not change high bits of w and low bits | |||
* do not reveal secret information */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&tmpv.w0.vec[i]); | |||
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { | |||
goto rej; | |||
} | |||
/* Compute hints */ | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&tmp); | |||
if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&tmp, GAMMA2)) { | |||
goto rej; | |||
} | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); | |||
/* n is the number of entries make_hint placed in hintbuf */
n = PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); | |||
/* At most OMEGA hint entries fit across all K polynomials */
if (pos + n > OMEGA) { | |||
goto rej; | |||
} | |||
/* Store hints in signature */ | |||
memcpy(&hint[pos], hintbuf, n); | |||
/* hint[OMEGA + i] records the running total of entries after polynomial i */
hint[OMEGA + i] = pos = pos + n; | |||
} | |||
/* Pack z into signature */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); | |||
} | |||
*siglen = PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign | |||
* | |||
* Description: Compute signed message. | |||
* | |||
* Arguments: - uint8_t *sm: pointer to output signed message (allocated | |||
* array with PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + mlen bytes), | |||
* can be equal to m | |||
* - size_t *smlen: pointer to output length of signed | |||
* message | |||
* - const uint8_t *m: pointer to message to be signed | |||
* - size_t mlen: length of message | |||
* - const uint8_t *sk: pointer to bit-packed secret key | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { | |||
size_t i; | |||
for (i = 0; i < mlen; ++i) { | |||
sm[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; | |||
} | |||
PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, mlen, sk); | |||
*smlen += mlen; | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify | |||
* | |||
* Description: Verifies signature. | |||
*              The matrix is regenerated one row at a time, the packed w1
*              value is rebuilt row by row into a local buffer, and the
*              hash of (mu, w1) is compared with the leading SEEDBYTES
*              bytes of the signature.
* | |||
* Arguments: - const uint8_t *sig: pointer to input signature | |||
* - size_t siglen: length of signature | |||
*              (must equal PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES)
* - const uint8_t *m: pointer to message | |||
* - size_t mlen: length of message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signature could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { | |||
/* pos tracks how many hint entries have been consumed so far */
unsigned int i, j, pos = 0; | |||
/* PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack writes additional 14 bytes */ | |||
ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; | |||
uint8_t mu[CRHBYTES]; | |||
/* Hint section of sig: after the challenge seed and the L packed z polynomials */
const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; | |||
uint64_t nonce; | |||
aes256ctr_ctx aesctx; | |||
/* Storage for a single matrix row, refilled for every i below */
polyvecl rowbuf[1]; | |||
polyvecl *row = rowbuf; | |||
polyvecl z; | |||
poly c, w1, h; | |||
shake256incctx state; | |||
/* Only signatures of exactly the fixed length are accepted */
if (siglen != PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) { | |||
return -1; | |||
} | |||
/* Compute CRH(CRH(rho, t1), msg) */ | |||
/* mu first holds the hash of the public key, then is overwritten with
 * the hash of that value followed by the message. */
crh(mu, pk, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES); | |||
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, m, mlen); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(mu, CRHBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* Expand PQCLEAN_DILITHIUM2AES_AVX2_challenge */ | |||
/* The challenge seed is read from the start of sig */
PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(&c, sig); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&c); | |||
/* Unpack z; shortness follows from unpacking */ | |||
for (i = 0; i < L; i++) { | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&z.vec[i]); | |||
} | |||
/* NOTE(review): pk itself is passed as the seed; presumably only its
 * leading seed bytes (rho) are read -- confirm in aes256ctr_init. */
PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, pk, 0); | |||
for (i = 0; i < K; i++) { | |||
/* Expand matrix row */ | |||
for (j = 0; j < L; j++) { | |||
/* Nonce encodes the matrix position: row index shifted left by 8, plus column index */
nonce = (i << 8) + j; | |||
aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&row->vec[j]); | |||
} | |||
/* Compute i-th row of Az - c2^Dt1 */ | |||
/* h is used as a temporary for the shifted, transformed t1 polynomial */
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(&h); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&h); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&h, &c, &h); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&w1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&w1); | |||
/* Get hint polynomial and reconstruct w1 */ | |||
/* h is now reused as the hint polynomial.
 * NOTE(review): clearing through h.vec and writing through h.coeffs
 * assumes both members alias the same storage; the poly definition is
 * not visible here -- confirm in poly.h. */
memset(h.vec, 0, sizeof(poly)); | |||
/* hint[OMEGA + i] is the cumulative entry count after row i: it must
 * not decrease and must not exceed OMEGA. */
if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { | |||
return -1; | |||
} | |||
for (j = pos; j < hint[OMEGA + i]; ++j) { | |||
/* Coefficients are ordered for strong unforgeability */ | |||
/* Indices within one row must be strictly increasing */
if (j > pos && hint[j] <= hint[j - 1]) { | |||
return -1; | |||
} | |||
h.coeffs[hint[j]] = 1; | |||
} | |||
pos = hint[OMEGA + i]; | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&w1); | |||
PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(&w1, &w1, &h); | |||
PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); | |||
} | |||
/* Extra indices are zero for strong unforgeability */ | |||
for (j = pos; j < OMEGA; ++j) { | |||
if (hint[j]) { | |||
return -1; | |||
} | |||
} | |||
/* Call random oracle and verify PQCLEAN_DILITHIUM2AES_AVX2_challenge */ | |||
/* buf is reused: the hash output overwrites the start of the packed w1 data */
shake256_inc_init(&state); | |||
shake256_inc_absorb(&state, mu, CRHBYTES); | |||
shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); | |||
shake256_inc_finalize(&state); | |||
shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); | |||
shake256_inc_ctx_release(&state); | |||
/* Recomputed seed must match the first SEEDBYTES bytes of the signature */
for (i = 0; i < SEEDBYTES; ++i) { | |||
if (buf.coeffs[i] != sig[i]) { | |||
return -1; | |||
} | |||
} | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open | |||
* | |||
* Description: Verify signed message. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output message (allocated | |||
* array with smlen bytes), can be equal to sm | |||
* - size_t *mlen: pointer to output length of message | |||
* - const uint8_t *sm: pointer to signed message | |||
* - size_t smlen: length of signed message | |||
* - const uint8_t *pk: pointer to bit-packed public key | |||
* | |||
* Returns 0 if signed message could be verified correctly and -1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { | |||
size_t i; | |||
if (smlen < PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) { | |||
goto badsig; | |||
} | |||
*mlen = smlen - PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES; | |||
if (PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, *mlen, pk)) { | |||
goto badsig; | |||
} else { | |||
/* All good, copy msg, return 0 */ | |||
for (i = 0; i < *mlen; ++i) { | |||
m[i] = sm[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + i]; | |||
} | |||
return 0; | |||
} | |||
badsig: | |||
/* Signature verification failed */ | |||
*mlen = -1; | |||
for (i = 0; i < smlen; ++i) { | |||
m[i] = 0; | |||
} | |||
return -1; | |||
} |
@@ -0,0 +1,29 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_SIGN_H
#define PQCLEAN_DILITHIUM2AES_AVX2_SIGN_H

/* Declarations for the Dilithium2-AES (AVX2) signing entry points. */

#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stddef.h>
#include <stdint.h>

/* Fills the polynomial c from a seed of SEEDBYTES bytes.
 * NOTE(review): the signing code calls
 * PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge rather than this name; check
 * whether this declaration still has a definition. */
void PQCLEAN_DILITHIUM2AES_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]);

/* Writes a bit-packed public key to pk and secret key to sk. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

/* Writes a detached signature of m (mlen bytes) to sig and its length to *siglen. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Writes signature followed by message to sm and the total length to *smlen. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Checks a detached signature; returns 0 when it verifies and -1 otherwise. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);

/* Checks a signed message and, on success, copies the message to m and its
 * length to *mlen; returns 0 on success and -1 otherwise. */
int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk);

#endif
@@ -0,0 +1,25 @@ | |||
#ifndef PQCLEAN_DILITHIUM2AES_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2AES_AVX2_SYMMETRIC_H

/* Maps the generic stream128/stream256/crh names onto the AES-256-CTR and
 * SHAKE256 primitives used by this parameter set. */

#include "aes256ctr.h"
#include "fips202.h"
#include "params.h"
#include <stdint.h>

/* Both stream flavours use the same AES-256-CTR context type */
typedef aes256ctr_ctx stream128_state;
typedef aes256ctr_ctx stream256_state;

/* ... and therefore the same block size */
#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES
#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES

/* Hash INBYTES bytes at IN to CRHBYTES bytes at OUT using SHAKE256 */
#define crh(OUT, IN, INBYTES) \
    shake256(OUT, CRHBYTES, IN, INBYTES)

/* stream128 interface */
#define stream128_init(STATE, SEED, NONCE) \
    PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
    PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
/* Expands to nothing */
#define stream128_release(STATE)

/* stream256 interface (same backing functions as stream128) */
#define stream256_init(STATE, SEED, NONCE) \
    PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
    PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
/* Expands to nothing */
#define stream256_release(STATE)

#endif
@@ -0,0 +1,5 @@ | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
comments on top of the respective files. |
@@ -0,0 +1,19 @@ | |||
# This Makefile can be used with GNU Make or BSD Make

# Static library produced by this directory
LIB=libdilithium2aes_clean.a
# Every object is rebuilt when any of these headers changes
HEADERS=aes256ctr.h api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h
OBJECTS=aes256ctr.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-aes.o

# EXTRAFLAGS lets the caller append flags without replacing the defaults
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)

# Default target: build the library
all: $(LIB)

# all and clean name commands, not files. Declaring them phony keeps them
# working even if a file called "all" or "clean" exists in this directory.
.PHONY: all clean

%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
	$(AR) -r $@ $(OBJECTS)

# Remove only the files this Makefile creates
clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)