diff --git a/crypto_sign/dilithium2/avx2/invntt.s b/crypto_sign/dilithium2/avx2/invntt.s index 7ae2b4e3..3a943f62 100644 --- a/crypto_sign/dilithium2/avx2/invntt.s +++ b/crypto_sign/dilithium2/avx2/invntt.s @@ -1,6 +1,6 @@ .include "shuffle.inc" -.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 +.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 vpaddd %ymm2,%ymm\l0,%ymm12 vpaddd %ymm2,%ymm\l1,%ymm13 vpaddd %ymm2,%ymm\l2,%ymm14 @@ -121,7 +121,7 @@ level2: #PQCLEAN_DILITHIUM2_AVX2_zetas vpmovzxdq 96(%rdx),%ymm3 -butterfly 4,5,6,7,8,9,10,11 3,3 +butterfly 4,5,6,7,8,9,10,11,3,3 #shuffle shuffle4 4,5,3,5 @@ -135,7 +135,7 @@ vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 116(%rdx),%ymm15 vpblendd $0xF0,%ymm15,%ymm14,%ymm10 -butterfly 3,4,6,8,5,7,9,11 10,10 +butterfly 3,4,6,8,5,7,9,11,10,10 #shuffle shuffle8 3,4,10,4 @@ -147,7 +147,7 @@ level4: #PQCLEAN_DILITHIUM2_AVX2_zetas vpbroadcastd 120(%rdx),%ymm9 -butterfly 10,3,6,5,4,8,7,11 9,9 +butterfly 10,3,6,5,4,8,7,11,9,9 #store vmovdqa %ymm10,(%rdi) @@ -233,7 +233,7 @@ level7: #PQCLEAN_DILITHIUM2_AVX2_zetas vpbroadcastd 24(%rdx),%ymm3 -butterfly 4,5,6,7,8,9,10,11 3,3 +butterfly 4,5,6,7,8,9,10,11,3,3 #consts vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3 diff --git a/crypto_sign/dilithium2/avx2/ntt.h b/crypto_sign/dilithium2/avx2/ntt.h index 53837202..681f6e3f 100644 --- a/crypto_sign/dilithium2/avx2/ntt.h +++ b/crypto_sign/dilithium2/avx2/ntt.h @@ -6,21 +6,31 @@ #include "nttconsts.h" #include "params.h" -void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas); -void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas); +void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx( + uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas +); +void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx( + uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas +); -void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas_inv); -void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas_inv); +void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx( + uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas_inv +); +void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx( + uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas_inv +); -void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); -void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx( + uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx( + uint32_t *c, const uint32_t *a, const uint32_t *b); #endif diff --git a/crypto_sign/dilithium2/avx2/ntt.s b/crypto_sign/dilithium2/avx2/ntt.s index e69a5f89..ed329dd3 100644 --- a/crypto_sign/dilithium2/avx2/ntt.s +++ b/crypto_sign/dilithium2/avx2/ntt.s @@ -1,6 +1,6 @@ .include "shuffle.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 #mul vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 @@ -68,7 +68,7 @@ level1: vpbroadcastd 4(%rdx),%ymm12 vpbroadcastd 8(%rdx),%ymm13 -butterfly 4,5,8,9,6,7,10,11 12,12,13,13 +butterfly 4,5,8,9,6,7,10,11,12,12,13,13 level2: #PQCLEAN_DILITHIUM2_AVX2_zetas @@ -77,7 +77,7 @@ vpbroadcastd 16(%rdx),%ymm13 vpbroadcastd 20(%rdx),%ymm14 vpbroadcastd 24(%rdx),%ymm15 -butterfly 4,6,8,10,5,7,9,11 12,13,14,15 +butterfly 4,6,8,10,5,7,9,11,12,13,14,15 #store vmovdqa %ymm4,(%rdi) @@ -125,7 +125,7 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly 3,8,4,9,5,10,6,11 12,12,12,12 +butterfly 3,8,4,9,5,10,6,11,12,12,12,12 level5: #PQCLEAN_DILITHIUM2_AVX2_zetas @@ -136,14 +136,14 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly 7,5,3,10,8,6,4,11 12,12,12,12 +butterfly 7,5,3,10,8,6,4,11,12,12,12,12 level6: #PQCLEAN_DILITHIUM2_AVX2_zetas vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 44(%rdx),%ymm13 -butterfly 7,5,8,6,3,10,4,11 12,12,13,13 +butterfly 7,5,8,6,3,10,4,11,12,12,13,13 level7: #PQCLEAN_DILITHIUM2_AVX2_zetas @@ -152,7 +152,7 @@ vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 92(%rdx),%ymm14 vpmovzxdq 108(%rdx),%ymm15 -butterfly 7,3,8,4,5,10,6,11 12,13,14,15 +butterfly 7,3,8,4,5,10,6,11,12,13,14,15 #store vpsllq $32,%ymm5,%ymm5 diff --git a/crypto_sign/dilithium3/avx2/invntt.s b/crypto_sign/dilithium3/avx2/invntt.s index 42fe13d6..a522abe1 100644 --- a/crypto_sign/dilithium3/avx2/invntt.s +++ b/crypto_sign/dilithium3/avx2/invntt.s @@ -1,6 +1,6 @@ .include "shuffle.inc" -.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 +.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 vpaddd %ymm2,%ymm\l0,%ymm12 vpaddd %ymm2,%ymm\l1,%ymm13 vpaddd %ymm2,%ymm\l2,%ymm14 @@ -121,7 +121,7 @@ level2: #PQCLEAN_DILITHIUM3_AVX2_zetas vpmovzxdq 96(%rdx),%ymm3 -butterfly 4,5,6,7,8,9,10,11 3,3 +butterfly 4,5,6,7,8,9,10,11,3,3 #shuffle shuffle4 4,5,3,5 @@ -135,7 +135,7 @@ vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 116(%rdx),%ymm15 vpblendd $0xF0,%ymm15,%ymm14,%ymm10 -butterfly 3,4,6,8,5,7,9,11 10,10 +butterfly 3,4,6,8,5,7,9,11,10,10 #shuffle shuffle8 3,4,10,4 @@ -147,7 +147,7 @@ level4: #PQCLEAN_DILITHIUM3_AVX2_zetas vpbroadcastd 120(%rdx),%ymm9 -butterfly 10,3,6,5,4,8,7,11 9,9 +butterfly 10,3,6,5,4,8,7,11,9,9 #store vmovdqa %ymm10,(%rdi) @@ -233,7 +233,7 @@ level7: #PQCLEAN_DILITHIUM3_AVX2_zetas vpbroadcastd 24(%rdx),%ymm3 -butterfly 4,5,6,7,8,9,10,11 3,3 +butterfly 4,5,6,7,8,9,10,11,3,3 #consts vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3 diff --git a/crypto_sign/dilithium3/avx2/ntt.h b/crypto_sign/dilithium3/avx2/ntt.h index a5474dc6..cf0a0a58 100644 --- a/crypto_sign/dilithium3/avx2/ntt.h +++ b/crypto_sign/dilithium3/avx2/ntt.h @@ -6,21 +6,31 @@ #include "nttconsts.h" #include "params.h" -void PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx(uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas); -void PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx(uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas); +void PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx( + uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas +); +void PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx( + uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas +); -void PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx(uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas_inv); -void PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx(uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas_inv); +void PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx( + uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas_inv +); +void PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx( + uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas_inv +); -void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); -void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx( + uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx( + uint32_t *c, const uint32_t *a, const uint32_t *b); #endif diff --git a/crypto_sign/dilithium3/avx2/ntt.s b/crypto_sign/dilithium3/avx2/ntt.s index 9fb961c6..6a17915d 100644 --- a/crypto_sign/dilithium3/avx2/ntt.s +++ b/crypto_sign/dilithium3/avx2/ntt.s @@ -1,6 +1,6 @@ .include "shuffle.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 #mul vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 @@ -68,7 +68,7 @@ level1: vpbroadcastd 4(%rdx),%ymm12 vpbroadcastd 8(%rdx),%ymm13 -butterfly 4,5,8,9,6,7,10,11 12,12,13,13 +butterfly 4,5,8,9,6,7,10,11,12,12,13,13 level2: #PQCLEAN_DILITHIUM3_AVX2_zetas @@ -77,7 +77,7 @@ vpbroadcastd 16(%rdx),%ymm13 vpbroadcastd 20(%rdx),%ymm14 vpbroadcastd 24(%rdx),%ymm15 -butterfly 4,6,8,10,5,7,9,11 12,13,14,15 +butterfly 4,6,8,10,5,7,9,11,12,13,14,15 #store vmovdqa %ymm4,(%rdi) @@ -125,7 +125,7 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly 3,8,4,9,5,10,6,11 12,12,12,12 +butterfly 3,8,4,9,5,10,6,11,12,12,12,12 level5: #PQCLEAN_DILITHIUM3_AVX2_zetas @@ -136,14 +136,14 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly 7,5,3,10,8,6,4,11 12,12,12,12 +butterfly 7,5,3,10,8,6,4,11,12,12,12,12 level6: #PQCLEAN_DILITHIUM3_AVX2_zetas vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 44(%rdx),%ymm13 -butterfly 7,5,8,6,3,10,4,11 12,12,13,13 +butterfly 7,5,8,6,3,10,4,11,12,12,13,13 level7: #PQCLEAN_DILITHIUM3_AVX2_zetas @@ -152,7 +152,7 @@ vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 92(%rdx),%ymm14 vpmovzxdq 108(%rdx),%ymm15 -butterfly 7,3,8,4,5,10,6,11 12,13,14,15 +butterfly 7,3,8,4,5,10,6,11,12,13,14,15 #store vpsllq $32,%ymm5,%ymm5 diff --git a/crypto_sign/dilithium4/avx2/invntt.s b/crypto_sign/dilithium4/avx2/invntt.s index e8c4acb0..8f69a004 100644 --- a/crypto_sign/dilithium4/avx2/invntt.s +++ b/crypto_sign/dilithium4/avx2/invntt.s @@ -1,6 +1,6 @@ .include "shuffle.inc" -.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 +.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 vpaddd %ymm2,%ymm\l0,%ymm12 vpaddd %ymm2,%ymm\l1,%ymm13 vpaddd %ymm2,%ymm\l2,%ymm14 @@ -121,7 +121,7 @@ level2: #PQCLEAN_DILITHIUM4_AVX2_zetas vpmovzxdq 96(%rdx),%ymm3 -butterfly 4,5,6,7,8,9,10,11 3,3 +butterfly 4,5,6,7,8,9,10,11,3,3 #shuffle shuffle4 4,5,3,5 @@ -135,7 +135,7 @@ vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 116(%rdx),%ymm15 vpblendd $0xF0,%ymm15,%ymm14,%ymm10 -butterfly 3,4,6,8,5,7,9,11 10,10 +butterfly 3,4,6,8,5,7,9,11,10,10 #shuffle shuffle8 3,4,10,4 @@ -147,7 +147,7 @@ level4: #PQCLEAN_DILITHIUM4_AVX2_zetas vpbroadcastd 120(%rdx),%ymm9 -butterfly 10,3,6,5,4,8,7,11 9,9 +butterfly 10,3,6,5,4,8,7,11,9,9 #store vmovdqa %ymm10,(%rdi) @@ -233,7 +233,7 @@ level7: #PQCLEAN_DILITHIUM4_AVX2_zetas vpbroadcastd 24(%rdx),%ymm3 -butterfly 4,5,6,7,8,9,10,11 3,3 +butterfly 4,5,6,7,8,9,10,11,3,3 #consts vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xdiv(%rip),%ymm3 diff --git a/crypto_sign/dilithium4/avx2/ntt.h b/crypto_sign/dilithium4/avx2/ntt.h index 39d02bef..e337cda9 100644 --- a/crypto_sign/dilithium4/avx2/ntt.h +++ b/crypto_sign/dilithium4/avx2/ntt.h @@ -6,21 +6,31 @@ #include "nttconsts.h" #include "params.h" -void PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx(uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas); -void PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx(uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas); +void PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx( + uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas +); +void PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx( + uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas +); -void PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx(uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas_inv); -void PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx(uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas_inv); +void PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx( + uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas_inv +); +void PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx( + uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas_inv +); -void PQCLEAN_DILITHIUM4_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); -void PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM4_AVX2_pointwise_avx( + uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx( + uint32_t *c, const uint32_t *a, const uint32_t *b); #endif diff --git a/crypto_sign/dilithium4/avx2/ntt.s b/crypto_sign/dilithium4/avx2/ntt.s index 12e8f513..692398c5 100644 --- a/crypto_sign/dilithium4/avx2/ntt.s +++ b/crypto_sign/dilithium4/avx2/ntt.s @@ -1,6 +1,6 @@ .include "shuffle.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 #mul vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 @@ -68,7 +68,7 @@ level1: vpbroadcastd 4(%rdx),%ymm12 vpbroadcastd 8(%rdx),%ymm13 -butterfly 4,5,8,9,6,7,10,11 12,12,13,13 +butterfly 4,5,8,9,6,7,10,11,12,12,13,13 level2: #PQCLEAN_DILITHIUM4_AVX2_zetas @@ -77,7 +77,7 @@ vpbroadcastd 16(%rdx),%ymm13 vpbroadcastd 20(%rdx),%ymm14 vpbroadcastd 24(%rdx),%ymm15 -butterfly 4,6,8,10,5,7,9,11 12,13,14,15 +butterfly 4,6,8,10,5,7,9,11,12,13,14,15 #store vmovdqa %ymm4,(%rdi) @@ -125,7 +125,7 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly 3,8,4,9,5,10,6,11 12,12,12,12 +butterfly 3,8,4,9,5,10,6,11,12,12,12,12 level5: #PQCLEAN_DILITHIUM4_AVX2_zetas @@ -136,14 +136,14 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly 7,5,3,10,8,6,4,11 12,12,12,12 +butterfly 7,5,3,10,8,6,4,11,12,12,12,12 level6: #PQCLEAN_DILITHIUM4_AVX2_zetas vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 44(%rdx),%ymm13 -butterfly 7,5,8,6,3,10,4,11 12,12,13,13 +butterfly 7,5,8,6,3,10,4,11,12,12,13,13 level7: #PQCLEAN_DILITHIUM4_AVX2_zetas @@ -152,7 +152,7 @@ vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 92(%rdx),%ymm14 vpmovzxdq 108(%rdx),%ymm15 -butterfly 7,3,8,4,5,10,6,11 12,13,14,15 +butterfly 7,3,8,4,5,10,6,11,12,13,14,15 #store vpsllq $32,%ymm5,%ymm5