@@ -1,6 +1,6 @@ | |||
.include "shuffle.inc" | |||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 | |||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | |||
vpaddd %ymm2,%ymm\l0,%ymm12 | |||
vpaddd %ymm2,%ymm\l1,%ymm13 | |||
vpaddd %ymm2,%ymm\l2,%ymm14 | |||
@@ -121,7 +121,7 @@ level2: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
vpmovzxdq 96(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 3,3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#shuffle | |||
shuffle4 4,5,3,5 | |||
@@ -135,7 +135,7 @@ vpbroadcastd 112(%rdx),%ymm14 | |||
vpbroadcastd 116(%rdx),%ymm15 | |||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | |||
butterfly 3,4,6,8,5,7,9,11 10,10 | |||
butterfly 3,4,6,8,5,7,9,11,10,10 | |||
#shuffle | |||
shuffle8 3,4,10,4 | |||
@@ -147,7 +147,7 @@ level4: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
vpbroadcastd 120(%rdx),%ymm9 | |||
butterfly 10,3,6,5,4,8,7,11 9,9 | |||
butterfly 10,3,6,5,4,8,7,11,9,9 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
@@ -233,7 +233,7 @@ level7: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
vpbroadcastd 24(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 3,3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#consts | |||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3 | |||
@@ -6,21 +6,31 @@ | |||
#include "nttconsts.h" | |||
#include "params.h" | |||
/*
 * Prototypes for the two ntt_levels* routines (declarations only; no body
 * is visible in this header).
 *
 *   ntt_levels0t2_avx(tmp, a, zetas): tmp is the only non-const pointer.
 *   ntt_levels3t8_avx(a, tmp, zetas): a is the only non-const pointer.
 *
 * NOTE(review): the const placement and the names suggest a two-stage
 * transform that hands data from the first routine to the second through
 * tmp (64-bit elements) -- confirm against the assembly sources.
 *
 * Each prototype appears twice below: first in a compact layout, then with
 * one parameter per line. The two spellings are token-for-token identical.
 */
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas); | |||
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas); | |||
/* Same two prototypes, one parameter per line. */
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas | |||
); | |||
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas | |||
); | |||
/*
 * Prototypes for the two invntt_levels* routines (declarations only; no
 * body is visible in this header).
 *
 *   invntt_levels0t4_avx(tmp, a, zetas_inv): tmp is the only non-const pointer.
 *   invntt_levels5t7_avx(a, tmp, zetas_inv): a is the only non-const pointer.
 *
 * NOTE(review): presumably the inverse counterpart of the ntt_levels*
 * pair, again staged through tmp -- confirm against the assembly sources.
 *
 * Each prototype appears twice below: first in a compact layout, then with
 * one parameter per line. The two spellings are token-for-token identical.
 */
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas_inv); | |||
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas_inv); | |||
/* Same two prototypes, one parameter per line. */
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas_inv | |||
); | |||
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas_inv | |||
); | |||
/*
 * Prototypes for pointwise_avx and pointwise_acc_avx (declarations only).
 * Both take (c, a, b) as uint32_t pointers; c is the only non-const one.
 *
 * NOTE(review): the "_acc" suffix presumably means an accumulating variant;
 * how the two routines differ is not visible here -- confirm against the
 * assembly sources.
 *
 * Each prototype appears twice: on a single line, then with the parameter
 * list wrapped onto a second line. The tokens are identical.
 */
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
/* Same two prototypes with the parameter list wrapped. */
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
#endif |
@@ -1,6 +1,6 @@ | |||
.include "shuffle.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | |||
#mul | |||
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 | |||
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 | |||
@@ -68,7 +68,7 @@ level1: | |||
vpbroadcastd 4(%rdx),%ymm12 | |||
vpbroadcastd 8(%rdx),%ymm13 | |||
butterfly 4,5,8,9,6,7,10,11 12,12,13,13 | |||
butterfly 4,5,8,9,6,7,10,11,12,12,13,13 | |||
level2: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
@@ -77,7 +77,7 @@ vpbroadcastd 16(%rdx),%ymm13 | |||
vpbroadcastd 20(%rdx),%ymm14 | |||
vpbroadcastd 24(%rdx),%ymm15 | |||
butterfly 4,6,8,10,5,7,9,11 12,13,14,15 | |||
butterfly 4,6,8,10,5,7,9,11,12,13,14,15 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
@@ -125,7 +125,7 @@ shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly 3,8,4,9,5,10,6,11 12,12,12,12 | |||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | |||
level5: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
@@ -136,14 +136,14 @@ shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly 7,5,3,10,8,6,4,11 12,12,12,12 | |||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | |||
level6: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
vpmovzxdq 28(%rdx),%ymm12 | |||
vpmovzxdq 44(%rdx),%ymm13 | |||
butterfly 7,5,8,6,3,10,4,11 12,12,13,13 | |||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | |||
level7: | |||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||
@@ -152,7 +152,7 @@ vpmovzxdq 76(%rdx),%ymm13 | |||
vpmovzxdq 92(%rdx),%ymm14 | |||
vpmovzxdq 108(%rdx),%ymm15 | |||
butterfly 7,3,8,4,5,10,6,11 12,13,14,15 | |||
butterfly 7,3,8,4,5,10,6,11,12,13,14,15 | |||
#store | |||
vpsllq $32,%ymm5,%ymm5 | |||
@@ -1,6 +1,6 @@ | |||
.include "shuffle.inc" | |||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 | |||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | |||
vpaddd %ymm2,%ymm\l0,%ymm12 | |||
vpaddd %ymm2,%ymm\l1,%ymm13 | |||
vpaddd %ymm2,%ymm\l2,%ymm14 | |||
@@ -121,7 +121,7 @@ level2: | |||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||
vpmovzxdq 96(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 3,3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#shuffle | |||
shuffle4 4,5,3,5 | |||
@@ -135,7 +135,7 @@ vpbroadcastd 112(%rdx),%ymm14 | |||
vpbroadcastd 116(%rdx),%ymm15 | |||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | |||
butterfly 3,4,6,8,5,7,9,11 10,10 | |||
butterfly 3,4,6,8,5,7,9,11,10,10 | |||
#shuffle | |||
shuffle8 3,4,10,4 | |||
@@ -147,7 +147,7 @@ level4: | |||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||
vpbroadcastd 120(%rdx),%ymm9 | |||
butterfly 10,3,6,5,4,8,7,11 9,9 | |||
butterfly 10,3,6,5,4,8,7,11,9,9 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
@@ -233,7 +233,7 @@ level7: | |||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||
vpbroadcastd 24(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 3,3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#consts | |||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3 | |||
@@ -6,21 +6,31 @@ | |||
#include "nttconsts.h" | |||
#include "params.h" | |||
/*
 * Prototypes for the two ntt_levels* routines (declarations only; no body
 * is visible in this header).
 *
 *   ntt_levels0t2_avx(tmp, a, zetas): tmp is the only non-const pointer.
 *   ntt_levels3t8_avx(a, tmp, zetas): a is the only non-const pointer.
 *
 * NOTE(review): the const placement and the names suggest a two-stage
 * transform that hands data from the first routine to the second through
 * tmp (64-bit elements) -- confirm against the assembly sources.
 *
 * Each prototype appears twice below: first in a compact layout, then with
 * one parameter per line. The two spellings are token-for-token identical.
 */
void PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx(uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas); | |||
void PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx(uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas); | |||
/* Same two prototypes, one parameter per line. */
void PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas | |||
); | |||
void PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas | |||
); | |||
/*
 * Prototypes for the two invntt_levels* routines (declarations only; no
 * body is visible in this header).
 *
 *   invntt_levels0t4_avx(tmp, a, zetas_inv): tmp is the only non-const pointer.
 *   invntt_levels5t7_avx(a, tmp, zetas_inv): a is the only non-const pointer.
 *
 * NOTE(review): presumably the inverse counterpart of the ntt_levels*
 * pair, again staged through tmp -- confirm against the assembly sources.
 *
 * Each prototype appears twice below: first in a compact layout, then with
 * one parameter per line. The two spellings are token-for-token identical.
 */
void PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx(uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas_inv); | |||
void PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx(uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas_inv); | |||
/* Same two prototypes, one parameter per line. */
void PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas_inv | |||
); | |||
void PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas_inv | |||
); | |||
/*
 * Prototypes for pointwise_avx and pointwise_acc_avx (declarations only).
 * Both take (c, a, b) as uint32_t pointers; c is the only non-const one.
 *
 * NOTE(review): the "_acc" suffix presumably means an accumulating variant;
 * how the two routines differ is not visible here -- confirm against the
 * assembly sources.
 *
 * Each prototype appears twice: on a single line, then with the parameter
 * list wrapped onto a second line. The tokens are identical.
 */
void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
/* Same two prototypes with the parameter list wrapped. */
void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
#endif |
@@ -1,6 +1,6 @@ | |||
.include "shuffle.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | |||
#mul | |||
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 | |||
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 | |||
@@ -68,7 +68,7 @@ level1: | |||
vpbroadcastd 4(%rdx),%ymm12 | |||
vpbroadcastd 8(%rdx),%ymm13 | |||
butterfly 4,5,8,9,6,7,10,11 12,12,13,13 | |||
butterfly 4,5,8,9,6,7,10,11,12,12,13,13 | |||
level2: | |||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||
@@ -77,7 +77,7 @@ vpbroadcastd 16(%rdx),%ymm13 | |||
vpbroadcastd 20(%rdx),%ymm14 | |||
vpbroadcastd 24(%rdx),%ymm15 | |||
butterfly 4,6,8,10,5,7,9,11 12,13,14,15 | |||
butterfly 4,6,8,10,5,7,9,11,12,13,14,15 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
@@ -125,7 +125,7 @@ shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly 3,8,4,9,5,10,6,11 12,12,12,12 | |||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | |||
level5: | |||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||
@@ -136,14 +136,14 @@ shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly 7,5,3,10,8,6,4,11 12,12,12,12 | |||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | |||
level6: | |||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||
vpmovzxdq 28(%rdx),%ymm12 | |||
vpmovzxdq 44(%rdx),%ymm13 | |||
butterfly 7,5,8,6,3,10,4,11 12,12,13,13 | |||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | |||
level7: | |||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||
@@ -152,7 +152,7 @@ vpmovzxdq 76(%rdx),%ymm13 | |||
vpmovzxdq 92(%rdx),%ymm14 | |||
vpmovzxdq 108(%rdx),%ymm15 | |||
butterfly 7,3,8,4,5,10,6,11 12,13,14,15 | |||
butterfly 7,3,8,4,5,10,6,11,12,13,14,15 | |||
#store | |||
vpsllq $32,%ymm5,%ymm5 | |||
@@ -1,6 +1,6 @@ | |||
.include "shuffle.inc" | |||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 | |||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | |||
vpaddd %ymm2,%ymm\l0,%ymm12 | |||
vpaddd %ymm2,%ymm\l1,%ymm13 | |||
vpaddd %ymm2,%ymm\l2,%ymm14 | |||
@@ -121,7 +121,7 @@ level2: | |||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||
vpmovzxdq 96(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 3,3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#shuffle | |||
shuffle4 4,5,3,5 | |||
@@ -135,7 +135,7 @@ vpbroadcastd 112(%rdx),%ymm14 | |||
vpbroadcastd 116(%rdx),%ymm15 | |||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | |||
butterfly 3,4,6,8,5,7,9,11 10,10 | |||
butterfly 3,4,6,8,5,7,9,11,10,10 | |||
#shuffle | |||
shuffle8 3,4,10,4 | |||
@@ -147,7 +147,7 @@ level4: | |||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||
vpbroadcastd 120(%rdx),%ymm9 | |||
butterfly 10,3,6,5,4,8,7,11 9,9 | |||
butterfly 10,3,6,5,4,8,7,11,9,9 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
@@ -233,7 +233,7 @@ level7: | |||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||
vpbroadcastd 24(%rdx),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 3,3 | |||
butterfly 4,5,6,7,8,9,10,11,3,3 | |||
#consts | |||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xdiv(%rip),%ymm3 | |||
@@ -6,21 +6,31 @@ | |||
#include "nttconsts.h" | |||
#include "params.h" | |||
/*
 * Prototypes for the two ntt_levels* routines (declarations only; no body
 * is visible in this header).
 *
 *   ntt_levels0t2_avx(tmp, a, zetas): tmp is the only non-const pointer.
 *   ntt_levels3t8_avx(a, tmp, zetas): a is the only non-const pointer.
 *
 * NOTE(review): the const placement and the names suggest a two-stage
 * transform that hands data from the first routine to the second through
 * tmp (64-bit elements) -- confirm against the assembly sources.
 *
 * Each prototype appears twice below: first in a compact layout, then with
 * one parameter per line. The two spellings are token-for-token identical.
 */
void PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx(uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas); | |||
void PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx(uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas); | |||
/* Same two prototypes, one parameter per line. */
void PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas | |||
); | |||
void PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas | |||
); | |||
/*
 * Prototypes for the two invntt_levels* routines (declarations only; no
 * body is visible in this header).
 *
 *   invntt_levels0t4_avx(tmp, a, zetas_inv): tmp is the only non-const pointer.
 *   invntt_levels5t7_avx(a, tmp, zetas_inv): a is the only non-const pointer.
 *
 * NOTE(review): presumably the inverse counterpart of the ntt_levels*
 * pair, again staged through tmp -- confirm against the assembly sources.
 *
 * Each prototype appears twice below: first in a compact layout, then with
 * one parameter per line. The two spellings are token-for-token identical.
 */
void PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx(uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas_inv); | |||
void PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx(uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas_inv); | |||
/* Same two prototypes, one parameter per line. */
void PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx( | |||
uint64_t *tmp, | |||
const uint32_t *a, | |||
const uint32_t *zetas_inv | |||
); | |||
void PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx( | |||
uint32_t *a, | |||
const uint64_t *tmp, | |||
const uint32_t *zetas_inv | |||
); | |||
/*
 * Prototypes for pointwise_avx and pointwise_acc_avx (declarations only).
 * Both take (c, a, b) as uint32_t pointers; c is the only non-const one.
 *
 * NOTE(review): the "_acc" suffix presumably means an accumulating variant;
 * how the two routines differ is not visible here -- confirm against the
 * assembly sources.
 *
 * Each prototype appears twice: on a single line, then with the parameter
 * list wrapped onto a second line. The tokens are identical.
 */
void PQCLEAN_DILITHIUM4_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
/* Same two prototypes with the parameter list wrapped. */
void PQCLEAN_DILITHIUM4_AVX2_pointwise_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
void PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx( | |||
uint32_t *c, const uint32_t *a, const uint32_t *b); | |||
#endif |
@@ -1,6 +1,6 @@ | |||
.include "shuffle.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | |||
#mul | |||
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 | |||
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 | |||
@@ -68,7 +68,7 @@ level1: | |||
vpbroadcastd 4(%rdx),%ymm12 | |||
vpbroadcastd 8(%rdx),%ymm13 | |||
butterfly 4,5,8,9,6,7,10,11 12,12,13,13 | |||
butterfly 4,5,8,9,6,7,10,11,12,12,13,13 | |||
level2: | |||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||
@@ -77,7 +77,7 @@ vpbroadcastd 16(%rdx),%ymm13 | |||
vpbroadcastd 20(%rdx),%ymm14 | |||
vpbroadcastd 24(%rdx),%ymm15 | |||
butterfly 4,6,8,10,5,7,9,11 12,13,14,15 | |||
butterfly 4,6,8,10,5,7,9,11,12,13,14,15 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
@@ -125,7 +125,7 @@ shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly 3,8,4,9,5,10,6,11 12,12,12,12 | |||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | |||
level5: | |||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||
@@ -136,14 +136,14 @@ shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly 7,5,3,10,8,6,4,11 12,12,12,12 | |||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | |||
level6: | |||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||
vpmovzxdq 28(%rdx),%ymm12 | |||
vpmovzxdq 44(%rdx),%ymm13 | |||
butterfly 7,5,8,6,3,10,4,11 12,12,13,13 | |||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | |||
level7: | |||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||
@@ -152,7 +152,7 @@ vpmovzxdq 76(%rdx),%ymm13 | |||
vpmovzxdq 92(%rdx),%ymm14 | |||
vpmovzxdq 108(%rdx),%ymm15 | |||
butterfly 7,3,8,4,5,10,6,11 12,13,14,15 | |||
butterfly 7,3,8,4,5,10,6,11,12,13,14,15 | |||
#store | |||
vpsllq $32,%ymm5,%ymm5 | |||