1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-26 17:31:38 +00:00
pqcrypto/crypto_sign/dilithium4/avx2/ntt.S
2021-03-24 21:02:45 +00:00

180 lines
3.8 KiB
ArmAsm

.include "shuffle.inc"
#include "cdecl.inc"
# butterfly: four forward NTT butterflies, carried out side by side.
#
# All macro arguments are ymm register numbers.
#   rl0..rl3  low inputs of the four butterflies; overwritten with rl + t
#   rh0..rh3  high inputs; overwritten with rl + ymm2 - t
#   z0..z3    twiddle factor register for each butterfly (default ymm3)
# Here t stands for the reduced product rh * z computed in the reduce step.
#
# Register roles fixed by the macro body:
#   ymm0      multiplier of the first reduction step
#   ymm1      multiplier of the second reduction step
#   ymm2      bias added to the low input before t is subtracted
#   ymm12-15  scratch, always clobbered
# The two routines in this file load ymm0, ymm1 and ymm2 from the 8xqinv,
# 8xq and 8x2q constants, so the reduce step has the shape of a Montgomery
# reduction by 2^32 -- presumably with ymm0 = -1/q mod 2^32, ymm1 = q and
# ymm2 = 2q. Those constants are defined outside this file; confirm there.
#
# z0..z3 may name ymm12-15: every twiddle is consumed by the mul step before
# the reduce step starts overwriting those registers. Keep that order.
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
# mul: rh = low32(rh) * low32(z) as an unsigned 64-bit product per qword lane
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1
vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2
vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3
# reduce, step 1: scratch = low32(product) * low32(ymm0)
vpmuludq %ymm0,%ymm\rh0,%ymm12
vpmuludq %ymm0,%ymm\rh1,%ymm13
vpmuludq %ymm0,%ymm\rh2,%ymm14
vpmuludq %ymm0,%ymm\rh3,%ymm15
# reduce, step 2: scratch = low32(scratch) * low32(ymm1)
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
# reduce, step 3: add the full 64-bit product to the scratch value ...
vpaddq %ymm\rh0,%ymm12,%ymm12
vpaddq %ymm\rh1,%ymm13,%ymm13
vpaddq %ymm\rh2,%ymm14,%ymm14
vpaddq %ymm\rh3,%ymm15,%ymm15
# ... and keep bits 32..63 of the sum: t now sits in the low dword of each
# qword lane of ymm12-15, with the high dword zero
vpsrlq $32,%ymm12,%ymm12
vpsrlq $32,%ymm13,%ymm13
vpsrlq $32,%ymm14,%ymm14
vpsrlq $32,%ymm15,%ymm15
# update: rh = rl + ymm2, taken before rl itself is modified.
# These are 32-bit lane operations, so the high dword of every qword lane is
# changed as well; vpmuludq never reads those high dwords.
vpaddd %ymm2,%ymm\rl0,%ymm\rh0
vpaddd %ymm2,%ymm\rl1,%ymm\rh1
vpaddd %ymm2,%ymm\rl2,%ymm\rh2
vpaddd %ymm2,%ymm\rl3,%ymm\rh3
# rl = rl + t
vpaddd %ymm12,%ymm\rl0,%ymm\rl0
vpaddd %ymm13,%ymm\rl1,%ymm\rl1
vpaddd %ymm14,%ymm\rl2,%ymm\rl2
vpaddd %ymm15,%ymm\rl3,%ymm\rl3
# rh = (old rl + ymm2) - t
vpsubd %ymm12,%ymm\rh0,%ymm\rh0
vpsubd %ymm13,%ymm\rh1,%ymm\rh1
vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm
# ntt_levels0t2_avx: runs three butterfly levels (labels level0..level2) on
# eight rows of four coefficients each.
#
# Registers read on entry. They coincide with the first three integer
# argument registers of the System V AMD64 convention, which is presumably
# how the C side calls this routine -- confirm against the prototype.
#   rdi  output: eight 32-byte stores at byte offsets 0, 256, ..., 1792.
#        The stores are vmovdqa, so rdi must be 32-byte aligned.
#   rsi  input: eight 16-byte loads (four dwords each) at byte offsets
#        0, 128, ..., 896, each dword zero-extended into a 64-bit lane.
#   rdx  twiddles: seven consecutive dwords at byte offsets 0 .. 24.
# The three constant loads are vmovdqa as well, so those symbols must be
# 32-byte aligned.
# Clobbers ymm0-ymm15. No general purpose register is written.
# Only four coefficients per row are handled per call; a caller presumably
# steps rsi and rdi across the remaining columns (not visible here).
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
# consts: the fixed ymm0, ymm1, ymm2 operands used by the butterfly macro
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
level0:
# zetas: one twiddle for the whole level, broadcast to every dword of ymm3
vpbroadcastd (%rdx),%ymm3
# load: rows 0..7 go to ymm4..ymm11, widened from dwords to qword lanes
vpmovzxdq (%rsi),%ymm4
vpmovzxdq 128(%rsi),%ymm5
vpmovzxdq 256(%rsi),%ymm6
vpmovzxdq 384(%rsi),%ymm7
vpmovzxdq 512(%rsi),%ymm8
vpmovzxdq 640(%rsi),%ymm9
vpmovzxdq 768(%rsi),%ymm10
vpmovzxdq 896(%rsi),%ymm11
# pairs (4,8) (5,9) (6,10) (7,11), all using the default twiddle in ymm3
butterfly 4,5,6,7,8,9,10,11
level1:
# zetas: two twiddles. They are placed in ymm12 and ymm13, which the macro
# also uses as scratch; this is safe because the macro multiplies first.
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13
# pairs (4,6) (5,7) with ymm12, pairs (8,10) (9,11) with ymm13
butterfly 4,5,8,9,6,7,10,11,12,12,13,13
level2:
# zetas: four twiddles, one per butterfly, again living in scratch registers
vpbroadcastd 12(%rdx),%ymm12
vpbroadcastd 16(%rdx),%ymm13
vpbroadcastd 20(%rdx),%ymm14
vpbroadcastd 24(%rdx),%ymm15
# pairs (4,5) (6,7) (8,9) (10,11)
butterfly 4,6,8,10,5,7,9,11,12,13,14,15
# store: full 64-bit lanes are written, 32 bytes per row, rows 256 bytes apart.
# The high dword of each lane is whatever the 32-bit adds in the macro left
# there, not a zero extension.
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,256(%rdi)
vmovdqa %ymm6,512(%rdi)
vmovdqa %ymm7,768(%rdi)
vmovdqa %ymm8,1024(%rdi)
vmovdqa %ymm9,1280(%rdi)
vmovdqa %ymm10,1536(%rdi)
vmovdqa %ymm11,1792(%rdi)
ret
# ntt_levels3t8_avx: runs five butterfly levels (labels level3..level7) on
# one contiguous group of 32 coefficients held in 64-bit lanes, then packs
# the results back to 32 bits.
#
# Registers read on entry. They coincide with the first three integer
# argument registers of the System V AMD64 convention, which is presumably
# how the C side calls this routine -- confirm against the prototype.
#   rdi  output: four 32-byte stores at byte offsets 0, 32, 64, 96
#        (128 bytes). The stores are vmovdqa: rdi must be 32-byte aligned.
#   rsi  input: eight 32-byte loads at byte offsets 0 .. 224 (256 bytes).
#        The loads are vmovdqa: rsi must be 32-byte aligned.
#   rdx  twiddles: bytes 0 .. 123 are read (31 dwords). No alignment is
#        required by the instructions used on rdx.
# The three constant loads are vmovdqa as well, so those symbols must be
# 32-byte aligned.
# Clobbers ymm0-ymm15. No general purpose register is written.
#
# shuffle8 and shuffle4 come from shuffle.inc, which is not part of this
# view. From the way registers are reused below, their first two arguments
# appear to be sources and their last two destinations -- confirm against
# shuffle.inc before changing any register number here.
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
# consts: the fixed ymm0, ymm1, ymm2 operands used by the butterfly macro
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
# load: eight vectors of four qword lanes each into ymm4..ymm11
vmovdqa (%rsi),%ymm4
vmovdqa 32(%rsi),%ymm5
vmovdqa 64(%rsi),%ymm6
vmovdqa 96(%rsi),%ymm7
vmovdqa 128(%rsi),%ymm8
vmovdqa 160(%rsi),%ymm9
vmovdqa 192(%rsi),%ymm10
vmovdqa 224(%rsi),%ymm11
level3:
# zetas: one twiddle for the whole level, broadcast to every dword of ymm3
vpbroadcastd (%rdx),%ymm3
# pairs (4,8) (5,9) (6,10) (7,11)
butterfly 4,5,6,7,8,9,10,11
level4:
# zetas: two twiddles merged into one register. Mask 0xF0 takes dwords 4..7
# from ymm13 and dwords 0..3 from ymm12, so the low 128 bits of ymm12 hold
# the dword from offset 4 and the high 128 bits the dword from offset 8.
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13
vpblendd $0xF0,%ymm13,%ymm12,%ymm12
# Regroup the data so each butterfly pair again sits in two registers.
# ymm3 is free to receive data here: its level3 twiddle is no longer needed.
# Each shuffle frees the register that the next one uses as a destination.
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11
# pairs (3,5) (8,10) (4,6) (9,11); ymm12 is consumed before it is clobbered
butterfly 3,8,4,9,5,10,6,11,12,12,12,12
level5:
# zetas: four dwords at offsets 12..27, one per 64-bit lane of ymm12
vpmovzxdq 12(%rdx),%ymm12
# second regrouping; the data ends up in ymm7,5,3,10,8,6,4,11
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11
# pairs (7,8) (5,6) (3,4) (10,11)
butterfly 7,5,3,10,8,6,4,11,12,12,12,12
level6:
# zetas: dwords at offsets 28..43 into ymm12 and 44..59 into ymm13
vpmovzxdq 28(%rdx),%ymm12
vpmovzxdq 44(%rdx),%ymm13
# pairs (7,3) (5,10) with ymm12, pairs (8,4) (6,11) with ymm13
butterfly 7,5,8,6,3,10,4,11,12,12,13,13
level7:
# zetas: four dwords per register, offsets 60, 76, 92 and 108
vpmovzxdq 60(%rdx),%ymm12
vpmovzxdq 76(%rdx),%ymm13
vpmovzxdq 92(%rdx),%ymm14
vpmovzxdq 108(%rdx),%ymm15
# pairs (7,5) (3,10) (8,6) (4,11)
butterfly 7,3,8,4,5,10,6,11,12,13,14,15
# store, step 1: move the low dword of each high output into the high dword
# of its lane ...
vpsllq $32,%ymm5,%ymm5
vpsllq $32,%ymm10,%ymm10
vpsllq $32,%ymm6,%ymm6
vpsllq $32,%ymm11,%ymm11
# ... and merge with the matching low output. Mask 0xAA takes the odd dwords
# from the shifted register and the even dwords from the low output, so each
# of ymm7, ymm3, ymm8, ymm4 now holds eight packed 32-bit results and the
# stale high dwords left by the butterflies are dropped.
vpblendd $0xAA,%ymm5,%ymm7,%ymm7
vpblendd $0xAA,%ymm10,%ymm3,%ymm3
vpblendd $0xAA,%ymm6,%ymm8,%ymm8
vpblendd $0xAA,%ymm11,%ymm4,%ymm4
# store, step 2: regroup with the same shuffle macros, presumably to undo the
# level4 and level5 regrouping so the coefficients land in their final order
shuffle4 7,3,5,3
shuffle4 8,4,7,4
shuffle8 5,7,6,7
shuffle8 3,4,5,4
# store, step 3: write the four packed vectors (128 bytes)
vmovdqa %ymm6,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm4,96(%rdi)
ret