/*
 * pqc/crypto_sign/dilithium3/avx2/reduce.S
 *
 * x86-64 AVX2 assembly in AT&T (GAS) syntax. The file is passed through the
 * C preprocessor before assembly (see the include below).
 */
#include "cdecl.inc"
/*
 * PQCLEAN_DILITHIUM3_AVX2_reduce_avx
 *
 * In-place, lane-wise reduction of a 1024-byte buffer of 32-bit words
 * (8 loop iterations x 128 bytes = 256 words).
 *
 * In:    %rdi = buffer address (first integer argument under the System V
 *        AMD64 ABI). Every access uses vmovdqa, so the buffer must be
 *        32-byte aligned.
 * Out:   each 32-bit word a is overwritten with
 *            (a & M) - (a >> 23) + ((a >> 23) << 13)
 *        where M is the per-lane value of _PQCLEAN_DILITHIUM3_AVX2_8x23ones
 *        and >> is a logical (unsigned) shift.
 * Clobb: %eax, %rdi (advanced by 1024), flags, %ymm0-%ymm8.
 *
 * NOTE(review): the mask constant is defined outside this file; its name
 * suggests M = 2^23 - 1. If so, and if the modulus is
 * q = 2^23 - 2^13 + 1, then 2^23 = 2^13 - 1 (mod q) and the stored value is
 * congruent to a modulo q. Confirm against the constant's definition.
 */
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
        /* ymm0 = mask, loaded once and kept live across the whole loop */
        vmovdqa         cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0

        xor             %eax,%eax               /* eax = iteration counter, 0..7 */

1:
        /* load four vectors (32 words) per iteration */
        vmovdqa         (%rdi),%ymm1
        vmovdqa         32(%rdi),%ymm3
        vmovdqa         64(%rdi),%ymm5
        vmovdqa         96(%rdi),%ymm7

        /* hi = a >> 23 (logical shift) */
        vpsrld          $23,%ymm1,%ymm2
        vpsrld          $23,%ymm3,%ymm4
        vpsrld          $23,%ymm5,%ymm6
        vpsrld          $23,%ymm7,%ymm8

        /* lo = a & M */
        vpand           %ymm0,%ymm1,%ymm1
        vpand           %ymm0,%ymm3,%ymm3
        vpand           %ymm0,%ymm5,%ymm5
        vpand           %ymm0,%ymm7,%ymm7

        /* lo -= hi */
        vpsubd          %ymm2,%ymm1,%ymm1
        vpsubd          %ymm4,%ymm3,%ymm3
        vpsubd          %ymm6,%ymm5,%ymm5
        vpsubd          %ymm8,%ymm7,%ymm7

        /* hi <<= 13 */
        vpslld          $13,%ymm2,%ymm2
        vpslld          $13,%ymm4,%ymm4
        vpslld          $13,%ymm6,%ymm6
        vpslld          $13,%ymm8,%ymm8

        /* result = lo - hi + (hi << 13) */
        vpaddd          %ymm2,%ymm1,%ymm1
        vpaddd          %ymm4,%ymm3,%ymm3
        vpaddd          %ymm6,%ymm5,%ymm5
        vpaddd          %ymm8,%ymm7,%ymm7

        /* store the results back over the inputs */
        vmovdqa         %ymm1,(%rdi)
        vmovdqa         %ymm3,32(%rdi)
        vmovdqa         %ymm5,64(%rdi)
        vmovdqa         %ymm7,96(%rdi)

        add             $128,%rdi               /* advance to the next 128-byte chunk */
        add             $1,%eax
        cmp             $8,%eax
        jb              1b                      /* unsigned compare: loop while eax < 8 */

        ret
/*
 * PQCLEAN_DILITHIUM3_AVX2_csubq_avx
 *
 * In-place, lane-wise conditional subtraction over a 1024-byte buffer of
 * 32-bit words (8 loop iterations x 128 bytes = 256 words).
 *
 * In:    %rdi = buffer address (first integer argument under the System V
 *        AMD64 ABI). Every access uses vmovdqa, so the buffer must be
 *        32-byte aligned.
 * Out:   with Q the per-lane value of _PQCLEAN_DILITHIUM3_AVX2_8xq, each
 *        32-bit word a is overwritten with t = a - Q (mod 2^32) when the
 *        sign bit of t is clear, and with the original a otherwise.
 * Clobb: %eax, %rdi (advanced by 1024), flags, %ymm0-%ymm8.
 *
 * NOTE(review): the constant is defined outside this file; its name suggests
 * every lane holds the modulus q. Confirm against its definition.
 */
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
        /* ymm0 = Q, loaded once and kept live across the whole loop */
        vmovdqa         cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0

        xor             %eax,%eax               /* eax = iteration counter, 0..7 */

1:
        /* load four vectors (32 words) per iteration */
        vmovdqa         (%rdi),%ymm1
        vmovdqa         32(%rdi),%ymm3
        vmovdqa         64(%rdi),%ymm5
        vmovdqa         96(%rdi),%ymm7

        /* conditional subtract, branch-free: t = a - Q */
        vpsubd          %ymm0,%ymm1,%ymm1
        vpsubd          %ymm0,%ymm3,%ymm3
        vpsubd          %ymm0,%ymm5,%ymm5
        vpsubd          %ymm0,%ymm7,%ymm7

        /* mask = t >> 31 (arithmetic): all ones when t is negative, else 0 */
        vpsrad          $31,%ymm1,%ymm2
        vpsrad          $31,%ymm3,%ymm4
        vpsrad          $31,%ymm5,%ymm6
        vpsrad          $31,%ymm7,%ymm8

        /* mask &= Q: Q where t went negative, 0 elsewhere */
        vpand           %ymm0,%ymm2,%ymm2
        vpand           %ymm0,%ymm4,%ymm4
        vpand           %ymm0,%ymm6,%ymm6
        vpand           %ymm0,%ymm8,%ymm8

        /* t += mask: undo the subtraction only in lanes that went negative */
        vpaddd          %ymm2,%ymm1,%ymm1
        vpaddd          %ymm4,%ymm3,%ymm3
        vpaddd          %ymm6,%ymm5,%ymm5
        vpaddd          %ymm8,%ymm7,%ymm7

        /* store the results back over the inputs */
        vmovdqa         %ymm1,(%rdi)
        vmovdqa         %ymm3,32(%rdi)
        vmovdqa         %ymm5,64(%rdi)
        vmovdqa         %ymm7,96(%rdi)

        add             $128,%rdi               /* advance to the next 128-byte chunk */
        add             $1,%eax
        cmp             $8,%eax
        jb              1b                      /* unsigned compare: loop while eax < 8 */

        ret