#include "cdecl.inc"

/*
 * AVX2 routines that rewrite, in place, a buffer of 256 32-bit words
 * (8 loop iterations x 4 YMM registers x 8 lanes = 1024 bytes).
 *
 * Syntax: GNU as, AT&T operand order, passed through the C preprocessor.
 *         Exported names are wrapped in the cdecl() macro from cdecl.inc.
 *
 * Register contract shared by both routines:
 *   In:     %rdi = start of the buffer. Every access uses vmovdqa, so the
 *                  address must be 32-byte aligned.
 *   Out:    buffer contents replaced lane by lane.
 *   Clobb:  %eax (loop counter, holds 8 on return), %rdi (advanced by
 *           1024), %ymm0-%ymm8, flags.
 *
 * NOTE(review): presumably called from C as void f(uint32_t *buf) under the
 * System V AMD64 ABI - confirm against the declaring header.
 */

/*
 * reduce_avx
 *
 * For every 32-bit lane x the stored result is
 *
 *     (x & m) - (x >> 23) + ((x >> 23) << 13)
 *
 * with ">>" a logical shift and all arithmetic wrapping modulo 2^32.
 * m is the per-lane mask loaded from PQCLEAN_DILITHIUM2_AVX2_8x23ones, which
 * is defined elsewhere (presumably 2^23 - 1 in each lane - confirm).
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
    /* ymm0 = lane mask, kept for the whole loop; eax = iteration count */
    vmovdqa     cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip), %ymm0
    xor         %eax, %eax

.Lreduce_loop:
    /* Fetch four vectors (32 lanes) into the odd-numbered registers. */
    vmovdqa     (%rdi), %ymm1
    vmovdqa     32(%rdi), %ymm3
    vmovdqa     64(%rdi), %ymm5
    vmovdqa     96(%rdi), %ymm7

    /* hi = x >> 23 goes to the even-numbered partner register. */
    vpsrld      $23, %ymm1, %ymm2
    vpsrld      $23, %ymm3, %ymm4
    vpsrld      $23, %ymm5, %ymm6
    vpsrld      $23, %ymm7, %ymm8

    /* lo = x & m, overwriting x (hi was extracted first for that reason). */
    vpand       %ymm0, %ymm1, %ymm1
    vpand       %ymm0, %ymm3, %ymm3
    vpand       %ymm0, %ymm5, %ymm5
    vpand       %ymm0, %ymm7, %ymm7

    /* acc = lo - hi */
    vpsubd      %ymm2, %ymm1, %ymm1
    vpsubd      %ymm4, %ymm3, %ymm3
    vpsubd      %ymm6, %ymm5, %ymm5
    vpsubd      %ymm8, %ymm7, %ymm7

    /* hi <<= 13 */
    vpslld      $13, %ymm2, %ymm2
    vpslld      $13, %ymm4, %ymm4
    vpslld      $13, %ymm6, %ymm6
    vpslld      $13, %ymm8, %ymm8

    /* acc += hi, so in total acc = lo + hi * (2^13 - 1) modulo 2^32 */
    vpaddd      %ymm2, %ymm1, %ymm1
    vpaddd      %ymm4, %ymm3, %ymm3
    vpaddd      %ymm6, %ymm5, %ymm5
    vpaddd      %ymm8, %ymm7, %ymm7

    /* Write the four result vectors back over their inputs. */
    vmovdqa     %ymm1, (%rdi)
    vmovdqa     %ymm3, 32(%rdi)
    vmovdqa     %ymm5, 64(%rdi)
    vmovdqa     %ymm7, 96(%rdi)

    /* Step to the next 128-byte group; stop after 8 groups. */
    add         $128, %rdi
    add         $1, %eax
    cmp         $8, %eax
    jb          .Lreduce_loop

    ret

/*
 * csubq_avx
 *
 * For every 32-bit lane x: t = x - q; if t has its sign bit set, q is added
 * back, otherwise t is kept. q is the per-lane value loaded from
 * PQCLEAN_DILITHIUM2_AVX2_8xq, which is defined elsewhere.
 * The selection is done with a mask, not a branch.
 */
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
    /* ymm0 = q in every lane, kept for the whole loop; eax = iteration count */
    vmovdqa     cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip), %ymm0
    xor         %eax, %eax

.Lcsubq_loop:
    /* Fetch four vectors (32 lanes) into the odd-numbered registers. */
    vmovdqa     (%rdi), %ymm1
    vmovdqa     32(%rdi), %ymm3
    vmovdqa     64(%rdi), %ymm5
    vmovdqa     96(%rdi), %ymm7

    /* t = x - q */
    vpsubd      %ymm0, %ymm1, %ymm1
    vpsubd      %ymm0, %ymm3, %ymm3
    vpsubd      %ymm0, %ymm5, %ymm5
    vpsubd      %ymm0, %ymm7, %ymm7

    /* Arithmetic shift by 31: all-ones where t is negative, zero elsewhere. */
    vpsrad      $31, %ymm1, %ymm2
    vpsrad      $31, %ymm3, %ymm4
    vpsrad      $31, %ymm5, %ymm6
    vpsrad      $31, %ymm7, %ymm8

    /* Turn the sign mask into a correction term: q where t < 0, else 0. */
    vpand       %ymm0, %ymm2, %ymm2
    vpand       %ymm0, %ymm4, %ymm4
    vpand       %ymm0, %ymm6, %ymm6
    vpand       %ymm0, %ymm8, %ymm8

    /* t += correction */
    vpaddd      %ymm2, %ymm1, %ymm1
    vpaddd      %ymm4, %ymm3, %ymm3
    vpaddd      %ymm6, %ymm5, %ymm5
    vpaddd      %ymm8, %ymm7, %ymm7

    /* Write the four result vectors back over their inputs. */
    vmovdqa     %ymm1, (%rdi)
    vmovdqa     %ymm3, 32(%rdi)
    vmovdqa     %ymm5, 64(%rdi)
    vmovdqa     %ymm7, 96(%rdi)

    /* Step to the next 128-byte group; stop after 8 groups. */
    add         $128, %rdi
    add         $1, %eax
    cmp         $8, %eax
    jb          .Lcsubq_loop

    ret