#include "cdecl.h"
.include "shuffle.inc"

/*
 * Syntax : GNU as, AT&T operand order (source, destination); the file is
 *          run through the C preprocessor before assembly.
 * Input  : every routine below addresses its buffer through %rdi.
 *          NOTE(review): %rdi is the first integer argument register of
 *          the System V AMD64 ABI, presumably how C callers pass the
 *          buffer; confirm against the C prototype.
 * Macros : shuffle8 / shuffle4 / shuffle2 come from shuffle.inc. Their
 *          arguments are ymm register numbers; judging by which registers
 *          are stored at the end, the order looks like (in, in, out, out).
 *          TODO confirm against shuffle.inc.
 */

.text

/*
 * nttunpack128_avx  (file-local helper, not exported)
 *
 * Reads the 256 bytes at (%rdi) into ymm4..ymm11, passes them through
 * three shuffle stages, and writes 256 bytes back to the same location.
 *
 * In    : %rdi = address of the 256-byte block. vmovdqa is used for every
 *         access, so the address has to be 32-byte aligned.
 * Out   : the block at (%rdi) is overwritten in place; %rdi is unchanged.
 * Clobb : ymm3..ymm11 are named here; anything else the shuffle macros
 *         touch is not visible from this file.
 */
nttunpack128_avx:
	/* Load eight consecutive 32-byte vectors. */
	vmovdqa		(%rdi),%ymm4
	vmovdqa		32(%rdi),%ymm5
	vmovdqa		64(%rdi),%ymm6
	vmovdqa		96(%rdi),%ymm7
	vmovdqa		128(%rdi),%ymm8
	vmovdqa		160(%rdi),%ymm9
	vmovdqa		192(%rdi),%ymm10
	vmovdqa		224(%rdi),%ymm11

	/* Stage 1: pair vector k with vector k+4. */
	shuffle8	4,8,3,8
	shuffle8	5,9,4,9
	shuffle8	6,10,5,10
	shuffle8	7,11,6,11

	/* Stage 2: operates on the registers produced by stage 1. */
	shuffle4	3,5,7,5
	shuffle4	8,10,3,10
	shuffle4	4,6,8,6
	shuffle4	9,11,4,11

	/* Stage 3: operates on the registers produced by stage 2. */
	shuffle2	7,8,9,8
	shuffle2	5,6,7,6
	shuffle2	3,4,5,4
	shuffle2	10,11,3,11

	/*
	 * Write back. The register order (9,8,7,6,5,4,3,11) follows the
	 * last two arguments of the four stage-3 macro calls, in sequence.
	 */
	vmovdqa		%ymm9,(%rdi)
	vmovdqa		%ymm8,32(%rdi)
	vmovdqa		%ymm7,64(%rdi)
	vmovdqa		%ymm6,96(%rdi)
	vmovdqa		%ymm5,128(%rdi)
	vmovdqa		%ymm4,160(%rdi)
	vmovdqa		%ymm3,192(%rdi)
	vmovdqa		%ymm11,224(%rdi)

	ret

/*
 * PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx  (exported)
 *
 * Applies nttunpack128_avx to four back-to-back 256-byte blocks, i.e. to
 * the 1024 bytes starting at %rdi, in place.
 * NOTE(review): 1024 bytes is presumably one polynomial of 256 32-bit
 * coefficients; confirm with the caller.
 *
 * In    : %rdi = address of the 1024-byte buffer, 32-byte aligned (each
 *         block is accessed with vmovdqa by the helper).
 * Out   : buffer rewritten in place; no value is placed in %rax.
 * Clobb : %rdi (left pointing at the last block, +768), flags (from add),
 *         plus whatever nttunpack128_avx clobbers.
 * NOTE(review): no vzeroupper is issued before returning; confirm that
 * callers are VEX-encoded or do not care about the transition cost.
 *
 * Both the cdecl() and _cdecl() spellings of the symbol are emitted; the
 * macros are expected to come from cdecl.h.
 */
.global cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx)
.global _cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx)
cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx):
_cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx):
	/* Blocks 0..2: process, then step to the next 256-byte block. */
	.rept		3
	call		nttunpack128_avx
	add		$256,%rdi
	.endr

	/* Block 3: last one, so no pointer advance afterwards. */
	call		nttunpack128_avx
	ret