#include "cdecl.h"
.include "fq.inc"

/*
 * x86-64 AVX2, GAS AT&T syntax, run through the C preprocessor.
 *
 * Three exported routines, each built the same way:
 *   1. load per-routine constants from the table addressed by rsi,
 *   2. run a 256-byte helper on the block at rdi,
 *   3. advance rdi by 256 and run the helper again.
 * So every exported routine rewrites 512 bytes in place at rdi.
 *
 * Register use visible in this file:
 *   rdi  - data pointer (read and written; left advanced by 256 on return)
 *   rsi  - base of the constant table, indexed with _16XQ, _16XV,
 *          _16XMONTSQLO and _16XMONTSQHI (each scaled by 2)
 *   ymm0..ymm15 - constants, data and the extra register operands
 *          handed to the macros
 *
 * All memory accesses use vmovdqa on ymm registers, which faults unless
 * the address is 32-byte aligned; both the data block and the constant
 * table entries must therefore be 32-byte aligned.
 *
 * red16, csubq and fqmulprecomp are macros from fq.inc and are not shown
 * here. NOTE(review): the trailing register argument of each macro is
 * presumably a scratch register - confirm against fq.inc.
 *
 * NOTE(review): no vzeroupper is issued before returning; confirm that
 * callers do not depend on one.
 */

.text

/*
 * reduce128_avx (file-local helper)
 *
 * In:   rdi  = 32-byte aligned pointer to a 256-byte block
 *       ymm0, ymm1 = constants loaded by the exported wrapper
 *                    (presumably consumed inside red16 - confirm in fq.inc)
 * Out:  the 256 bytes at rdi are replaced by the red16 results
 * Uses: ymm2..ymm9 for data, ymm10..ymm15 as the second red16 operand
 */
reduce128_avx:
        /* load: ymm2..ymm9 <- eight consecutive 32-byte vectors */
.irp    i,2,3,4,5,6,7,8,9
        vmovdqa         (\i-2)*32(%rdi),%ymm\i
.endr

        red16           2,10
        red16           3,11
        red16           4,12
        red16           5,13
        red16           6,14
        red16           7,15
        red16           8,10
        red16           9,11

        /* store: write ymm2..ymm9 back to the same offsets */
.irp    i,2,3,4,5,6,7,8,9
        vmovdqa         %ymm\i,(\i-2)*32(%rdi)
.endr

        ret

/*
 * PQCLEAN_KYBER51290S_AVX2_reduce_avx (exported)
 *
 * In:   rdi = 32-byte aligned pointer to 512 bytes, rewritten in place
 *       rsi = base of the constant table
 * Loads ymm0 from the _16XQ entry and ymm1 from the _16XV entry, then
 * applies reduce128_avx to each 256-byte half.
 */
.global cdecl(PQCLEAN_KYBER51290S_AVX2_reduce_avx)
.global _cdecl(PQCLEAN_KYBER51290S_AVX2_reduce_avx)
cdecl(PQCLEAN_KYBER51290S_AVX2_reduce_avx):
_cdecl(PQCLEAN_KYBER51290S_AVX2_reduce_avx):
        /* constants */
        vmovdqa         _16XQ*2(%rsi),%ymm0
        vmovdqa         _16XV*2(%rsi),%ymm1

        call            reduce128_avx           /* first 256 bytes  */
        add             $256,%rdi
        call            reduce128_avx           /* second 256 bytes */
        ret

/*
 * csubq128_avx (file-local helper)
 *
 * In:   rdi  = 32-byte aligned pointer to a 256-byte block
 *       ymm0 = constant loaded by the exported wrapper
 *              (presumably consumed inside csubq - confirm in fq.inc)
 * Out:  the 256 bytes at rdi are replaced by the csubq results
 * Uses: ymm1..ymm8 for data, ymm9..ymm15 as the second csubq operand
 */
csubq128_avx:
        /* load: ymm1..ymm8 <- eight consecutive 32-byte vectors */
.irp    i,1,2,3,4,5,6,7,8
        vmovdqa         (\i-1)*32(%rdi),%ymm\i
.endr

        csubq           1,9
        csubq           2,10
        csubq           3,11
        csubq           4,12
        csubq           5,13
        csubq           6,14
        csubq           7,15
        csubq           8,9

        /* store: write ymm1..ymm8 back to the same offsets */
.irp    i,1,2,3,4,5,6,7,8
        vmovdqa         %ymm\i,(\i-1)*32(%rdi)
.endr

        ret

/*
 * PQCLEAN_KYBER51290S_AVX2_csubq_avx (exported)
 *
 * In:   rdi = 32-byte aligned pointer to 512 bytes, rewritten in place
 *       rsi = base of the constant table
 * Loads ymm0 from the _16XQ entry, then applies csubq128_avx to each
 * 256-byte half.
 */
.global cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx)
.global _cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx)
cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx):
_cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx):
        /* constants */
        vmovdqa         _16XQ*2(%rsi),%ymm0

        call            csubq128_avx            /* first 256 bytes  */
        add             $256,%rdi
        call            csubq128_avx            /* second 256 bytes */
        ret

/*
 * tomont128_avx (file-local helper)
 *
 * In:   rdi  = 32-byte aligned pointer to a 256-byte block
 *       ymm0, ymm1, ymm2 = constants loaded by the exported wrapper;
 *              ymm1 and ymm2 are passed to fqmulprecomp explicitly as its
 *              first two operands (ymm0 is presumably used implicitly -
 *              confirm in fq.inc)
 * Out:  the 256 bytes at rdi are replaced by the fqmulprecomp results
 * Uses: ymm3..ymm10 for data, ymm11..ymm15 as the fourth macro operand
 */
tomont128_avx:
        /* load: ymm3..ymm10 <- eight consecutive 32-byte vectors */
.irp    i,3,4,5,6,7,8,9,10
        vmovdqa         (\i-3)*32(%rdi),%ymm\i
.endr

        fqmulprecomp    1,2,3,11
        fqmulprecomp    1,2,4,12
        fqmulprecomp    1,2,5,13
        fqmulprecomp    1,2,6,14
        fqmulprecomp    1,2,7,15
        fqmulprecomp    1,2,8,11
        fqmulprecomp    1,2,9,12
        fqmulprecomp    1,2,10,13

        /* store: write ymm3..ymm10 back to the same offsets */
.irp    i,3,4,5,6,7,8,9,10
        vmovdqa         %ymm\i,(\i-3)*32(%rdi)
.endr

        ret

/*
 * PQCLEAN_KYBER51290S_AVX2_tomont_avx (exported)
 *
 * In:   rdi = 32-byte aligned pointer to 512 bytes, rewritten in place
 *       rsi = base of the constant table
 * Loads ymm0 from the _16XQ entry, ymm1 from the _16XMONTSQLO entry and
 * ymm2 from the _16XMONTSQHI entry, then applies tomont128_avx to each
 * 256-byte half.
 */
.global cdecl(PQCLEAN_KYBER51290S_AVX2_tomont_avx)
.global _cdecl(PQCLEAN_KYBER51290S_AVX2_tomont_avx)
cdecl(PQCLEAN_KYBER51290S_AVX2_tomont_avx):
_cdecl(PQCLEAN_KYBER51290S_AVX2_tomont_avx):
        /* constants */
        vmovdqa         _16XQ*2(%rsi),%ymm0
        vmovdqa         _16XMONTSQLO*2(%rsi),%ymm1
        vmovdqa         _16XMONTSQHI*2(%rsi),%ymm2

        call            tomont128_avx           /* first 256 bytes  */
        add             $256,%rdi
        call            tomont128_avx           /* second 256 bytes */
        ret