/* x86-64 AVX2 assembly, GNU as AT&T syntax; passed through the C preprocessor before assembly. */
#include "cdecl.h"
|
|
.include "fq.inc"
|
|
|
|
.text
|
|
/*
 * reduce128_avx -- internal helper; no .global directive for it appears
 * in this file.
 *
 * In:    %rdi         base address of a 256-byte buffer. Every access is a
 *                     vmovdqa, so the address must be 32-byte aligned.
 *        %ymm0, %ymm1 constants set up by the caller (the exported wrapper
 *                     in this file loads _16XQ into %ymm0 and _16XV into
 *                     %ymm1). Presumably consumed by red16 -- confirm
 *                     against fq.inc.
 * Out:   the eight 32-byte vectors at 0..224(%rdi) are overwritten in place
 *        with whatever red16 leaves in %ymm2..%ymm9.
 * Clobb: %ymm2-%ymm9, plus any scratch registers red16 uses (the macro is
 *        defined in fq.inc and is not visible here).
 */
reduce128_avx:
        /* load: vector k of the buffer goes to %ymm(k+2) */
        .irp    r,2,3,4,5,6,7,8,9
        vmovdqa 32*(\r-2)(%rdi),%ymm\r
        .endr

        /* run red16 on each loaded register, lowest register first */
        .irp    r,2,3,4,5,6,7,8,9
        red16   \r
        .endr

        /* store: every register goes back to the slot it was loaded from */
        .irp    r,2,3,4,5,6,7,8,9
        vmovdqa %ymm\r,32*(\r-2)(%rdi)
        .endr

        ret
|
|
|
|
/*
 * PQCLEAN_KYBER102490S_AVX2_reduce_avx -- exported entry point.
 *
 * Both cdecl(...) and _cdecl(...) spellings of the symbol are exported and
 * defined at the same address; the two macros come from cdecl.h.
 *
 * In:    %rdi  base of a 512-byte buffer, handled as two consecutive
 *              256-byte halves by reduce128_avx (32-byte alignment needed
 *              by the vmovdqa accesses in that helper).
 *        %rsi  base of a constants table; 32-byte vectors are read at byte
 *              offsets _16XQ*2 and _16XV*2 with vmovdqa, so those addresses
 *              must be 32-byte aligned. The offset symbols are defined
 *              outside this file.
 * Out:   buffer updated in place by the helper; %rdi is left advanced by
 *        256 bytes on return.
 * Clobb: %ymm0-%ymm9 and whatever scratch the red16 macro uses; flags.
 *
 * NOTE(review): no vzeroupper is issued before returning. Confirm that all
 * callers are built with VEX encodings, otherwise consider adding one.
 */
        .globl  cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx)
        .globl  _cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx):
_cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx):
        /* constants are loaded once and stay in place for both helper calls */
        vmovdqa _16XQ*2(%rsi),%ymm0
        vmovdqa _16XV*2(%rsi),%ymm1

        /* first 256-byte half */
        call    reduce128_avx

        /* step over the 8 vectors of 32 bytes just processed */
        add     $8*32,%rdi

        /* second 256-byte half */
        call    reduce128_avx

        ret
|
|
|
|
/*
 * tomont128_avx -- internal helper; no .global directive for it appears
 * in this file.
 *
 * In:    %rdi          base address of a 256-byte buffer. Every access is a
 *                      vmovdqa, so the address must be 32-byte aligned.
 *        %ymm0-%ymm2   constants set up by the caller (the exported wrapper
 *                      in this file loads _16XQ, _16XMONTSQLO and
 *                      _16XMONTSQHI into them, in that order).
 * Out:   the eight 32-byte vectors at 0..224(%rdi) are overwritten in place
 *        with whatever fqmulprecomp leaves in %ymm3..%ymm10.
 * Clobb: %ymm3-%ymm10; presumably also %ymm11-%ymm15, which are named as
 *        the last macro argument -- confirm against fq.inc, where
 *        fqmulprecomp is defined.
 */
tomont128_avx:
        /* load: vector k of the buffer goes to %ymm(k+3) */
        .irp    r,3,4,5,6,7,8,9,10
        vmovdqa 32*(\r-3)(%rdi),%ymm\r
        .endr

        /*
         * One fqmulprecomp per data register. Arguments 1 and 2 are the
         * same in every call; the third is the data register number; the
         * fourth runs 11..15 and then starts over at 11 (looks like a
         * rotating scratch register -- confirm in fq.inc).
         */
        fqmulprecomp    1,2,3,11
        fqmulprecomp    1,2,4,12
        fqmulprecomp    1,2,5,13
        fqmulprecomp    1,2,6,14
        fqmulprecomp    1,2,7,15
        fqmulprecomp    1,2,8,11
        fqmulprecomp    1,2,9,12
        fqmulprecomp    1,2,10,13

        /* store: every register goes back to the slot it was loaded from */
        .irp    r,3,4,5,6,7,8,9,10
        vmovdqa %ymm\r,32*(\r-3)(%rdi)
        .endr

        ret
|
|
|
|
/*
 * PQCLEAN_KYBER102490S_AVX2_tomont_avx -- exported entry point.
 *
 * Both cdecl(...) and _cdecl(...) spellings of the symbol are exported and
 * defined at the same address; the two macros come from cdecl.h.
 *
 * In:    %rdi  base of a 512-byte buffer, handled as two consecutive
 *              256-byte halves by tomont128_avx (32-byte alignment needed
 *              by the vmovdqa accesses in that helper).
 *        %rsi  base of a constants table; 32-byte vectors are read at byte
 *              offsets _16XQ*2, _16XMONTSQLO*2 and _16XMONTSQHI*2 with
 *              vmovdqa, so those addresses must be 32-byte aligned. The
 *              offset symbols are defined outside this file.
 * Out:   buffer updated in place by the helper; %rdi is left advanced by
 *        256 bytes on return.
 * Clobb: %ymm0-%ymm10, plus any registers fqmulprecomp writes (presumably
 *        %ymm11-%ymm15 -- confirm in fq.inc); flags.
 *
 * NOTE(review): no vzeroupper is issued before returning. Confirm that all
 * callers are built with VEX encodings, otherwise consider adding one.
 */
        .globl  cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx)
        .globl  _cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx):
_cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx):
        /* constants are loaded once and stay in place for both helper calls */
        vmovdqa _16XQ*2(%rsi),%ymm0
        vmovdqa _16XMONTSQLO*2(%rsi),%ymm1
        vmovdqa _16XMONTSQHI*2(%rsi),%ymm2

        /* first 256-byte half */
        call    tomont128_avx

        /* step over the 8 vectors of 32 bytes just processed */
        add     $8*32,%rdi

        /* second 256-byte half */
        call    tomont128_avx

        ret
|