4f86c39515
* Update Dilithium * Alternative montgomery reduce to avoid i386 functest errors * Explicit casts for msvc * More casts; bump upstream version; fix metadata * another cast
55 lines
1.0 KiB
ArmAsm
55 lines
1.0 KiB
ArmAsm
#include "cdecl.h"
|
|
.include "shuffle.inc"
|
|
|
|
.text
|
|
/*
 * nttunpack128_avx -- file-local helper (no .global in this file).
 *
 * Permutes one 256-byte block in place.
 *
 * In:     %rdi = base address of the block. Every access is vmovdqa,
 *                so the address must be 32-byte aligned.
 * Out:    the eight 32-byte vectors at 0..224(%rdi) are overwritten
 *         with the shuffled data.
 * Uses:   %ymm3-%ymm11 explicitly; anything further depends on the
 *         shuffle8/shuffle4/shuffle2 macros from shuffle.inc.
 * Keeps:  %rdi is not written by the visible instructions; the
 *         exported routine in this file relies on that between calls.
 *
 * NOTE(review): the shuffle macros are defined outside this file.
 * Their arguments are ymm register numbers; the last two look like
 * the destination pair -- confirm against shuffle.inc.
 */
nttunpack128_avx:
	/* load: eight consecutive 32-byte vectors -> ymm4..ymm11 */
	vmovdqa		(%rdi), %ymm4
	vmovdqa		32(%rdi), %ymm5
	vmovdqa		64(%rdi), %ymm6
	vmovdqa		96(%rdi), %ymm7
	vmovdqa		128(%rdi), %ymm8
	vmovdqa		160(%rdi), %ymm9
	vmovdqa		192(%rdi), %ymm10
	vmovdqa		224(%rdi), %ymm11

	/* stage 1: pair vector k with vector k+4 (ymm4/8, 5/9, 6/10, 7/11) */
	shuffle8	4,8,3,8
	shuffle8	5,9,4,9
	shuffle8	6,10,5,10
	shuffle8	7,11,6,11

	/* stage 2: operates on the registers produced by stage 1 */
	shuffle4	3,5,7,5
	shuffle4	8,10,3,10
	shuffle4	4,6,8,6
	shuffle4	9,11,4,11

	/* stage 3: operates on the registers produced by stage 2 */
	shuffle2	7,8,9,8
	shuffle2	5,6,7,6
	shuffle2	3,4,5,4
	shuffle2	10,11,3,11

	/* store: results sit in ymm9,8,7,6,5,4,3,11 (ascending address order) */
	vmovdqa		%ymm9, (%rdi)
	vmovdqa		%ymm8, 32(%rdi)
	vmovdqa		%ymm7, 64(%rdi)
	vmovdqa		%ymm6, 96(%rdi)
	vmovdqa		%ymm5, 128(%rdi)
	vmovdqa		%ymm4, 160(%rdi)
	vmovdqa		%ymm3, 192(%rdi)
	vmovdqa		%ymm11, 224(%rdi)

	ret
|
|
|
|
/*
 * PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx -- exported entry point.
 *
 * Runs nttunpack128_avx on four consecutive 256-byte blocks, i.e. on
 * the 1024 bytes starting at %rdi, in place.
 *
 * In:     %rdi = base address (first integer argument under the
 *                System V AMD64 convention); the helper requires
 *                32-byte alignment.
 * Out:    memory at 0..1023(%rdi) rewritten by the helper.
 * Clobb:  %rdi (left at base + 768), flags (from add), and every
 *         register the helper touches.
 *
 * The symbol is emitted under two spellings through the cdecl and
 * _cdecl macros from cdecl.h (presumably the plain and the
 * underscore-prefixed name -- confirm in cdecl.h).
 */
.global cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx)
.global _cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx)
cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx):
_cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx):
	/* blocks 0..2: process, then step to the next 256-byte block */
	.rept	3
	call	nttunpack128_avx
	add	$256,%rdi
	.endr

	/* block 3: last one, no pointer advance afterwards */
	call	nttunpack128_avx
	ret
|