pqc/crypto_sign/dilithium2/avx2/ntt.S
John Schanck 4f86c39515
Round 3 update for Dilithium (from github source) (#369)
* Update Dilithium

* Alternative montgomery reduce to avoid i386 functest errors

* Explicit casts for msvc

* More casts; bump upstream version; fix metadata

* another cast
2021-02-01 13:32:40 +08:00

200 lines
4.3 KiB
ArmAsm

#include "cdecl.h"
.include "shuffle.inc"
/*
 * butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
 *
 * One butterfly step on two vectors of eight 32-bit lanes. Every argument
 * is a ymm register number.
 *   l, h      data registers, both read and overwritten
 *   zl0, zl1  multipliers for the low-half products (even / odd lanes of h)
 *   zh0, zh1  multipliers for the high-half products (even / odd lanes of h)
 *
 * Per 32-bit lane, with z_lo taken from zl0/zl1 and z_hi from zh0/zh1:
 *   t = low32(h * z_lo)
 *   r = high32(h * z_hi) - high32(t * ymm0)
 *   new l = l + r
 *   new h = l - r            (computed from the old l)
 *
 * vpmuldq reads only the even-indexed dwords of both sources, so zl1/zh1
 * must hold the odd-lane multipliers in their even dword positions. With
 * the defaults (zl1 = zl0, zh1 = zh0) each odd lane reuses the multiplier
 * of the even lane next to it.
 *
 * Reads ymm0; clobbers ymm12, ymm13, ymm14.
 *
 * NOTE(review): this has the shape of a Montgomery multiplication by a
 * precomputed constant, assuming ymm0 holds the modulus in every lane and
 * z_lo = z_hi * q^-1 mod 2^32 -- confirm against the constants table.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* ymm13 = 64-bit products h * z_lo for the even lanes */
vpmuldq %ymm\zl0,%ymm\h,%ymm13
/* ymm12 = h with its odd dwords copied down into the even positions */
vmovshdup %ymm\h,%ymm12
/* ymm14 = 64-bit products h * z_lo for the odd lanes */
vpmuldq %ymm\zl1,%ymm12,%ymm14
/* h and ymm12 = 64-bit products h * z_hi for the even and odd lanes */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h
vpmuldq %ymm\zh1,%ymm12,%ymm12
/* multiply the low 32 bits of the z_lo products by ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
/* gather the high 32 bits of the z_hi products back into lane order:   */
/* even lanes come from h (high dword moved down), odd lanes from ymm12 */
vmovshdup %ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h
/* ymm12 = l - high32(h * z_hi), l = l + high32(h * z_hi) */
vpsubd %ymm\h,%ymm\l,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l
/* gather the high 32 bits of the ymm0 products the same way */
vmovshdup %ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13
/* apply the correction term with opposite signs to the two outputs */
vpaddd %ymm13,%ymm12,%ymm\h
vpsubd %ymm13,%ymm\l,%ymm\l
.endm
/*
 * levels0t1 off
 *
 * Runs butterfly levels 0 and 1 on eight 32-byte vectors of the buffer at
 * rdi. The vectors are 128 bytes apart, starting at byte offset 32*off,
 * and are written back to the same locations. The entry point in this
 * file invokes the macro with off = 0..3.
 *
 * rsi is the base of the constants table; _ZETAS_QINV and _ZETAS are
 * dword indices into it (hence the *4). One multiplier is broadcast to
 * all lanes for each group of butterflies.
 *
 * Uses ymm1, ymm2 for the multipliers and ymm4-ymm11 for data; the
 * butterfly macro additionally reads ymm0 and clobbers ymm12-ymm14.
 * All loads and stores are vmovdqa, so rdi must be 32-byte aligned.
 */
.macro levels0t1 off
/* level 0: multiplier index 1, partners are 512 bytes apart */
vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2
vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11
butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11
/* level 1: partners are 256 bytes apart; the lower half of the buffer */
/* (ymm4-ymm7) uses multiplier index 2 ...                             */
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7
/* ... and the upper half (ymm8-ymm11) uses multiplier index 3 */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11
/* write the eight vectors back to where they were loaded from */
vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)
.endm
/*
 * levels2t7 off
 *
 * Runs butterfly levels 2 through 7 on one contiguous 256-byte slice of
 * the buffer at rdi (byte offsets 256*off .. 256*off+255) and writes the
 * result back to the same slice. The entry point in this file invokes the
 * macro with off = 0..3.
 *
 * rsi is the base of the constants table; _ZETAS_QINV and _ZETAS are
 * dword indices into it (hence the *4). The vmovdqa loads and stores
 * require rdi and the addressed table entries to be 32-byte aligned.
 *
 * Register use: ymm1/ymm2 hold the current multipliers, ymm3-ymm11 hold
 * data, ymm10 and ymm15 are reused as multiplier scratch from level 5 on,
 * and the butterfly macro reads ymm0 and clobbers ymm12-ymm14.
 *
 * NOTE(review): shuffle8/shuffle4/shuffle2 come from shuffle.inc, which is
 * not visible here. From how their results are consumed below, the first
 * two arguments look like inputs and the last two like outputs -- confirm.
 */
.macro levels2t7 off
/* level 2: load the slice; partners are 128 bytes apart; a single */
/* multiplier (index 4+off) is broadcast to all lanes              */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11
vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2
butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11
/* regroup lanes for the next level; data is afterwards expected in */
/* ymm3, ymm8, ymm4, ymm9, ymm5, ymm10, ymm6, ymm11                 */
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11
/* level 3: one vector of multipliers at index 8+8*off; the default */
/* butterfly arguments mean only its even dwords are multiplied in  */
vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2
butterfly 3,5
butterfly 8,10
butterfly 4,6
butterfly 9,11
/* regroup lanes; data is afterwards expected in */
/* ymm7, ymm5, ymm3, ymm10, ymm8, ymm6, ymm4, ymm11 */
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11
/* level 4: multipliers at index 40+8*off, again even dwords only */
vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2
butterfly 7,8
butterfly 5,6
butterfly 3,4
butterfly 10,11
/* regroup lanes; data is afterwards expected in ymm9, ymm8, ymm7, */
/* ymm6, ymm5, ymm4, ymm3, ymm11, which leaves ymm10 free          */
shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11
/* level 5: multipliers at index 72+8*off. From here on the odd lanes */
/* get their own multipliers: ymm10 (ymm1 shifted right by 32 bits in */
/* each qword) and ymm15 (odd dwords of ymm2 copied down) place them  */
/* in the even dword positions that vpmuldq reads                     */
vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,5,1,10,2,15
butterfly 8,4,1,10,2,15
butterfly 7,3,1,10,2,15
butterfly 6,11,1,10,2,15
/* level 6: two multiplier vectors, 32 dwords apart; the first one */
/* (index 104+8*off) serves ymm9/ymm7 and ymm8/ymm6                */
vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,7,1,10,2,15
butterfly 8,6,1,10,2,15
/* the second one (index 104+8*off+32) serves ymm5/ymm3 and ymm4/ymm11 */
vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,3,1,10,2,15
butterfly 4,11,1,10,2,15
/* level 7: four multiplier vectors, 32 dwords apart starting at index */
/* 168+8*off, one per butterfly                                        */
vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 9,8,1,10,2,15
vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 7,6,1,10,2,15
vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 5,4,1,10,2,15
vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1
vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2
vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 3,11,1,10,2,15
/* store the slice back; the register order follows the rotation */
/* introduced by the shuffles above                               */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm8,256*\off+ 32(%rdi)
vmovdqa %ymm7,256*\off+ 64(%rdi)
vmovdqa %ymm6,256*\off+ 96(%rdi)
vmovdqa %ymm5,256*\off+128(%rdi)
vmovdqa %ymm4,256*\off+160(%rdi)
vmovdqa %ymm3,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm
/*
 * PQCLEAN_DILITHIUM2_AVX2_ntt_avx
 *
 * In-place transform of a 1024-byte buffer of 32-bit lanes: butterfly
 * levels 0-1 are applied across the whole buffer (levels0t1), then
 * levels 2-7 are applied to each 256-byte slice (levels2t7).
 *
 * In:    rdi = buffer, read and written with vmovdqa, so it must be
 *              32-byte aligned
 *        rsi = base of the constants table addressed through the dword
 *              indices _8XQ, _ZETAS_QINV and _ZETAS; the vmovdqa loads
 *              from it also require 32-byte alignment
 * Out:   buffer at rdi overwritten; no return value is set
 * Clobb: ymm0-ymm15 (upper halves are zeroed on return)
 *
 * Arguments in rdi/rsi match the System V AMD64 convention, under which
 * every vector register is caller-saved.
 * NOTE(review): presumably declared in C as taking two pointers to
 * 32-byte aligned data -- confirm against the project header.
 *
 * The symbol is exported under both the cdecl() and _cdecl() spellings;
 * both wrappers come from cdecl.h.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx):
/* ymm0 stays live for the whole routine; every butterfly reads it */
vmovdqa _8XQ*4(%rsi),%ymm0
/* levels 0-1, one 32-byte column of the buffer per invocation */
levels0t1 0
levels0t1 1
levels0t1 2
levels0t1 3
/* levels 2-7, one 256-byte slice of the buffer per invocation */
levels2t7 0
levels2t7 1
levels2t7 2
levels2t7 3
/* Leave the upper ymm halves clean so that legacy-SSE code executed   */
/* after the return does not pay AVX/SSE transition penalties. The     */
/* caller only passes pointers, so a compiler will not insert this     */
/* itself.                                                             */
vzeroupper
ret