pqc/crypto_sign/dilithium2/avx2/invntt.S
John Schanck 4f86c39515
Round 3 update for Dilithium (from github source) (#369)
* Update Dilithium

* Alternative montgomery reduce to avoid i386 functest errors

* Explicit casts for msvc

* More casts; bump upstream version; fix metadata

* another cast
2021-02-01 13:32:40 +08:00

241 lines
5.7 KiB
ArmAsm

#include "cdecl.h"
.include "shuffle.inc"
/*
 * butterfly l,h,zl0,zl1,zh0,zh1
 *
 * One add / subtract-and-multiply step on two vectors of eight 32-bit lanes.
 *
 *   l, h     : ymm register numbers of the two data vectors; both are
 *              overwritten.
 *   zl0, zl1 : ymm numbers of the multipliers for the first-stage (low half)
 *              products of the even and odd lanes. Default: ymm1 for both.
 *   zh0, zh1 : ymm numbers of the multipliers for the full products of the
 *              even and odd lanes. Default: ymm2 for both.
 *
 * On exit, with d = h - l (32-bit lane-wise):
 *   l = l + h                      (plain dword add, no reduction)
 *   h = hi32(d * zh) - hi32(lo32(d * zl) * ymm0)      per lane
 *
 * vpmuldq only reads the low dword of each qword of its sources, so lanes
 * 2k and 2k+1 both take their multiplier from dword 2k, of zl0/zh0 and
 * zl1/zh1 respectively.
 *
 * Reads ymm0, which the entry point loads from the _8XQ slot of the table.
 * Scratch: ymm12, ymm13, ymm14.
 *
 * NOTE(review): this has the shape of a signed Montgomery multiplication,
 * presumably with ymm0 = q and zl = zh * q^-1 mod 2^32. The constants are
 * defined outside this file, so confirm against the table definition.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* ymm12 = h - l (difference), l = l + h (sum) */
vpsubd %ymm\l,%ymm\h,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l
/* first-stage products: even lanes of the difference into ymm13 ... */
vpmuldq %ymm\zl0,%ymm12,%ymm13
/* ... copy the odd lanes of the difference down to even positions in h ... */
vmovshdup %ymm12,%ymm\h
/* ... and odd lanes into ymm14 */
vpmuldq %ymm\zl1,%ymm\h,%ymm14
/* full 64-bit products of the difference: even lanes in ymm12, odd in h */
vpmuldq %ymm\zh0,%ymm12,%ymm12
vpmuldq %ymm\zh1,%ymm\h,%ymm\h
/* correction terms: low dword of each first-stage product times ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
/* subtract the corrections; only the upper dword of each qword is kept */
vpsubd %ymm13,%ymm12,%ymm12
vpsubd %ymm14,%ymm\h,%ymm\h
/* move the even-lane results from the upper to the lower dword ... */
vmovshdup %ymm12,%ymm12
/* ... and merge: 0xAA takes odd dwords from h, even dwords from ymm12 */
vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h
.endm
/*
 * levels0t5 off
 *
 * Runs the six butterfly layers labelled level 0 to level 5 over one
 * 256-byte block (64 dwords) of the data array and stores it back to the
 * same eight 32-byte slots.
 *
 *   off  : block index; the block starts at byte offset 256*off from rdi.
 *   rdi  : base of the data array. Accessed with vmovdqa, so every slot
 *          must be 32-byte aligned.
 *   rsi  : base of the constants table. _ZETAS and _ZETAS_QINV are dword
 *          indices into it and are defined outside this file.
 *   ymm0 : read by every butterfly; must be loaded by the caller.
 *
 * Data lives in ymm4-ymm11 on entry; the shuffles move it through
 * ymm3-ymm11. Scratch: ymm1, ymm2, ymm3, ymm12-ymm15 (ymm12-ymm14 via
 * butterfly), plus whatever shuffle2/shuffle4/shuffle8 use internally.
 *
 * NOTE(review): shuffle2/4/8 come from shuffle.inc, which is not visible
 * here. From the way registers are reused after each call, the third and
 * fourth arguments appear to be the output registers; confirm there.
 */
.macro levels0t5 off
/* load the eight vectors of this block */
vmovdqa 256*\off+ 0(%rdi),%ymm4
vmovdqa 256*\off+ 32(%rdi),%ymm5
vmovdqa 256*\off+ 64(%rdi),%ymm6
vmovdqa 256*\off+ 96(%rdi),%ymm7
vmovdqa 256*\off+128(%rdi),%ymm8
vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11
/*
 * level 0: each register pair gets its own eight-dword slice of
 * multipliers. vpermq 0x1B reverses the four qwords of the slice and
 * vmovshdup copies the odd dwords down, so ymm1/ymm2 serve the even lanes
 * and ymm3/ymm15 the odd lanes. Net effect: lane i is multiplied by
 * dword 7-i of the slice as stored in memory.
 */
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,5,1,3,2,15
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 6,7,1,3,2,15
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,9,1,3,2,15
vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 10,11,1,3,2,15
/*
 * level 1: same multiplier layout as level 0, but one slice is shared by
 * two butterflies; pairs are (4,6),(5,7) and (8,10),(9,11).
 */
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,6,1,3,2,15
butterfly 5,7,1,3,2,15
vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 8,10,1,3,2,15
butterfly 9,11,1,3,2,15
/* level 2: a single slice shared by all four butterflies (4,8)...(7,11) */
vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 4,8,1,3,2,15
butterfly 5,9,1,3,2,15
butterfly 6,10,1,3,2,15
butterfly 7,11,1,3,2,15
/*
 * level 3: regroup the data with shuffle2, then run the butterflies with
 * the default multiplier registers ymm1/ymm2. With the defaults, lanes 2k
 * and 2k+1 share the multiplier in dword 2k of the qword-reversed slice.
 */
shuffle2 4,5,3,5
shuffle2 6,7,4,7
shuffle2 8,9,6,9
shuffle2 10,11,8,11
vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2
butterfly 3,5
butterfly 4,7
butterfly 6,9
butterfly 8,11
/* level 4: regroup with shuffle4, same multiplier handling as level 3 */
shuffle4 3,4,10,4
shuffle4 6,8,3,8
shuffle4 5,7,6,7
shuffle4 9,11,5,11
vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1
vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2
butterfly 10,4
butterfly 3,8
butterfly 6,7
butterfly 5,11
/*
 * level 5: regroup with shuffle8; one multiplier (dword index 7-off)
 * is broadcast to every lane and used for the whole block.
 */
shuffle8 10,3,9,3
shuffle8 6,5,10,5
shuffle8 4,8,6,8
shuffle8 7,11,4,11
vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2
butterfly 9,3
butterfly 10,5
butterfly 6,8
butterfly 4,11
/*
 * store the block back; the register order differs from the load order
 * because the shuffles above moved the data between registers
 */
vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm10,256*\off+ 32(%rdi)
vmovdqa %ymm6,256*\off+ 64(%rdi)
vmovdqa %ymm4,256*\off+ 96(%rdi)
vmovdqa %ymm3,256*\off+128(%rdi)
vmovdqa %ymm5,256*\off+160(%rdi)
vmovdqa %ymm8,256*\off+192(%rdi)
vmovdqa %ymm11,256*\off+224(%rdi)
.endm
/*
 * levels6t7 off
 *
 * Runs the two butterfly layers labelled level 6 and level 7 on eight
 * vectors taken 128 bytes apart, then multiplies the four lower vectors
 * by the _8XDIV constant and stores everything back in place.
 *
 *   off  : column index; the eight vectors sit at byte offsets
 *          32*off + 128*j from rdi, for j = 0..7.
 *   rdi  : base of the data array. Accessed with vmovdqa, so every slot
 *          must be 32-byte aligned.
 *   rsi  : base of the constants table. _ZETAS, _ZETAS_QINV, _8XDIV and
 *          _8XDIV_QINV are dword indices defined outside this file.
 *   ymm0 : read by every butterfly and by the final multiplication; must
 *          be loaded by the caller.
 *
 * Data lives in ymm4-ymm11. Scratch: ymm1, ymm2, ymm12-ymm15, and
 * ymm8/ymm9 once their results have been stored.
 */
.macro levels6t7 off
/* load the eight vectors of this column */
vmovdqa 0+32*\off(%rdi),%ymm4
vmovdqa 128+32*\off(%rdi),%ymm5
vmovdqa 256+32*\off(%rdi),%ymm6
vmovdqa 384+32*\off(%rdi),%ymm7
vmovdqa 512+32*\off(%rdi),%ymm8
vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11
/*
 * level 6: pairs (4,6),(5,7) use the broadcast multiplier at index 3,
 * pairs (8,10),(9,11) the one at index 2
 */
vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2
butterfly 4,6
butterfly 5,7
vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11
/* level 7: all four pairs use the broadcast multiplier at index 0 */
vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1
vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2
butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11
/*
 * The upper four vectors are stored exactly as the level 7 butterflies
 * left them; this also frees ymm8/ymm9 for use as scratch below.
 * NOTE(review): presumably the index-0 multiplier already folds in the
 * factor that is applied to ymm4-ymm7 below; confirm against the table.
 */
vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)
/* per-lane multipliers for the final multiplication of ymm4-ymm7 */
vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1
vmovdqa (_8XDIV)*4(%rsi),%ymm2
/*
 * Multiply ymm4 and ymm5 by ymm2 with the same multiply / correct /
 * keep-upper-half sequence the butterfly macro uses:
 * first-stage products with ymm1, even lanes in ymm12/ymm13 ...
 */
vpmuldq %ymm1,%ymm4,%ymm12
vpmuldq %ymm1,%ymm5,%ymm13
/* ... odd lanes copied down into ymm8/ymm9 ... */
vmovshdup %ymm4,%ymm8
vmovshdup %ymm5,%ymm9
/* ... and their first-stage products in ymm14/ymm15 */
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
/* full 64-bit products with ymm2 */
vpmuldq %ymm2,%ymm4,%ymm4
vpmuldq %ymm2,%ymm5,%ymm5
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
/* correction terms: low dword of each first-stage product times ymm0 */
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
/* subtract the corrections; the result is the upper dword of each qword */
vpsubd %ymm12,%ymm4,%ymm4
vpsubd %ymm13,%ymm5,%ymm5
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
/* even-lane results down to even dwords, then merge in the odd lanes */
vmovshdup %ymm4,%ymm4
vmovshdup %ymm5,%ymm5
vpblendd $0xAA,%ymm8,%ymm4,%ymm4
vpblendd $0xAA,%ymm9,%ymm5,%ymm5
/* same sequence again for ymm6 and ymm7 */
vpmuldq %ymm1,%ymm6,%ymm12
vpmuldq %ymm1,%ymm7,%ymm13
vmovshdup %ymm6,%ymm8
vmovshdup %ymm7,%ymm9
vpmuldq %ymm1,%ymm8,%ymm14
vpmuldq %ymm1,%ymm9,%ymm15
vpmuldq %ymm2,%ymm6,%ymm6
vpmuldq %ymm2,%ymm7,%ymm7
vpmuldq %ymm2,%ymm8,%ymm8
vpmuldq %ymm2,%ymm9,%ymm9
vpmuldq %ymm0,%ymm12,%ymm12
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14
vpmuldq %ymm0,%ymm15,%ymm15
vpsubd %ymm12,%ymm6,%ymm6
vpsubd %ymm13,%ymm7,%ymm7
vpsubd %ymm14,%ymm8,%ymm8
vpsubd %ymm15,%ymm9,%ymm9
vmovshdup %ymm6,%ymm6
vmovshdup %ymm7,%ymm7
vpblendd $0xAA,%ymm8,%ymm6,%ymm6
vpblendd $0xAA,%ymm9,%ymm7,%ymm7
/* store the four lower vectors back to their original slots */
vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
vmovdqa %ymm7,384+32*\off(%rdi)
.endm
/*
 * PQCLEAN_DILITHIUM2_AVX2_invntt_avx
 *
 * In-place transform of the 1024 bytes (256 dwords) starting at rdi:
 * levels 0-5 are applied to each of the four 256-byte blocks, then
 * levels 6-7 to each of the four 32-byte columns.
 *
 *   rdi : data array, read and written with vmovdqa (32-byte aligned).
 *   rsi : constants table; _8XQ is a dword index defined outside this file.
 *
 * Takes its arguments in rdi/rsi, i.e. the System V AMD64 convention.
 * NOTE(review): the C prototype is not visible here; confirm the
 * argument types against the declaring header.
 *
 * Overwrites ymm0-ymm15. No stack usage, no general-purpose register is
 * modified by the code in this file.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx):
_cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx):
        /* ymm0 stays live for the whole function; every butterfly reads it */
        vmovdqa _8XQ*4(%rsi),%ymm0

        /* levels 0-5, one 256-byte block per iteration */
        .irp    blk,0,1,2,3
        levels0t5 \blk
        .endr

        /* levels 6-7 plus final multiplication, one column per iteration */
        .irp    col,0,1,2,3
        levels6t7 \col
        .endr

        ret