#include "params.h"
#include "cdecl.inc"

/*
 * AVX2 base multiplication kernels (GNU as, AT&T syntax, preprocessed by cpp).
 *
 * Register roles used by every routine and macro in this file:
 *   rdi   output pointer; results are written with aligned 32-byte stores
 *   rsi   first input pointer;  read with aligned 32-byte loads
 *   rdx   second input pointer; read with aligned 32-byte loads
 *   rcx   base address of a constant table addressed through the
 *         _16XQ, _16XQINV and _ZETAS_EXP offsets (16-bit entries, hence
 *         the "*2" scaling in the address expressions)
 *   ymm0  vector loaded from the _16XQ slot    (by its name: 16 copies of q)
 *   ymm1  vector loaded from the _16XQINV slot (by its name: 16 copies of
 *         the inverse of q used by the reduction)
 *   ymm2  16 multiplier words loaded from the _ZETAS_EXP area ("r" below)
 *
 * Because vmovdqa is used for all data accesses, rdi, rsi and rdx must be
 * 32-byte aligned.  Arguments arrive in rdi, rsi, rdx, rcx, i.e. the first
 * four integer argument registers of the System V AMD64 convention.
 */

/*
 * schoolbook disp, minus
 *
 * Reads four vectors of sixteen signed 16-bit words
 *     a = disp(rsi)     b = disp+32(rsi)
 *     c = disp(rdx)     d = disp+32(rdx)
 * and produces full 32-bit results, split into a "low-word-group" and a
 * "high-word-group" register by the in-lane unpack instructions:
 *     ymm14 / ymm9  = x0 / x1 = a*c + r*bd      when minus == 0
 *                             = a*c - r*bd      otherwise
 *     ymm12 / ymm7  = y0 / y1 = b*c + a*d
 * Here bd is the product b*d brought back to 16 bits with the
 * multiply-by-ymm1 / multiply-high-by-ymm0 / subtract sequence (the shape
 * of a signed Montgomery reduction), and r is ymm2.
 *
 * Clobbers ymm7-ymm15.  Reads ymm0, ymm1, ymm2.
 */
.macro schoolbook disp,minus
        /* operand fetch */
        vmovdqa         \disp+32(%rsi),%ymm7            # b
        vmovdqa         \disp+32(%rdx),%ymm8            # d
        vmovdqa         \disp(%rsi),%ymm9               # a
        vmovdqa         \disp(%rdx),%ymm10              # c

        /* all four cross products, as separate low and high words */
        vpmullw         %ymm7,%ymm8,%ymm11              # bd.lo
        vpmulhw         %ymm7,%ymm8,%ymm12              # bd.hi
        vpmullw         %ymm7,%ymm10,%ymm13             # bc.lo
        vpmulhw         %ymm7,%ymm10,%ymm7              # bc.hi
        vpmullw         %ymm9,%ymm8,%ymm14              # ad.lo
        vpmulhw         %ymm9,%ymm8,%ymm8               # ad.hi
        vpmullw         %ymm9,%ymm10,%ymm15             # ac.lo
        vpmulhw         %ymm9,%ymm10,%ymm9              # ac.hi

        /* bring b*d back to a single 16-bit word per coefficient */
        vpmullw         %ymm1,%ymm11,%ymm11
        vpmulhw         %ymm0,%ymm11,%ymm11
        vpsubw          %ymm11,%ymm12,%ymm11            # bd

        /* multiply the reduced b*d by the ymm2 words */
        vpmullw         %ymm2,%ymm11,%ymm10             # rbd.lo
        vpmulhw         %ymm2,%ymm11,%ymm11             # rbd.hi

        /* interleave low and high words into 32-bit lanes */
        vpunpcklwd      %ymm7,%ymm13,%ymm12             # bc0
        vpunpckhwd      %ymm7,%ymm13,%ymm7              # bc1
        vpunpcklwd      %ymm8,%ymm14,%ymm13             # ad0
        vpunpckhwd      %ymm8,%ymm14,%ymm8              # ad1
        vpunpcklwd      %ymm9,%ymm15,%ymm14             # ac0
        vpunpckhwd      %ymm9,%ymm15,%ymm9              # ac1
        vpunpcklwd      %ymm11,%ymm10,%ymm15            # rbd0
        vpunpckhwd      %ymm11,%ymm10,%ymm10            # rbd1

        /* combine: x = ac +/- rbd, y = bc + ad */
.ifeq \minus
        vpaddd          %ymm14,%ymm15,%ymm14            # x0
        vpaddd          %ymm9,%ymm10,%ymm9              # x1
.else
        vpsubd          %ymm15,%ymm14,%ymm14            # x0
        vpsubd          %ymm10,%ymm9,%ymm9              # x1
.endif
        vpaddd          %ymm12,%ymm13,%ymm12            # y0
        vpaddd          %ymm7,%ymm8,%ymm7               # y1
.endm

/*
 * red p0,p1,q0,q1,zr,s0,s1
 *
 * All arguments are ymm register numbers.  (p0,p1) and (q0,q1) each hold
 * 32-bit lanes.  For each pair the low and high 16-bit halves of every lane
 * are separated and packed into word vectors, then
 *     result = high - mulhi(low * ymm1, ymm0)
 * is formed with 16-bit arithmetic.  The values fed to vpackusdw are
 * either masked to their low word or shifted right by 16, so they are
 * always below 2^16 and the unsigned saturation never changes them.
 *
 * Results: ymm<p0> for the (p0,p1) pair, ymm<q0> for the (q0,q1) pair.
 * Clobbers ymm<p1>, ymm<q1>, ymm<zr>, ymm<s0>, ymm<s1>.
 */
.macro red p0,p1,q0,q1,zr,s0,s1
        /* first pair: s1 = low words, p0 = high words */
        vpxor           %ymm\zr,%ymm\zr,%ymm\zr
        vpblendw        $0xAA,%ymm\zr,%ymm\p0,%ymm\s0
        vpblendw        $0xAA,%ymm\zr,%ymm\p1,%ymm\s1
        vpsrld          $16,%ymm\p0,%ymm\p0
        vpsrld          $16,%ymm\p1,%ymm\p1
        vpackusdw       %ymm\s1,%ymm\s0,%ymm\s1
        vpackusdw       %ymm\p1,%ymm\p0,%ymm\p0

        /* second pair: s0 = low words, q0 = high words; the zero      */
        /* register is consumed as scratch by the last blend           */
        vpblendw        $0xAA,%ymm\zr,%ymm\q0,%ymm\s0
        vpblendw        $0xAA,%ymm\zr,%ymm\q1,%ymm\zr
        vpsrld          $16,%ymm\q0,%ymm\q0
        vpsrld          $16,%ymm\q1,%ymm\q1
        vpackusdw       %ymm\zr,%ymm\s0,%ymm\s0
        vpackusdw       %ymm\q1,%ymm\q0,%ymm\q0

        /* 16-bit reduction of both pairs */
        vpmullw         %ymm1,%ymm\s1,%ymm\s1
        vpmullw         %ymm1,%ymm\s0,%ymm\s0
        vpmulhw         %ymm0,%ymm\s1,%ymm\s1
        vpmulhw         %ymm0,%ymm\s0,%ymm\s0
        vpsubw          %ymm\s1,%ymm\p0,%ymm\p0
        vpsubw          %ymm\s0,%ymm\q0,%ymm\q0
.endm

/*
 * acc_first: start a running sum in ymm3-ymm6 from the schoolbook outputs
 * (x0, x1, y0, y1 in that order).
 */
.macro acc_first
        vmovdqa         %ymm14,%ymm3
        vmovdqa         %ymm9,%ymm4
        vmovdqa         %ymm12,%ymm5
        vmovdqa         %ymm7,%ymm6
.endm

/*
 * acc_plus: add the schoolbook outputs (x0, x1, y0, y1) onto the running
 * sum held in ymm3-ymm6.
 */
.macro acc_plus
        vpaddd          %ymm14,%ymm3,%ymm3
        vpaddd          %ymm9,%ymm4,%ymm4
        vpaddd          %ymm12,%ymm5,%ymm5
        vpaddd          %ymm7,%ymm6,%ymm6
.endm

/*
 * next_64 zoff, kernel
 *
 * Loads the sixteen multiplier words at _ZETAS_EXP+zoff into ymm2, moves
 * rdi, rsi and rdx forward by 128 bytes and calls the given kernel.
 */
.macro next_64 zoff,kernel
        vmovdqu         (_ZETAS_EXP+\zoff)*2(%rcx),%ymm2
        add             $128,%rdi
        add             $128,%rsi
        add             $128,%rdx
        call            \kernel
.endm

.text

/*
 * basemul64_acc_avx (file-local)
 *
 * Handles one 128-byte slice.  For byte offsets 0 and 64 within the slice
 * it runs schoolbook on three input pairs located 512 bytes apart
 * (offsets +0, +512, +1024 from both rsi and rdx), sums the three 32-bit
 * results, reduces them and stores 2 x 32 bytes to rdi.  The first half
 * uses the additive form of schoolbook, the second half the subtractive
 * form.
 *
 * In:    rdi, rsi, rdx as described at the top of the file; ymm0-ymm2 set
 * Out:   128 bytes at (rdi)
 * Clobb: ymm3-ymm15
 */
basemul64_acc_avx:
poly0.0:
        schoolbook      0,0
        acc_first
poly1.0:
        schoolbook      512,0
        acc_plus
poly2.0:
        schoolbook      1024,0
        acc_plus

        red             3,4,5,6,7,8,9

        vmovdqa         %ymm3,(%rdi)
        vmovdqa         %ymm5,32(%rdi)

poly0.1:
        schoolbook      64,1
        acc_first
poly1.1:
        schoolbook      576,1
        acc_plus
poly2.1:
        schoolbook      1088,1
        acc_plus

        red             3,4,5,6,7,8,9

        vmovdqa         %ymm3,64(%rdi)
        vmovdqa         %ymm5,96(%rdi)

        ret

/*
 * PQCLEAN_KYBER768_AVX2_basemul_acc_avx
 *
 * In:    rdi = output, rsi / rdx = inputs, rcx = constant table
 * Out:   512 bytes at the original rdi (four 128-byte slices)
 * Clobb: rdi, rsi, rdx (each advanced by 384), ymm0-ymm15
 *
 * Loads ymm0 / ymm1 once, then runs basemul64_acc_avx on four consecutive
 * 128-byte slices, each with its own ymm2 loaded from _ZETAS_EXP offsets
 * 152, 184, 348 and 380.
 */
.global cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx)
cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx):
        vmovdqa         _16XQ*2(%rcx),%ymm0
        vmovdqa         _16XQINV*2(%rcx),%ymm1

        vmovdqu         (_ZETAS_EXP+152)*2(%rcx),%ymm2
        call            basemul64_acc_avx

        next_64         184,basemul64_acc_avx
        next_64         348,basemul64_acc_avx
        next_64         380,basemul64_acc_avx

        ret

/*
 * basemul64_avx (file-local)
 *
 * Single-pair variant of the slice kernel: for byte offsets 0 and 64 it
 * runs schoolbook once (additive form first, subtractive form second),
 * reduces the result directly from the schoolbook output registers and
 * stores 2 x 32 bytes to rdi.
 *
 * In:    rdi, rsi, rdx as described at the top of the file; ymm0-ymm2 set
 * Out:   128 bytes at (rdi)
 * Clobb: ymm7-ymm15
 */
basemul64_avx:
        schoolbook      0,0
        red             14,9,12,7,8,10,11
        vmovdqa         %ymm14,(%rdi)
        vmovdqa         %ymm12,32(%rdi)

        schoolbook      64,1
        red             14,9,12,7,8,10,11
        vmovdqa         %ymm14,64(%rdi)
        vmovdqa         %ymm12,96(%rdi)

        ret

/*
 * PQCLEAN_KYBER768_AVX2_basemul_avx
 *
 * In:    rdi = output, rsi / rdx = inputs, rcx = constant table
 * Out:   512 bytes at the original rdi (four 128-byte slices)
 * Clobb: rdi, rsi, rdx (each advanced by 384), ymm0-ymm2, ymm7-ymm15
 *
 * Same driver as the accumulating entry point, but built on basemul64_avx.
 */
.global cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx)
cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx):
        vmovdqa         _16XQ*2(%rcx),%ymm0
        vmovdqa         _16XQINV*2(%rcx),%ymm1

        vmovdqu         (_ZETAS_EXP+152)*2(%rcx),%ymm2
        call            basemul64_avx

        next_64         184,basemul64_avx
        next_64         348,basemul64_avx
        next_64         380,basemul64_avx

        ret