mirror of
https://github.com/henrydcase/pqc.git
synced 2024-11-27 09:51:30 +00:00
c0f56ccdc2
Makes Kyber-AVX run on MacOS (#251)
229 lines
4.1 KiB
ArmAsm
229 lines
4.1 KiB
ArmAsm
#include "params.h"
|
|
#include "cdecl.inc"
|
|
|
|
# schoolbook off,sign
#
# Multiply sixteen pairs (a + b*X) * (c + d*X) of signed 16-bit coefficients,
# lane by lane, and leave the results as unreduced 32-bit values:
#   x = a*c + r*bd   when sign is zero
#   x = a*c - r*bd   otherwise
#   y = b*c + a*d
# where bd is b*d after one 16-bit reduction step that uses ymm0 and ymm1,
# and r is the per-lane multiplier held in ymm2.
#
# Arguments
#   off    byte displacement applied to both input pointers
#   sign   selects the sign of the r*bd term (see above)
# Reads
#   off(%rsi) and off+32(%rsi)   a and b, 32 bytes each
#   off(%rdx) and off+32(%rdx)   c and d, 32 bytes each
#   ymm0, ymm1, ymm2             constants supplied by the caller
# Writes (eight 32-bit lanes per register)
#   ymm14 = x0, ymm9 = x1, ymm12 = y0, ymm7 = y1
#   Suffix 0 covers words 0-3 of every 128-bit lane, suffix 1 words 4-7.
# Clobbers ymm7-ymm15.
# The loads use vmovdqa, so all four addresses must be 32-byte aligned.
.macro schoolbook off,sign
    # Operand fetch
    vmovdqa     \off+32(%rsi),%ymm7         # b
    vmovdqa     \off+32(%rdx),%ymm8         # d
    vmovdqa     \off(%rsi),%ymm9            # a
    vmovdqa     \off(%rdx),%ymm10           # c

    # Four signed 16x16 products, each kept as separate low and high halves.
    # ymm7, ymm8 and ymm9 are overwritten by the last product that reads them.
    vpmullw     %ymm7,%ymm8,%ymm11          # bd.lo
    vpmulhw     %ymm7,%ymm8,%ymm12          # bd.hi
    vpmullw     %ymm7,%ymm10,%ymm13         # bc.lo
    vpmulhw     %ymm7,%ymm10,%ymm7          # bc.hi
    vpmullw     %ymm9,%ymm8,%ymm14          # ad.lo
    vpmulhw     %ymm9,%ymm8,%ymm8           # ad.hi
    vpmullw     %ymm9,%ymm10,%ymm15         # ac.lo
    vpmulhw     %ymm9,%ymm10,%ymm9          # ac.hi

    # Bring b*d back to 16 bits: bd = bd.hi - hi(lo(bd.lo * ymm1) * ymm0).
    # NOTE(review): this has the shape of a signed Montgomery reduction,
    # assuming ymm0 = q and ymm1 = q^-1 mod 2^16 - confirm against the caller.
    vpmullw     %ymm1,%ymm11,%ymm11
    vpmulhw     %ymm0,%ymm11,%ymm11
    vpsubw      %ymm11,%ymm12,%ymm11        # bd

    # Scale the reduced bd by the multiplier in ymm2 (ymm10 is free again).
    vpmullw     %ymm2,%ymm11,%ymm10         # rbd.lo
    vpmulhw     %ymm2,%ymm11,%ymm11         # rbd.hi

    # Interleave matching low and high halves into full 32-bit products.
    vpunpcklwd  %ymm7,%ymm13,%ymm12         # bc0
    vpunpckhwd  %ymm7,%ymm13,%ymm7          # bc1
    vpunpcklwd  %ymm8,%ymm14,%ymm13         # ad0
    vpunpckhwd  %ymm8,%ymm14,%ymm8          # ad1
    vpunpcklwd  %ymm9,%ymm15,%ymm14         # ac0
    vpunpckhwd  %ymm9,%ymm15,%ymm9          # ac1
    vpunpcklwd  %ymm11,%ymm10,%ymm15        # rbd0
    vpunpckhwd  %ymm11,%ymm10,%ymm10        # rbd1

    # Combine: x takes ac minus or plus rbd, y is always bc + ad.
    .if \sign
    vpsubd      %ymm15,%ymm14,%ymm14        # x0 = ac0 - rbd0
    vpsubd      %ymm10,%ymm9,%ymm9          # x1 = ac1 - rbd1
    .else
    vpaddd      %ymm14,%ymm15,%ymm14        # x0 = ac0 + rbd0
    vpaddd      %ymm9,%ymm10,%ymm9          # x1 = ac1 + rbd1
    .endif
    vpaddd      %ymm12,%ymm13,%ymm12        # y0 = bc0 + ad0
    vpaddd      %ymm7,%ymm8,%ymm7           # y1 = bc1 + ad1
.endm
|
|
|
|
# red a0,a1,b0,b1,x,y,z
#
# Reduce two sets of 32-bit lanes to 16-bit lanes.  Every argument is a bare
# ymm register number.
#
# Arguments
#   a0,a1   first value, split into two registers the way schoolbook
#           leaves x0 and x1
#   b0,b1   second value, same split (y0 and y1)
#   x,y,z   scratch registers
# Reads ymm0 and ymm1 as the reduction constants.
# Result: ymm a0 and ymm b0 each hold sixteen reduced 16-bit words.
# Registers a1, b1, x, y and z are left holding intermediate values.
.macro red a0,a1,b0,b1,x,y,z
    # Split every dword into its low word (blend against zero: mask 0xAA
    # replaces the odd words) and its high word (logical shift right by 16),
    # then pack the two source registers into one register of words.  The
    # pack saturates as unsigned, which is exact here because every dword
    # has already been cut down to 16 significant bits.
    vpxor       %ymm\x,%ymm\x,%ymm\x                # all-zero blend source
    vpblendw    $0xAA,%ymm\x,%ymm\a0,%ymm\y         # low words of a0
    vpblendw    $0xAA,%ymm\x,%ymm\a1,%ymm\z         # low words of a1
    vpsrld      $16,%ymm\a0,%ymm\a0                 # high words of a0
    vpsrld      $16,%ymm\a1,%ymm\a1                 # high words of a1
    vpackusdw   %ymm\z,%ymm\y,%ymm\z                # z  = a.lo
    vpackusdw   %ymm\a1,%ymm\a0,%ymm\a0             # a0 = a.hi
    vpblendw    $0xAA,%ymm\x,%ymm\b0,%ymm\y         # low words of b0
    vpblendw    $0xAA,%ymm\x,%ymm\b1,%ymm\x         # low words of b1, zero no longer needed
    vpsrld      $16,%ymm\b0,%ymm\b0                 # high words of b0
    vpsrld      $16,%ymm\b1,%ymm\b1                 # high words of b1
    vpackusdw   %ymm\x,%ymm\y,%ymm\y                # y  = b.lo
    vpackusdw   %ymm\b1,%ymm\b0,%ymm\b0             # b0 = b.hi

    # Same reduction step as in schoolbook, done for both values at once:
    # result = hi - hi(lo(lo * ymm1) * ymm0)
    vpmullw     %ymm1,%ymm\z,%ymm\z
    vpmullw     %ymm1,%ymm\y,%ymm\y
    vpmulhw     %ymm0,%ymm\z,%ymm\z
    vpmulhw     %ymm0,%ymm\y,%ymm\y
    vpsubw      %ymm\z,%ymm\a0,%ymm\a0
    vpsubw      %ymm\y,%ymm\b0,%ymm\b0
.endm
|
|
|
|
.text
|
|
# basemul64_acc_avx  (file-local helper, reached with call)
#
# Sums three schoolbook products and stores 128 bytes of reduced result.
#   In:   rdi            destination, written at offsets 0, 32, 64 and 96
#         rsi, rdx       inputs; the three operands of each sum sit
#                        512 bytes apart (displacements 0, 512 and 1024)
#         ymm0-ymm2      constants consumed by schoolbook and red
#   Out:  128 bytes at rdi
#   Clobbers ymm3-ymm15.  rdi, rsi and rdx are not modified.
# Every load and store is vmovdqa, so rdi, rsi and rdx must be 32-byte
# aligned.

# acc_init: start a fresh 32-bit accumulator in ymm3-ymm6 from the
# schoolbook outputs x0, x1, y0, y1.
.macro acc_init
    vmovdqa     %ymm14,%ymm3
    vmovdqa     %ymm9,%ymm4
    vmovdqa     %ymm12,%ymm5
    vmovdqa     %ymm7,%ymm6
.endm

# acc_add: add the schoolbook outputs onto the accumulator in ymm3-ymm6.
.macro acc_add
    vpaddd      %ymm14,%ymm3,%ymm3
    vpaddd      %ymm9,%ymm4,%ymm4
    vpaddd      %ymm12,%ymm5,%ymm5
    vpaddd      %ymm7,%ymm6,%ymm6
.endm

basemul64_acc_avx:
    # First half: input bytes 0-63 of each operand, sign 0.
poly0.0:
    schoolbook  0,0
    acc_init

poly1.0:
    schoolbook  512,0
    acc_add

poly2.0:
    schoolbook  1024,0
    acc_add

    # ymm7-ymm9 are free to serve as scratch once the sums are in ymm3-ymm6.
    red         3,4,5,6,7,8,9
    vmovdqa     %ymm3,(%rdi)
    vmovdqa     %ymm5,32(%rdi)

    # Second half: input bytes 64-127 of each operand, sign 1.
poly0.1:
    schoolbook  64,1
    acc_init

poly1.1:
    schoolbook  576,1
    acc_add

poly2.1:
    schoolbook  1088,1
    acc_add

    red         3,4,5,6,7,8,9
    vmovdqa     %ymm3,64(%rdi)
    vmovdqa     %ymm5,96(%rdi)

    ret
|
|
|
|
# PQCLEAN_KYBER768_AVX2_basemul_acc_avx
#
# Exported entry point.  Register use as seen in this file:
#   rdi  output
#   rsi  first input
#   rdx  second input
#   rcx  base address of the constant table
# These are the first four integer argument registers of the System V
# AMD64 calling convention.
#
# Loads ymm0 and ymm1 from the table, then runs basemul64_acc_avx four times.
# Each run gets its own 32-byte slice of the table in ymm2 and sees rdi, rsi
# and rdx moved on by 128 bytes, so 512 output bytes are written in total.
# On return rdi, rsi and rdx are 384 past their entry values and
# ymm0-ymm15 have been overwritten.
# The ymm0 and ymm1 loads are vmovdqa and need 32-byte aligned addresses;
# the ymm2 loads are vmovdqu and do not.
.global cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx)
cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx):
    # Reduction constants shared by all four runs.
    vmovdqa     _16XQ*2(%rcx),%ymm0
    vmovdqa     _16XQINV*2(%rcx),%ymm1

    # Run one: pointers as passed in.
    vmovdqu     (_ZETAS_EXP+152)*2(%rcx),%ymm2
    call        basemul64_acc_avx

    # Runs two to four: next table slice, pointers advanced by 128 bytes.
.irp zoff,184,348,380
    vmovdqu     (_ZETAS_EXP+\zoff)*2(%rcx),%ymm2
    add         $128,%rdi
    add         $128,%rsi
    add         $128,%rdx
    call        basemul64_acc_avx
.endr

    ret
|
|
|
|
# basemul64_avx  (file-local helper, reached with call)
#
# Single-product variant: one schoolbook product per half, reduced and
# stored straight away with no accumulation.
#   In:   rdi            destination, written at offsets 0, 32, 64 and 96
#         rsi, rdx       inputs, 128 bytes read from each
#         ymm0-ymm2      constants consumed by schoolbook and red
#   Out:  128 bytes at rdi
#   Clobbers ymm7-ymm15.  rdi, rsi and rdx are not modified.
# Every load and store is vmovdqa, so rdi, rsi and rdx must be 32-byte
# aligned.

# basemul_half off,sign: multiply, reduce and store one 64-byte half.
# The schoolbook outputs ymm14/ymm9 (x) and ymm12/ymm7 (y) are reduced in
# place; ymm8, ymm10 and ymm11 are dead by then and serve as scratch.
.macro basemul_half off,sign
    schoolbook  \off,\sign
    red         14,9,12,7,8,10,11
    vmovdqa     %ymm14,\off(%rdi)
    vmovdqa     %ymm12,\off+32(%rdi)
.endm

basemul64_avx:
    basemul_half 0,0
    basemul_half 64,1
    ret
|
|
|
|
# PQCLEAN_KYBER768_AVX2_basemul_avx
#
# Exported entry point.  Register use as seen in this file:
#   rdi  output
#   rsi  first input
#   rdx  second input
#   rcx  base address of the constant table
# These are the first four integer argument registers of the System V
# AMD64 calling convention.
#
# Loads ymm0 and ymm1 from the table, then runs basemul64_avx four times.
# Each run gets its own 32-byte slice of the table in ymm2 and sees rdi, rsi
# and rdx moved on by 128 bytes, so 512 output bytes are written in total.
# On return rdi, rsi and rdx are 384 past their entry values and
# ymm0-ymm2 and ymm7-ymm15 have been overwritten.
# The ymm0 and ymm1 loads are vmovdqa and need 32-byte aligned addresses;
# the ymm2 loads are vmovdqu and do not.
.global cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx)
cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx):
    # Reduction constants shared by all four runs.
    vmovdqa     _16XQ*2(%rcx),%ymm0
    vmovdqa     _16XQINV*2(%rcx),%ymm1

    # Run one: pointers as passed in.
    vmovdqu     (_ZETAS_EXP+152)*2(%rcx),%ymm2
    call        basemul64_avx

    # Runs two to four: next table slice, pointers advanced by 128 bytes.
.irp zoff,184,348,380
    vmovdqu     (_ZETAS_EXP+\zoff)*2(%rcx),%ymm2
    add         $128,%rdi
    add         $128,%rsi
    add         $128,%rdx
    call        basemul64_avx
.endr

    ret
|