mirror of
https://github.com/henrydcase/pqc.git
synced 2024-11-22 23:48:58 +00:00
Add MacOS support for Dilithium
This commit is contained in:
parent
ebb416a2ba
commit
44b0522070
@ -23,6 +23,7 @@ implementations:
|
|||||||
supported_platforms:
|
supported_platforms:
|
||||||
- architecture: x86_64
|
- architecture: x86_64
|
||||||
operating_systems:
|
operating_systems:
|
||||||
|
- Darwin
|
||||||
- Linux
|
- Linux
|
||||||
required_flags:
|
required_flags:
|
||||||
- avx2
|
- avx2
|
||||||
|
@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)
|
|||||||
%.o: %.c $(HEADERS)
|
%.o: %.c $(HEADERS)
|
||||||
$(CC) $(CFLAGS) -c -o $@ $<
|
$(CC) $(CFLAGS) -c -o $@ $<
|
||||||
|
|
||||||
%.o: %.s $(HEADERS)
|
|
||||||
$(AS) -o $@ $<
|
|
||||||
|
|
||||||
%.o: %.S $(HEADERS)
|
%.o: %.S $(HEADERS)
|
||||||
$(AS) -c -o $@ $<
|
$(CC) -c -o $@ $<
|
||||||
|
|
||||||
$(LIB): $(OBJECTS) $(KECCAK4X)
|
$(LIB): $(OBJECTS) $(KECCAK4X)
|
||||||
$(AR) -r $@ $^
|
$(AR) -r $@ $^
|
||||||
|
18
crypto_sign/dilithium2/avx2/cdecl.inc
Normal file
18
crypto_sign/dilithium2/avx2/cdecl.inc
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL
|
||||||
|
#define PQCLEAN_DILITHIUM2_AVX2_CDECL
|
||||||
|
|
||||||
|
/* The C ABI on macOS exports all symbols with a leading
|
||||||
|
* underscore. Without that prefix, the functions defined in
|
||||||
|
* the assembly files can't be found from C, and the constants
|
||||||
|
* defined in C (nttconsts.c) can't be found from assembly.
|
||||||
|
*
|
||||||
|
* The cdecl() macro adds that prefix on platforms that need it.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__APPLE__)
|
||||||
|
#define cdecl(s) _##s
|
||||||
|
#else
|
||||||
|
#define cdecl(s) s
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
@ -1,4 +1,5 @@
|
|||||||
.include "shuffle.inc"
|
.include "shuffle.inc"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
|
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
|
||||||
vpaddd %ymm2,%ymm\l0,%ymm12
|
vpaddd %ymm2,%ymm\l0,%ymm12
|
||||||
@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2
|
|||||||
vpsrlq $32,%ymm\h3,%ymm\h3
|
vpsrlq $32,%ymm\h3,%ymm\h3
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
|
||||||
PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx:
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm6
|
vmovdqa (%rsi),%ymm6
|
||||||
@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9
|
|||||||
vpsrlq $32,%ymm11,%ymm11
|
vpsrlq $32,%ymm11,%ymm11
|
||||||
|
|
||||||
level1:
|
level1:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
|
||||||
vpmovzxdq 64(%rdx),%ymm15
|
vpmovzxdq 64(%rdx),%ymm15
|
||||||
vpmovzxdq 80(%rdx),%ymm3
|
vpmovzxdq 80(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,8,9,6,7,10,11
|
butterfly 4,5,8,9,6,7,10,11
|
||||||
|
|
||||||
level2:
|
level2:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
|
||||||
vpmovzxdq 96(%rdx),%ymm3
|
vpmovzxdq 96(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
@ -130,7 +131,7 @@ shuffle4 8,9,6,9
|
|||||||
shuffle4 10,11,8,11
|
shuffle4 10,11,8,11
|
||||||
|
|
||||||
level3:
|
level3:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
|
||||||
vpbroadcastd 112(%rdx),%ymm14
|
vpbroadcastd 112(%rdx),%ymm14
|
||||||
vpbroadcastd 116(%rdx),%ymm15
|
vpbroadcastd 116(%rdx),%ymm15
|
||||||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10
|
vpblendd $0xF0,%ymm15,%ymm14,%ymm10
|
||||||
@ -144,7 +145,7 @@ shuffle8 5,7,6,7
|
|||||||
shuffle8 9,11,5,11
|
shuffle8 9,11,5,11
|
||||||
|
|
||||||
level4:
|
level4:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
|
||||||
vpbroadcastd 120(%rdx),%ymm9
|
vpbroadcastd 120(%rdx),%ymm9
|
||||||
|
|
||||||
butterfly 10,3,6,5,4,8,7,11,9,9
|
butterfly 10,3,6,5,4,8,7,11,9,9
|
||||||
@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi)
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
|
||||||
PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx:
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9
|
|||||||
vpsrlq $32,%ymm11,%ymm11
|
vpsrlq $32,%ymm11,%ymm11
|
||||||
|
|
||||||
level6:
|
level6:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
|
||||||
vpbroadcastd 16(%rdx),%ymm15
|
vpbroadcastd 16(%rdx),%ymm15
|
||||||
vpbroadcastd 20(%rdx),%ymm3
|
vpbroadcastd 20(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,8,9,6,7,10,11
|
butterfly 4,5,8,9,6,7,10,11
|
||||||
|
|
||||||
level7:
|
level7:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
|
||||||
vpbroadcastd 24(%rdx),%ymm3
|
vpbroadcastd 24(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
|
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
|
||||||
|
|
||||||
vpmuludq %ymm3,%ymm4,%ymm4
|
vpmuludq %ymm3,%ymm4,%ymm4
|
||||||
vpmuludq %ymm3,%ymm5,%ymm5
|
vpmuludq %ymm3,%ymm5,%ymm5
|
||||||
@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
|
|||||||
vpsrlq $32,%ymm7,%ymm7
|
vpsrlq $32,%ymm7,%ymm7
|
||||||
|
|
||||||
#store
|
#store
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_mask(%rip),%ymm3
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
|
||||||
vpermd %ymm4,%ymm3,%ymm4
|
vpermd %ymm4,%ymm3,%ymm4
|
||||||
vpermd %ymm5,%ymm3,%ymm5
|
vpermd %ymm5,%ymm3,%ymm5
|
||||||
vpermd %ymm6,%ymm3,%ymm6
|
vpermd %ymm6,%ymm3,%ymm6
|
@ -1,4 +1,5 @@
|
|||||||
.include "shuffle.inc"
|
.include "shuffle.inc"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
|
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
|
||||||
#mul
|
#mul
|
||||||
@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2
|
|||||||
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
|
||||||
PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx:
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
level0:
|
level0:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#zetas
|
||||||
vpbroadcastd (%rdx),%ymm3
|
vpbroadcastd (%rdx),%ymm3
|
||||||
|
|
||||||
#load
|
#load
|
||||||
@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi)
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
|
||||||
PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx:
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10
|
|||||||
vmovdqa 224(%rsi),%ymm11
|
vmovdqa 224(%rsi),%ymm11
|
||||||
|
|
||||||
level3:
|
level3:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#zetas
|
||||||
vpbroadcastd (%rdx),%ymm3
|
vpbroadcastd (%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11
|
butterfly 4,5,6,7,8,9,10,11
|
||||||
@ -128,7 +129,7 @@ shuffle8 7,11,6,11
|
|||||||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12
|
butterfly 3,8,4,9,5,10,6,11,12,12,12,12
|
||||||
|
|
||||||
level5:
|
level5:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 12(%rdx),%ymm12
|
vpmovzxdq 12(%rdx),%ymm12
|
||||||
|
|
||||||
shuffle4 3,5,7,5
|
shuffle4 3,5,7,5
|
||||||
@ -139,14 +140,14 @@ shuffle4 9,11,4,11
|
|||||||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12
|
butterfly 7,5,3,10,8,6,4,11,12,12,12,12
|
||||||
|
|
||||||
level6:
|
level6:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 28(%rdx),%ymm12
|
vpmovzxdq 28(%rdx),%ymm12
|
||||||
vpmovzxdq 44(%rdx),%ymm13
|
vpmovzxdq 44(%rdx),%ymm13
|
||||||
|
|
||||||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13
|
butterfly 7,5,8,6,3,10,4,11,12,12,13,13
|
||||||
|
|
||||||
level7:
|
level7:
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 60(%rdx),%ymm12
|
vpmovzxdq 60(%rdx),%ymm12
|
||||||
vpmovzxdq 76(%rdx),%ymm13
|
vpmovzxdq 76(%rdx),%ymm13
|
||||||
vpmovzxdq 92(%rdx),%ymm14
|
vpmovzxdq 92(%rdx),%ymm14
|
@ -1,10 +1,11 @@
|
|||||||
#include "params.h"
|
#include "params.h"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_avx
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
|
||||||
PQCLEAN_DILITHIUM2_AVX2_pointwise_avx:
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop1:
|
_looptop1:
|
||||||
@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4
|
|||||||
vpaddq %ymm9,%ymm5,%ymm5
|
vpaddq %ymm9,%ymm5,%ymm5
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
|
||||||
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx:
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop2:
|
_looptop2:
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
.global PQCLEAN_DILITHIUM3_AVX2_reduce_avx
|
#include "cdecl.inc"
|
||||||
PQCLEAN_DILITHIUM3_AVX2_reduce_avx:
|
|
||||||
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
|
||||||
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x23ones(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_rdc32:
|
_looptop_rdc32:
|
||||||
@ -46,10 +48,10 @@ jb _looptop_rdc32
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM3_AVX2_csubq_avx
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
|
||||||
PQCLEAN_DILITHIUM3_AVX2_csubq_avx:
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_csubq:
|
_looptop_csubq:
|
||||||
@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3
|
|||||||
vmovdqa 64(%rdi),%ymm5
|
vmovdqa 64(%rdi),%ymm5
|
||||||
vmovdqa 96(%rdi),%ymm7
|
vmovdqa 96(%rdi),%ymm7
|
||||||
|
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_csubq
|
#cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq)
|
||||||
vpsubd %ymm0,%ymm1,%ymm1
|
vpsubd %ymm0,%ymm1,%ymm1
|
||||||
vpsubd %ymm0,%ymm3,%ymm3
|
vpsubd %ymm0,%ymm3,%ymm3
|
||||||
vpsubd %ymm0,%ymm5,%ymm5
|
vpsubd %ymm0,%ymm5,%ymm5
|
@ -23,6 +23,7 @@ implementations:
|
|||||||
supported_platforms:
|
supported_platforms:
|
||||||
- architecture: x86_64
|
- architecture: x86_64
|
||||||
operating_systems:
|
operating_systems:
|
||||||
|
- Darwin
|
||||||
- Linux
|
- Linux
|
||||||
required_flags:
|
required_flags:
|
||||||
- avx2
|
- avx2
|
||||||
|
@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)
|
|||||||
%.o: %.c $(HEADERS)
|
%.o: %.c $(HEADERS)
|
||||||
$(CC) $(CFLAGS) -c -o $@ $<
|
$(CC) $(CFLAGS) -c -o $@ $<
|
||||||
|
|
||||||
%.o: %.s $(HEADERS)
|
|
||||||
$(AS) -o $@ $<
|
|
||||||
|
|
||||||
%.o: %.S $(HEADERS)
|
%.o: %.S $(HEADERS)
|
||||||
$(AS) -c -o $@ $<
|
$(CC) -c -o $@ $<
|
||||||
|
|
||||||
$(LIB): $(OBJECTS) $(KECCAK4X)
|
$(LIB): $(OBJECTS) $(KECCAK4X)
|
||||||
$(AR) -r $@ $^
|
$(AR) -r $@ $^
|
||||||
|
18
crypto_sign/dilithium3/avx2/cdecl.inc
Normal file
18
crypto_sign/dilithium3/avx2/cdecl.inc
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL
|
||||||
|
#define PQCLEAN_DILITHIUM3_AVX2_CDECL
|
||||||
|
|
||||||
|
/* The C ABI on macOS exports all symbols with a leading
|
||||||
|
* underscore. Without that prefix, the functions defined in
|
||||||
|
* the assembly files can't be found from C, and the constants
|
||||||
|
* defined in C (nttconsts.c) can't be found from assembly.
|
||||||
|
*
|
||||||
|
* The cdecl() macro adds that prefix on platforms that need it.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__APPLE__)
|
||||||
|
#define cdecl(s) _##s
|
||||||
|
#else
|
||||||
|
#define cdecl(s) s
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
@ -1,4 +1,5 @@
|
|||||||
.include "shuffle.inc"
|
.include "shuffle.inc"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
|
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
|
||||||
vpaddd %ymm2,%ymm\l0,%ymm12
|
vpaddd %ymm2,%ymm\l0,%ymm12
|
||||||
@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2
|
|||||||
vpsrlq $32,%ymm\h3,%ymm\h3
|
vpsrlq $32,%ymm\h3,%ymm\h3
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
|
||||||
PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx:
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm6
|
vmovdqa (%rsi),%ymm6
|
||||||
@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9
|
|||||||
vpsrlq $32,%ymm11,%ymm11
|
vpsrlq $32,%ymm11,%ymm11
|
||||||
|
|
||||||
level1:
|
level1:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
|
||||||
vpmovzxdq 64(%rdx),%ymm15
|
vpmovzxdq 64(%rdx),%ymm15
|
||||||
vpmovzxdq 80(%rdx),%ymm3
|
vpmovzxdq 80(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,8,9,6,7,10,11
|
butterfly 4,5,8,9,6,7,10,11
|
||||||
|
|
||||||
level2:
|
level2:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
|
||||||
vpmovzxdq 96(%rdx),%ymm3
|
vpmovzxdq 96(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
@ -130,7 +131,7 @@ shuffle4 8,9,6,9
|
|||||||
shuffle4 10,11,8,11
|
shuffle4 10,11,8,11
|
||||||
|
|
||||||
level3:
|
level3:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
|
||||||
vpbroadcastd 112(%rdx),%ymm14
|
vpbroadcastd 112(%rdx),%ymm14
|
||||||
vpbroadcastd 116(%rdx),%ymm15
|
vpbroadcastd 116(%rdx),%ymm15
|
||||||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10
|
vpblendd $0xF0,%ymm15,%ymm14,%ymm10
|
||||||
@ -144,7 +145,7 @@ shuffle8 5,7,6,7
|
|||||||
shuffle8 9,11,5,11
|
shuffle8 9,11,5,11
|
||||||
|
|
||||||
level4:
|
level4:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
|
||||||
vpbroadcastd 120(%rdx),%ymm9
|
vpbroadcastd 120(%rdx),%ymm9
|
||||||
|
|
||||||
butterfly 10,3,6,5,4,8,7,11,9,9
|
butterfly 10,3,6,5,4,8,7,11,9,9
|
||||||
@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi)
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
|
||||||
PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx:
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9
|
|||||||
vpsrlq $32,%ymm11,%ymm11
|
vpsrlq $32,%ymm11,%ymm11
|
||||||
|
|
||||||
level6:
|
level6:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
|
||||||
vpbroadcastd 16(%rdx),%ymm15
|
vpbroadcastd 16(%rdx),%ymm15
|
||||||
vpbroadcastd 20(%rdx),%ymm3
|
vpbroadcastd 20(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,8,9,6,7,10,11
|
butterfly 4,5,8,9,6,7,10,11
|
||||||
|
|
||||||
level7:
|
level7:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
|
||||||
vpbroadcastd 24(%rdx),%ymm3
|
vpbroadcastd 24(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
|
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xdiv(%rip),%ymm3
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
|
||||||
|
|
||||||
vpmuludq %ymm3,%ymm4,%ymm4
|
vpmuludq %ymm3,%ymm4,%ymm4
|
||||||
vpmuludq %ymm3,%ymm5,%ymm5
|
vpmuludq %ymm3,%ymm5,%ymm5
|
||||||
@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
|
|||||||
vpsrlq $32,%ymm7,%ymm7
|
vpsrlq $32,%ymm7,%ymm7
|
||||||
|
|
||||||
#store
|
#store
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_mask(%rip),%ymm3
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
|
||||||
vpermd %ymm4,%ymm3,%ymm4
|
vpermd %ymm4,%ymm3,%ymm4
|
||||||
vpermd %ymm5,%ymm3,%ymm5
|
vpermd %ymm5,%ymm3,%ymm5
|
||||||
vpermd %ymm6,%ymm3,%ymm6
|
vpermd %ymm6,%ymm3,%ymm6
|
@ -1,4 +1,5 @@
|
|||||||
.include "shuffle.inc"
|
.include "shuffle.inc"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
|
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
|
||||||
#mul
|
#mul
|
||||||
@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2
|
|||||||
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
|
||||||
PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx:
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
level0:
|
level0:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#zetas
|
||||||
vpbroadcastd (%rdx),%ymm3
|
vpbroadcastd (%rdx),%ymm3
|
||||||
|
|
||||||
#load
|
#load
|
||||||
@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi)
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
|
||||||
PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx:
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10
|
|||||||
vmovdqa 224(%rsi),%ymm11
|
vmovdqa 224(%rsi),%ymm11
|
||||||
|
|
||||||
level3:
|
level3:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#zetas
|
||||||
vpbroadcastd (%rdx),%ymm3
|
vpbroadcastd (%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11
|
butterfly 4,5,6,7,8,9,10,11
|
||||||
@ -128,7 +129,7 @@ shuffle8 7,11,6,11
|
|||||||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12
|
butterfly 3,8,4,9,5,10,6,11,12,12,12,12
|
||||||
|
|
||||||
level5:
|
level5:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 12(%rdx),%ymm12
|
vpmovzxdq 12(%rdx),%ymm12
|
||||||
|
|
||||||
shuffle4 3,5,7,5
|
shuffle4 3,5,7,5
|
||||||
@ -139,14 +140,14 @@ shuffle4 9,11,4,11
|
|||||||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12
|
butterfly 7,5,3,10,8,6,4,11,12,12,12,12
|
||||||
|
|
||||||
level6:
|
level6:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 28(%rdx),%ymm12
|
vpmovzxdq 28(%rdx),%ymm12
|
||||||
vpmovzxdq 44(%rdx),%ymm13
|
vpmovzxdq 44(%rdx),%ymm13
|
||||||
|
|
||||||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13
|
butterfly 7,5,8,6,3,10,4,11,12,12,13,13
|
||||||
|
|
||||||
level7:
|
level7:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 60(%rdx),%ymm12
|
vpmovzxdq 60(%rdx),%ymm12
|
||||||
vpmovzxdq 76(%rdx),%ymm13
|
vpmovzxdq 76(%rdx),%ymm13
|
||||||
vpmovzxdq 92(%rdx),%ymm14
|
vpmovzxdq 92(%rdx),%ymm14
|
@ -1,10 +1,11 @@
|
|||||||
#include "params.h"
|
#include "params.h"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_avx
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
|
||||||
PQCLEAN_DILITHIUM3_AVX2_pointwise_avx:
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop1:
|
_looptop1:
|
||||||
@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4
|
|||||||
vpaddq %ymm9,%ymm5,%ymm5
|
vpaddq %ymm9,%ymm5,%ymm5
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
|
||||||
PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx:
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop2:
|
_looptop2:
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
.global PQCLEAN_DILITHIUM4_AVX2_reduce_avx
|
#include "cdecl.inc"
|
||||||
PQCLEAN_DILITHIUM4_AVX2_reduce_avx:
|
|
||||||
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
|
||||||
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x23ones(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_rdc32:
|
_looptop_rdc32:
|
||||||
@ -46,10 +48,10 @@ jb _looptop_rdc32
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM4_AVX2_csubq_avx
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
|
||||||
PQCLEAN_DILITHIUM4_AVX2_csubq_avx:
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_csubq:
|
_looptop_csubq:
|
||||||
@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3
|
|||||||
vmovdqa 64(%rdi),%ymm5
|
vmovdqa 64(%rdi),%ymm5
|
||||||
vmovdqa 96(%rdi),%ymm7
|
vmovdqa 96(%rdi),%ymm7
|
||||||
|
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_csubq
|
#cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq)
|
||||||
vpsubd %ymm0,%ymm1,%ymm1
|
vpsubd %ymm0,%ymm1,%ymm1
|
||||||
vpsubd %ymm0,%ymm3,%ymm3
|
vpsubd %ymm0,%ymm3,%ymm3
|
||||||
vpsubd %ymm0,%ymm5,%ymm5
|
vpsubd %ymm0,%ymm5,%ymm5
|
@ -23,6 +23,7 @@ implementations:
|
|||||||
supported_platforms:
|
supported_platforms:
|
||||||
- architecture: x86_64
|
- architecture: x86_64
|
||||||
operating_systems:
|
operating_systems:
|
||||||
|
- Darwin
|
||||||
- Linux
|
- Linux
|
||||||
required_flags:
|
required_flags:
|
||||||
- avx2
|
- avx2
|
||||||
|
@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)
|
|||||||
%.o: %.c $(HEADERS)
|
%.o: %.c $(HEADERS)
|
||||||
$(CC) $(CFLAGS) -c -o $@ $<
|
$(CC) $(CFLAGS) -c -o $@ $<
|
||||||
|
|
||||||
%.o: %.s $(HEADERS)
|
|
||||||
$(AS) -o $@ $<
|
|
||||||
|
|
||||||
%.o: %.S $(HEADERS)
|
%.o: %.S $(HEADERS)
|
||||||
$(AS) -c -o $@ $<
|
$(CC) -c -o $@ $<
|
||||||
|
|
||||||
$(LIB): $(OBJECTS) $(KECCAK4X)
|
$(LIB): $(OBJECTS) $(KECCAK4X)
|
||||||
$(AR) -r $@ $^
|
$(AR) -r $@ $^
|
||||||
|
18
crypto_sign/dilithium4/avx2/cdecl.inc
Normal file
18
crypto_sign/dilithium4/avx2/cdecl.inc
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#ifndef PQCLEAN_DILITHIUM4_AVX2_CDECL
|
||||||
|
#define PQCLEAN_DILITHIUM4_AVX2_CDECL
|
||||||
|
|
||||||
|
/* The C ABI on macOS exports all symbols with a leading
|
||||||
|
* underscore. Without that prefix, the functions defined in
|
||||||
|
* the assembly files can't be found from C, and the constants
|
||||||
|
* defined in C (nttconsts.c) can't be found from assembly.
|
||||||
|
*
|
||||||
|
* The cdecl() macro adds that prefix on platforms that need it.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__APPLE__)
|
||||||
|
#define cdecl(s) _##s
|
||||||
|
#else
|
||||||
|
#define cdecl(s) s
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
@ -1,4 +1,5 @@
|
|||||||
.include "shuffle.inc"
|
.include "shuffle.inc"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
|
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
|
||||||
vpaddd %ymm2,%ymm\l0,%ymm12
|
vpaddd %ymm2,%ymm\l0,%ymm12
|
||||||
@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2
|
|||||||
vpsrlq $32,%ymm\h3,%ymm\h3
|
vpsrlq $32,%ymm\h3,%ymm\h3
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
|
||||||
PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx:
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm6
|
vmovdqa (%rsi),%ymm6
|
||||||
@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9
|
|||||||
vpsrlq $32,%ymm11,%ymm11
|
vpsrlq $32,%ymm11,%ymm11
|
||||||
|
|
||||||
level1:
|
level1:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
|
||||||
vpmovzxdq 64(%rdx),%ymm15
|
vpmovzxdq 64(%rdx),%ymm15
|
||||||
vpmovzxdq 80(%rdx),%ymm3
|
vpmovzxdq 80(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,8,9,6,7,10,11
|
butterfly 4,5,8,9,6,7,10,11
|
||||||
|
|
||||||
level2:
|
level2:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
|
||||||
vpmovzxdq 96(%rdx),%ymm3
|
vpmovzxdq 96(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
@ -130,7 +131,7 @@ shuffle4 8,9,6,9
|
|||||||
shuffle4 10,11,8,11
|
shuffle4 10,11,8,11
|
||||||
|
|
||||||
level3:
|
level3:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
|
||||||
vpbroadcastd 112(%rdx),%ymm14
|
vpbroadcastd 112(%rdx),%ymm14
|
||||||
vpbroadcastd 116(%rdx),%ymm15
|
vpbroadcastd 116(%rdx),%ymm15
|
||||||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10
|
vpblendd $0xF0,%ymm15,%ymm14,%ymm10
|
||||||
@ -144,7 +145,7 @@ shuffle8 5,7,6,7
|
|||||||
shuffle8 9,11,5,11
|
shuffle8 9,11,5,11
|
||||||
|
|
||||||
level4:
|
level4:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
|
||||||
vpbroadcastd 120(%rdx),%ymm9
|
vpbroadcastd 120(%rdx),%ymm9
|
||||||
|
|
||||||
butterfly 10,3,6,5,4,8,7,11,9,9
|
butterfly 10,3,6,5,4,8,7,11,9,9
|
||||||
@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi)
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
|
||||||
PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx:
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9
|
|||||||
vpsrlq $32,%ymm11,%ymm11
|
vpsrlq $32,%ymm11,%ymm11
|
||||||
|
|
||||||
level6:
|
level6:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
|
||||||
vpbroadcastd 16(%rdx),%ymm15
|
vpbroadcastd 16(%rdx),%ymm15
|
||||||
vpbroadcastd 20(%rdx),%ymm3
|
vpbroadcastd 20(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,8,9,6,7,10,11
|
butterfly 4,5,8,9,6,7,10,11
|
||||||
|
|
||||||
level7:
|
level7:
|
||||||
#PQCLEAN_DILITHIUM3_AVX2_zetas
|
#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
|
||||||
vpbroadcastd 24(%rdx),%ymm3
|
vpbroadcastd 24(%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
|
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
|
||||||
|
|
||||||
vpmuludq %ymm3,%ymm4,%ymm4
|
vpmuludq %ymm3,%ymm4,%ymm4
|
||||||
vpmuludq %ymm3,%ymm5,%ymm5
|
vpmuludq %ymm3,%ymm5,%ymm5
|
||||||
@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
|
|||||||
vpsrlq $32,%ymm7,%ymm7
|
vpsrlq $32,%ymm7,%ymm7
|
||||||
|
|
||||||
#store
|
#store
|
||||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_mask(%rip),%ymm3
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
|
||||||
vpermd %ymm4,%ymm3,%ymm4
|
vpermd %ymm4,%ymm3,%ymm4
|
||||||
vpermd %ymm5,%ymm3,%ymm5
|
vpermd %ymm5,%ymm3,%ymm5
|
||||||
vpermd %ymm6,%ymm3,%ymm6
|
vpermd %ymm6,%ymm3,%ymm6
|
@ -1,4 +1,5 @@
|
|||||||
.include "shuffle.inc"
|
.include "shuffle.inc"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
|
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
|
||||||
#mul
|
#mul
|
||||||
@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2
|
|||||||
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
|
||||||
PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx:
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
level0:
|
level0:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#zetas
|
||||||
vpbroadcastd (%rdx),%ymm3
|
vpbroadcastd (%rdx),%ymm3
|
||||||
|
|
||||||
#load
|
#load
|
||||||
@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi)
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
|
||||||
PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx:
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10
|
|||||||
vmovdqa 224(%rsi),%ymm11
|
vmovdqa 224(%rsi),%ymm11
|
||||||
|
|
||||||
level3:
|
level3:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#zetas
|
||||||
vpbroadcastd (%rdx),%ymm3
|
vpbroadcastd (%rdx),%ymm3
|
||||||
|
|
||||||
butterfly 4,5,6,7,8,9,10,11
|
butterfly 4,5,6,7,8,9,10,11
|
||||||
@ -128,7 +129,7 @@ shuffle8 7,11,6,11
|
|||||||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12
|
butterfly 3,8,4,9,5,10,6,11,12,12,12,12
|
||||||
|
|
||||||
level5:
|
level5:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 12(%rdx),%ymm12
|
vpmovzxdq 12(%rdx),%ymm12
|
||||||
|
|
||||||
shuffle4 3,5,7,5
|
shuffle4 3,5,7,5
|
||||||
@ -139,14 +140,14 @@ shuffle4 9,11,4,11
|
|||||||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12
|
butterfly 7,5,3,10,8,6,4,11,12,12,12,12
|
||||||
|
|
||||||
level6:
|
level6:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 28(%rdx),%ymm12
|
vpmovzxdq 28(%rdx),%ymm12
|
||||||
vpmovzxdq 44(%rdx),%ymm13
|
vpmovzxdq 44(%rdx),%ymm13
|
||||||
|
|
||||||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13
|
butterfly 7,5,8,6,3,10,4,11,12,12,13,13
|
||||||
|
|
||||||
level7:
|
level7:
|
||||||
#PQCLEAN_DILITHIUM4_AVX2_zetas
|
#zetas
|
||||||
vpmovzxdq 60(%rdx),%ymm12
|
vpmovzxdq 60(%rdx),%ymm12
|
||||||
vpmovzxdq 76(%rdx),%ymm13
|
vpmovzxdq 76(%rdx),%ymm13
|
||||||
vpmovzxdq 92(%rdx),%ymm14
|
vpmovzxdq 92(%rdx),%ymm14
|
@ -1,10 +1,11 @@
|
|||||||
#include "params.h"
|
#include "params.h"
|
||||||
|
#include "cdecl.inc"
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM4_AVX2_pointwise_avx
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
|
||||||
PQCLEAN_DILITHIUM4_AVX2_pointwise_avx:
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop1:
|
_looptop1:
|
||||||
@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4
|
|||||||
vpaddq %ymm9,%ymm5,%ymm5
|
vpaddq %ymm9,%ymm5,%ymm5
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
|
||||||
PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx:
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop2:
|
_looptop2:
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
.global PQCLEAN_DILITHIUM2_AVX2_reduce_avx
|
#include "cdecl.inc"
|
||||||
PQCLEAN_DILITHIUM2_AVX2_reduce_avx:
|
|
||||||
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
|
||||||
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x23ones(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_rdc32:
|
_looptop_rdc32:
|
||||||
@ -46,10 +48,10 @@ jb _looptop_rdc32
|
|||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.global PQCLEAN_DILITHIUM2_AVX2_csubq_avx
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
|
||||||
PQCLEAN_DILITHIUM2_AVX2_csubq_avx:
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm0
|
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_csubq:
|
_looptop_csubq:
|
||||||
@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3
|
|||||||
vmovdqa 64(%rdi),%ymm5
|
vmovdqa 64(%rdi),%ymm5
|
||||||
vmovdqa 96(%rdi),%ymm7
|
vmovdqa 96(%rdi),%ymm7
|
||||||
|
|
||||||
#PQCLEAN_DILITHIUM2_AVX2_csubq
|
#cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq)
|
||||||
vpsubd %ymm0,%ymm1,%ymm1
|
vpsubd %ymm0,%ymm1,%ymm1
|
||||||
vpsubd %ymm0,%ymm3,%ymm3
|
vpsubd %ymm0,%ymm3,%ymm3
|
||||||
vpsubd %ymm0,%ymm5,%ymm5
|
vpsubd %ymm0,%ymm5,%ymm5
|
Loading…
Reference in New Issue
Block a user