1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-22 23:48:58 +00:00

Add MacOS support for Dilithium

This commit is contained in:
Thom Wiggers 2019-12-11 15:06:02 +01:00 committed by Kris Kwiatkowski
parent ebb416a2ba
commit 44b0522070
21 changed files with 219 additions and 156 deletions

View File

@ -23,6 +23,7 @@ implementations:
supported_platforms: supported_platforms:
- architecture: x86_64 - architecture: x86_64
operating_systems: operating_systems:
- Darwin
- Linux - Linux
required_flags: required_flags:
- avx2 - avx2

View File

@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)
%.o: %.c $(HEADERS) %.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $< $(CC) $(CFLAGS) -c -o $@ $<
%.o: %.s $(HEADERS)
$(AS) -o $@ $<
%.o: %.S $(HEADERS) %.o: %.S $(HEADERS)
$(AS) -c -o $@ $< $(CC) -c -o $@ $<
$(LIB): $(OBJECTS) $(KECCAK4X) $(LIB): $(OBJECTS) $(KECCAK4X)
$(AR) -r $@ $^ $(AR) -r $@ $^

View File

@ -0,0 +1,18 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL
#define PQCLEAN_DILITHIUM2_AVX2_CDECL

/*
 * cdecl(s) expands to the assembler-level spelling of the C symbol s.
 *
 * Some platforms decorate every C symbol with a prefix at the object-file
 * level (macOS prepends an underscore), others leave the name untouched.
 * The hand-written assembly both exports functions that C code calls and
 * loads constants that are defined in C (nttconsts.c), so it has to use
 * the decorated spelling on both sides or the linker cannot match them.
 *
 * GCC and Clang publish the active prefix as __USER_LABEL_PREFIX__, so
 * that is used whenever it is available instead of guessing from OS
 * macros. In particular 64-bit Windows targets define __WIN32__ but do
 * not decorate C symbols, so the OS-based guess would be wrong there.
 *
 * The paste goes through a second macro level so that
 * __USER_LABEL_PREFIX__ is expanded before ## is applied.
 */
#if defined(__USER_LABEL_PREFIX__)
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_PASTE(p, s) p##s
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_EXPAND(p, s) PQCLEAN_DILITHIUM2_AVX2_CDECL_PASTE(p, s)
#define cdecl(s) PQCLEAN_DILITHIUM2_AVX2_CDECL_EXPAND(__USER_LABEL_PREFIX__, s)
#elif defined(__WIN32__) || defined(__APPLE__)
/* Fallback for preprocessors that do not provide __USER_LABEL_PREFIX__. */
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif

View File

@ -1,4 +1,5 @@
.include "shuffle.inc" .include "shuffle.inc"
#include "cdecl.inc"
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
vpaddd %ymm2,%ymm\l0,%ymm12 vpaddd %ymm2,%ymm\l0,%ymm12
@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2
vpsrlq $32,%ymm\h3,%ymm\h3 vpsrlq $32,%ymm\h3,%ymm\h3
.endm .endm
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx: cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm6 vmovdqa (%rsi),%ymm6
@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11 vpsrlq $32,%ymm11,%ymm11
level1: level1:
#PQCLEAN_DILITHIUM2_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpmovzxdq 64(%rdx),%ymm15 vpmovzxdq 64(%rdx),%ymm15
vpmovzxdq 80(%rdx),%ymm3 vpmovzxdq 80(%rdx),%ymm3
butterfly 4,5,8,9,6,7,10,11 butterfly 4,5,8,9,6,7,10,11
level2: level2:
#PQCLEAN_DILITHIUM2_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpmovzxdq 96(%rdx),%ymm3 vpmovzxdq 96(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3 butterfly 4,5,6,7,8,9,10,11,3,3
@ -130,7 +131,7 @@ shuffle4 8,9,6,9
shuffle4 10,11,8,11 shuffle4 10,11,8,11
level3: level3:
#PQCLEAN_DILITHIUM2_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 112(%rdx),%ymm14
vpbroadcastd 116(%rdx),%ymm15 vpbroadcastd 116(%rdx),%ymm15
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 vpblendd $0xF0,%ymm15,%ymm14,%ymm10
@ -144,7 +145,7 @@ shuffle8 5,7,6,7
shuffle8 9,11,5,11 shuffle8 9,11,5,11
level4: level4:
#PQCLEAN_DILITHIUM2_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 120(%rdx),%ymm9 vpbroadcastd 120(%rdx),%ymm9
butterfly 10,3,6,5,4,8,7,11,9,9 butterfly 10,3,6,5,4,8,7,11,9,9
@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi)
ret ret
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx: cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm4 vmovdqa (%rsi),%ymm4
@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11 vpsrlq $32,%ymm11,%ymm11
level6: level6:
#PQCLEAN_DILITHIUM2_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 16(%rdx),%ymm15 vpbroadcastd 16(%rdx),%ymm15
vpbroadcastd 20(%rdx),%ymm3 vpbroadcastd 20(%rdx),%ymm3
butterfly 4,5,8,9,6,7,10,11 butterfly 4,5,8,9,6,7,10,11
level7: level7:
#PQCLEAN_DILITHIUM2_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas)
vpbroadcastd 24(%rdx),%ymm3 vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3 butterfly 4,5,6,7,8,9,10,11,3,3
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5 vpmuludq %ymm3,%ymm5,%ymm5
@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7 vpsrlq $32,%ymm7,%ymm7
#store #store
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_mask(%rip),%ymm3 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6 vpermd %ymm6,%ymm3,%ymm6

View File

@ -1,4 +1,5 @@
.include "shuffle.inc" .include "shuffle.inc"
#include "cdecl.inc"
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
#mul #mul
@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3 vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm .endm
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx: cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
level0: level0:
#PQCLEAN_DILITHIUM2_AVX2_zetas #zetas
vpbroadcastd (%rdx),%ymm3 vpbroadcastd (%rdx),%ymm3
#load #load
@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi)
ret ret
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx: cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm4 vmovdqa (%rsi),%ymm4
@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10
vmovdqa 224(%rsi),%ymm11 vmovdqa 224(%rsi),%ymm11
level3: level3:
#PQCLEAN_DILITHIUM2_AVX2_zetas #zetas
vpbroadcastd (%rdx),%ymm3 vpbroadcastd (%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11 butterfly 4,5,6,7,8,9,10,11
@ -128,7 +129,7 @@ shuffle8 7,11,6,11
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 butterfly 3,8,4,9,5,10,6,11,12,12,12,12
level5: level5:
#PQCLEAN_DILITHIUM2_AVX2_zetas #zetas
vpmovzxdq 12(%rdx),%ymm12 vpmovzxdq 12(%rdx),%ymm12
shuffle4 3,5,7,5 shuffle4 3,5,7,5
@ -139,14 +140,14 @@ shuffle4 9,11,4,11
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 butterfly 7,5,3,10,8,6,4,11,12,12,12,12
level6: level6:
#PQCLEAN_DILITHIUM2_AVX2_zetas #zetas
vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 28(%rdx),%ymm12
vpmovzxdq 44(%rdx),%ymm13 vpmovzxdq 44(%rdx),%ymm13
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 butterfly 7,5,8,6,3,10,4,11,12,12,13,13
level7: level7:
#PQCLEAN_DILITHIUM2_AVX2_zetas #zetas
vpmovzxdq 60(%rdx),%ymm12 vpmovzxdq 60(%rdx),%ymm12
vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 76(%rdx),%ymm13
vpmovzxdq 92(%rdx),%ymm14 vpmovzxdq 92(%rdx),%ymm14

View File

@ -1,10 +1,11 @@
#include "params.h" #include "params.h"
#include "cdecl.inc"
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_avx .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
PQCLEAN_DILITHIUM2_AVX2_pointwise_avx: cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax xor %eax,%eax
_looptop1: _looptop1:
@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5 vpaddq %ymm9,%ymm5,%ymm5
.endm .endm
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx: cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax xor %eax,%eax
_looptop2: _looptop2:

View File

@ -1,7 +1,9 @@
.global PQCLEAN_DILITHIUM3_AVX2_reduce_avx #include "cdecl.inc"
PQCLEAN_DILITHIUM3_AVX2_reduce_avx:
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x23ones(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
xor %eax,%eax xor %eax,%eax
_looptop_rdc32: _looptop_rdc32:
@ -46,10 +48,10 @@ jb _looptop_rdc32
ret ret
.global PQCLEAN_DILITHIUM3_AVX2_csubq_avx .global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
PQCLEAN_DILITHIUM3_AVX2_csubq_avx: cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
xor %eax,%eax xor %eax,%eax
_looptop_csubq: _looptop_csubq:
@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5 vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7 vmovdqa 96(%rdi),%ymm7
#PQCLEAN_DILITHIUM3_AVX2_csubq #cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq)
vpsubd %ymm0,%ymm1,%ymm1 vpsubd %ymm0,%ymm1,%ymm1
vpsubd %ymm0,%ymm3,%ymm3 vpsubd %ymm0,%ymm3,%ymm3
vpsubd %ymm0,%ymm5,%ymm5 vpsubd %ymm0,%ymm5,%ymm5

View File

@ -23,6 +23,7 @@ implementations:
supported_platforms: supported_platforms:
- architecture: x86_64 - architecture: x86_64
operating_systems: operating_systems:
- Darwin
- Linux - Linux
required_flags: required_flags:
- avx2 - avx2

View File

@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)
%.o: %.c $(HEADERS) %.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $< $(CC) $(CFLAGS) -c -o $@ $<
%.o: %.s $(HEADERS)
$(AS) -o $@ $<
%.o: %.S $(HEADERS) %.o: %.S $(HEADERS)
$(AS) -c -o $@ $< $(CC) -c -o $@ $<
$(LIB): $(OBJECTS) $(KECCAK4X) $(LIB): $(OBJECTS) $(KECCAK4X)
$(AR) -r $@ $^ $(AR) -r $@ $^

View File

@ -0,0 +1,18 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL
#define PQCLEAN_DILITHIUM3_AVX2_CDECL

/*
 * cdecl(s) expands to the assembler-level spelling of the C symbol s.
 *
 * Some platforms decorate every C symbol with a prefix at the object-file
 * level (macOS prepends an underscore), others leave the name untouched.
 * The hand-written assembly both exports functions that C code calls and
 * loads constants that are defined in C (nttconsts.c), so it has to use
 * the decorated spelling on both sides or the linker cannot match them.
 *
 * GCC and Clang publish the active prefix as __USER_LABEL_PREFIX__, so
 * that is used whenever it is available instead of guessing from OS
 * macros. In particular 64-bit Windows targets define __WIN32__ but do
 * not decorate C symbols, so the OS-based guess would be wrong there.
 *
 * The paste goes through a second macro level so that
 * __USER_LABEL_PREFIX__ is expanded before ## is applied.
 */
#if defined(__USER_LABEL_PREFIX__)
#define PQCLEAN_DILITHIUM3_AVX2_CDECL_PASTE(p, s) p##s
#define PQCLEAN_DILITHIUM3_AVX2_CDECL_EXPAND(p, s) PQCLEAN_DILITHIUM3_AVX2_CDECL_PASTE(p, s)
#define cdecl(s) PQCLEAN_DILITHIUM3_AVX2_CDECL_EXPAND(__USER_LABEL_PREFIX__, s)
#elif defined(__WIN32__) || defined(__APPLE__)
/* Fallback for preprocessors that do not provide __USER_LABEL_PREFIX__. */
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif

View File

@ -1,4 +1,5 @@
.include "shuffle.inc" .include "shuffle.inc"
#include "cdecl.inc"
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
vpaddd %ymm2,%ymm\l0,%ymm12 vpaddd %ymm2,%ymm\l0,%ymm12
@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2
vpsrlq $32,%ymm\h3,%ymm\h3 vpsrlq $32,%ymm\h3,%ymm\h3
.endm .endm
.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx: cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm6 vmovdqa (%rsi),%ymm6
@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11 vpsrlq $32,%ymm11,%ymm11
level1: level1:
#PQCLEAN_DILITHIUM4_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
vpmovzxdq 64(%rdx),%ymm15 vpmovzxdq 64(%rdx),%ymm15
vpmovzxdq 80(%rdx),%ymm3 vpmovzxdq 80(%rdx),%ymm3
butterfly 4,5,8,9,6,7,10,11 butterfly 4,5,8,9,6,7,10,11
level2: level2:
#PQCLEAN_DILITHIUM4_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
vpmovzxdq 96(%rdx),%ymm3 vpmovzxdq 96(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3 butterfly 4,5,6,7,8,9,10,11,3,3
@ -130,7 +131,7 @@ shuffle4 8,9,6,9
shuffle4 10,11,8,11 shuffle4 10,11,8,11
level3: level3:
#PQCLEAN_DILITHIUM4_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 112(%rdx),%ymm14
vpbroadcastd 116(%rdx),%ymm15 vpbroadcastd 116(%rdx),%ymm15
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 vpblendd $0xF0,%ymm15,%ymm14,%ymm10
@ -144,7 +145,7 @@ shuffle8 5,7,6,7
shuffle8 9,11,5,11 shuffle8 9,11,5,11
level4: level4:
#PQCLEAN_DILITHIUM4_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
vpbroadcastd 120(%rdx),%ymm9 vpbroadcastd 120(%rdx),%ymm9
butterfly 10,3,6,5,4,8,7,11,9,9 butterfly 10,3,6,5,4,8,7,11,9,9
@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi)
ret ret
.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx: cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm4 vmovdqa (%rsi),%ymm4
@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11 vpsrlq $32,%ymm11,%ymm11
level6: level6:
#PQCLEAN_DILITHIUM4_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
vpbroadcastd 16(%rdx),%ymm15 vpbroadcastd 16(%rdx),%ymm15
vpbroadcastd 20(%rdx),%ymm3 vpbroadcastd 20(%rdx),%ymm3
butterfly 4,5,8,9,6,7,10,11 butterfly 4,5,8,9,6,7,10,11
level7: level7:
#PQCLEAN_DILITHIUM4_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas)
vpbroadcastd 24(%rdx),%ymm3 vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3 butterfly 4,5,6,7,8,9,10,11,3,3
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xdiv(%rip),%ymm3 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5 vpmuludq %ymm3,%ymm5,%ymm5
@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7 vpsrlq $32,%ymm7,%ymm7
#store #store
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_mask(%rip),%ymm3 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6 vpermd %ymm6,%ymm3,%ymm6

View File

@ -1,4 +1,5 @@
.include "shuffle.inc" .include "shuffle.inc"
#include "cdecl.inc"
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
#mul #mul
@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3 vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm .endm
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx: cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
level0: level0:
#PQCLEAN_DILITHIUM3_AVX2_zetas #zetas
vpbroadcastd (%rdx),%ymm3 vpbroadcastd (%rdx),%ymm3
#load #load
@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi)
ret ret
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx: cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm4 vmovdqa (%rsi),%ymm4
@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10
vmovdqa 224(%rsi),%ymm11 vmovdqa 224(%rsi),%ymm11
level3: level3:
#PQCLEAN_DILITHIUM3_AVX2_zetas #zetas
vpbroadcastd (%rdx),%ymm3 vpbroadcastd (%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11 butterfly 4,5,6,7,8,9,10,11
@ -128,7 +129,7 @@ shuffle8 7,11,6,11
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 butterfly 3,8,4,9,5,10,6,11,12,12,12,12
level5: level5:
#PQCLEAN_DILITHIUM3_AVX2_zetas #zetas
vpmovzxdq 12(%rdx),%ymm12 vpmovzxdq 12(%rdx),%ymm12
shuffle4 3,5,7,5 shuffle4 3,5,7,5
@ -139,14 +140,14 @@ shuffle4 9,11,4,11
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 butterfly 7,5,3,10,8,6,4,11,12,12,12,12
level6: level6:
#PQCLEAN_DILITHIUM3_AVX2_zetas #zetas
vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 28(%rdx),%ymm12
vpmovzxdq 44(%rdx),%ymm13 vpmovzxdq 44(%rdx),%ymm13
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 butterfly 7,5,8,6,3,10,4,11,12,12,13,13
level7: level7:
#PQCLEAN_DILITHIUM3_AVX2_zetas #zetas
vpmovzxdq 60(%rdx),%ymm12 vpmovzxdq 60(%rdx),%ymm12
vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 76(%rdx),%ymm13
vpmovzxdq 92(%rdx),%ymm14 vpmovzxdq 92(%rdx),%ymm14

View File

@ -1,10 +1,11 @@
#include "params.h" #include "params.h"
#include "cdecl.inc"
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_avx .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
PQCLEAN_DILITHIUM3_AVX2_pointwise_avx: cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax xor %eax,%eax
_looptop1: _looptop1:
@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5 vpaddq %ymm9,%ymm5,%ymm5
.endm .endm
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx: cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax xor %eax,%eax
_looptop2: _looptop2:

View File

@ -1,7 +1,9 @@
.global PQCLEAN_DILITHIUM4_AVX2_reduce_avx #include "cdecl.inc"
PQCLEAN_DILITHIUM4_AVX2_reduce_avx:
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x23ones(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
xor %eax,%eax xor %eax,%eax
_looptop_rdc32: _looptop_rdc32:
@ -46,10 +48,10 @@ jb _looptop_rdc32
ret ret
.global PQCLEAN_DILITHIUM4_AVX2_csubq_avx .global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
PQCLEAN_DILITHIUM4_AVX2_csubq_avx: cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
xor %eax,%eax xor %eax,%eax
_looptop_csubq: _looptop_csubq:
@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5 vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7 vmovdqa 96(%rdi),%ymm7
#PQCLEAN_DILITHIUM4_AVX2_csubq #cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq)
vpsubd %ymm0,%ymm1,%ymm1 vpsubd %ymm0,%ymm1,%ymm1
vpsubd %ymm0,%ymm3,%ymm3 vpsubd %ymm0,%ymm3,%ymm3
vpsubd %ymm0,%ymm5,%ymm5 vpsubd %ymm0,%ymm5,%ymm5

View File

@ -23,6 +23,7 @@ implementations:
supported_platforms: supported_platforms:
- architecture: x86_64 - architecture: x86_64
operating_systems: operating_systems:
- Darwin
- Linux - Linux
required_flags: required_flags:
- avx2 - avx2

View File

@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)
%.o: %.c $(HEADERS) %.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $< $(CC) $(CFLAGS) -c -o $@ $<
%.o: %.s $(HEADERS)
$(AS) -o $@ $<
%.o: %.S $(HEADERS) %.o: %.S $(HEADERS)
$(AS) -c -o $@ $< $(CC) -c -o $@ $<
$(LIB): $(OBJECTS) $(KECCAK4X) $(LIB): $(OBJECTS) $(KECCAK4X)
$(AR) -r $@ $^ $(AR) -r $@ $^

View File

@ -0,0 +1,18 @@
#ifndef PQCLEAN_DILITHIUM4_AVX2_CDECL
#define PQCLEAN_DILITHIUM4_AVX2_CDECL

/*
 * cdecl(s) expands to the assembler-level spelling of the C symbol s.
 *
 * Some platforms decorate every C symbol with a prefix at the object-file
 * level (macOS prepends an underscore), others leave the name untouched.
 * The hand-written assembly both exports functions that C code calls and
 * loads constants that are defined in C (nttconsts.c), so it has to use
 * the decorated spelling on both sides or the linker cannot match them.
 *
 * GCC and Clang publish the active prefix as __USER_LABEL_PREFIX__, so
 * that is used whenever it is available instead of guessing from OS
 * macros. In particular 64-bit Windows targets define __WIN32__ but do
 * not decorate C symbols, so the OS-based guess would be wrong there.
 *
 * The paste goes through a second macro level so that
 * __USER_LABEL_PREFIX__ is expanded before ## is applied.
 */
#if defined(__USER_LABEL_PREFIX__)
#define PQCLEAN_DILITHIUM4_AVX2_CDECL_PASTE(p, s) p##s
#define PQCLEAN_DILITHIUM4_AVX2_CDECL_EXPAND(p, s) PQCLEAN_DILITHIUM4_AVX2_CDECL_PASTE(p, s)
#define cdecl(s) PQCLEAN_DILITHIUM4_AVX2_CDECL_EXPAND(__USER_LABEL_PREFIX__, s)
#elif defined(__WIN32__) || defined(__APPLE__)
/* Fallback for preprocessors that do not provide __USER_LABEL_PREFIX__. */
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif

View File

@ -1,4 +1,5 @@
.include "shuffle.inc" .include "shuffle.inc"
#include "cdecl.inc"
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3
vpaddd %ymm2,%ymm\l0,%ymm12 vpaddd %ymm2,%ymm\l0,%ymm12
@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2
vpsrlq $32,%ymm\h3,%ymm\h3 vpsrlq $32,%ymm\h3,%ymm\h3
.endm .endm
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx: cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm6 vmovdqa (%rsi),%ymm6
@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11 vpsrlq $32,%ymm11,%ymm11
level1: level1:
#PQCLEAN_DILITHIUM3_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
vpmovzxdq 64(%rdx),%ymm15 vpmovzxdq 64(%rdx),%ymm15
vpmovzxdq 80(%rdx),%ymm3 vpmovzxdq 80(%rdx),%ymm3
butterfly 4,5,8,9,6,7,10,11 butterfly 4,5,8,9,6,7,10,11
level2: level2:
#PQCLEAN_DILITHIUM3_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
vpmovzxdq 96(%rdx),%ymm3 vpmovzxdq 96(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3 butterfly 4,5,6,7,8,9,10,11,3,3
@ -130,7 +131,7 @@ shuffle4 8,9,6,9
shuffle4 10,11,8,11 shuffle4 10,11,8,11
level3: level3:
#PQCLEAN_DILITHIUM3_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 112(%rdx),%ymm14
vpbroadcastd 116(%rdx),%ymm15 vpbroadcastd 116(%rdx),%ymm15
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 vpblendd $0xF0,%ymm15,%ymm14,%ymm10
@ -144,7 +145,7 @@ shuffle8 5,7,6,7
shuffle8 9,11,5,11 shuffle8 9,11,5,11
level4: level4:
#PQCLEAN_DILITHIUM3_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
vpbroadcastd 120(%rdx),%ymm9 vpbroadcastd 120(%rdx),%ymm9
butterfly 10,3,6,5,4,8,7,11,9,9 butterfly 10,3,6,5,4,8,7,11,9,9
@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi)
ret ret
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx: cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm4 vmovdqa (%rsi),%ymm4
@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11 vpsrlq $32,%ymm11,%ymm11
level6: level6:
#PQCLEAN_DILITHIUM3_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
vpbroadcastd 16(%rdx),%ymm15 vpbroadcastd 16(%rdx),%ymm15
vpbroadcastd 20(%rdx),%ymm3 vpbroadcastd 20(%rdx),%ymm3
butterfly 4,5,8,9,6,7,10,11 butterfly 4,5,8,9,6,7,10,11
level7: level7:
#PQCLEAN_DILITHIUM3_AVX2_zetas #cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas)
vpbroadcastd 24(%rdx),%ymm3 vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3 butterfly 4,5,6,7,8,9,10,11,3,3
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5 vpmuludq %ymm3,%ymm5,%ymm5
@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7 vpsrlq $32,%ymm7,%ymm7
#store #store
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_mask(%rip),%ymm3 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6 vpermd %ymm6,%ymm3,%ymm6

View File

@ -1,4 +1,5 @@
.include "shuffle.inc" .include "shuffle.inc"
#include "cdecl.inc"
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3
#mul #mul
@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3 vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm .endm
.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx: cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
level0: level0:
#PQCLEAN_DILITHIUM4_AVX2_zetas #zetas
vpbroadcastd (%rdx),%ymm3 vpbroadcastd (%rdx),%ymm3
#load #load
@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi)
ret ret
.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx: cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
#load #load
vmovdqa (%rsi),%ymm4 vmovdqa (%rsi),%ymm4
@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10
vmovdqa 224(%rsi),%ymm11 vmovdqa 224(%rsi),%ymm11
level3: level3:
#PQCLEAN_DILITHIUM4_AVX2_zetas #zetas
vpbroadcastd (%rdx),%ymm3 vpbroadcastd (%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11 butterfly 4,5,6,7,8,9,10,11
@ -128,7 +129,7 @@ shuffle8 7,11,6,11
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 butterfly 3,8,4,9,5,10,6,11,12,12,12,12
level5: level5:
#PQCLEAN_DILITHIUM4_AVX2_zetas #zetas
vpmovzxdq 12(%rdx),%ymm12 vpmovzxdq 12(%rdx),%ymm12
shuffle4 3,5,7,5 shuffle4 3,5,7,5
@ -139,14 +140,14 @@ shuffle4 9,11,4,11
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 butterfly 7,5,3,10,8,6,4,11,12,12,12,12
level6: level6:
#PQCLEAN_DILITHIUM4_AVX2_zetas #zetas
vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 28(%rdx),%ymm12
vpmovzxdq 44(%rdx),%ymm13 vpmovzxdq 44(%rdx),%ymm13
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 butterfly 7,5,8,6,3,10,4,11,12,12,13,13
level7: level7:
#PQCLEAN_DILITHIUM4_AVX2_zetas #zetas
vpmovzxdq 60(%rdx),%ymm12 vpmovzxdq 60(%rdx),%ymm12
vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 76(%rdx),%ymm13
vpmovzxdq 92(%rdx),%ymm14 vpmovzxdq 92(%rdx),%ymm14

View File

@ -1,10 +1,11 @@
#include "params.h" #include "params.h"
#include "cdecl.inc"
.global PQCLEAN_DILITHIUM4_AVX2_pointwise_avx .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
PQCLEAN_DILITHIUM4_AVX2_pointwise_avx: cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax xor %eax,%eax
_looptop1: _looptop1:
@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5 vpaddq %ymm9,%ymm5,%ymm5
.endm .endm
.global PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx: cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax xor %eax,%eax
_looptop2: _looptop2:

View File

@ -1,7 +1,9 @@
.global PQCLEAN_DILITHIUM2_AVX2_reduce_avx #include "cdecl.inc"
PQCLEAN_DILITHIUM2_AVX2_reduce_avx:
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x23ones(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
xor %eax,%eax xor %eax,%eax
_looptop_rdc32: _looptop_rdc32:
@ -46,10 +48,10 @@ jb _looptop_rdc32
ret ret
.global PQCLEAN_DILITHIUM2_AVX2_csubq_avx .global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
PQCLEAN_DILITHIUM2_AVX2_csubq_avx: cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
#consts #consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm0 vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
xor %eax,%eax xor %eax,%eax
_looptop_csubq: _looptop_csubq:
@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5 vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7 vmovdqa 96(%rdi),%ymm7
#PQCLEAN_DILITHIUM2_AVX2_csubq #cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq)
vpsubd %ymm0,%ymm1,%ymm1 vpsubd %ymm0,%ymm1,%ymm1
vpsubd %ymm0,%ymm3,%ymm3 vpsubd %ymm0,%ymm3,%ymm3
vpsubd %ymm0,%ymm5,%ymm5 vpsubd %ymm0,%ymm5,%ymm5