From 44b05220700885a466b421324c1e144824052e1f Mon Sep 17 00:00:00 2001 From: Thom Wiggers Date: Wed, 11 Dec 2019 15:06:02 +0100 Subject: [PATCH] Add MacOS support for Dilithium --- crypto_sign/dilithium2/META.yml | 1 + crypto_sign/dilithium2/avx2/Makefile | 5 +-- crypto_sign/dilithium2/avx2/cdecl.inc | 18 +++++++++ .../dilithium2/avx2/{invntt.s => invntt.S} | 37 ++++++++++--------- crypto_sign/dilithium2/avx2/{ntt.s => ntt.S} | 31 ++++++++-------- crypto_sign/dilithium2/avx2/pointwise.S | 17 +++++---- .../reduce.s => dilithium2/avx2/reduce.S} | 16 ++++---- crypto_sign/dilithium3/META.yml | 1 + crypto_sign/dilithium3/avx2/Makefile | 5 +-- crypto_sign/dilithium3/avx2/cdecl.inc | 18 +++++++++ .../invntt.s => dilithium3/avx2/invntt.S} | 37 ++++++++++--------- crypto_sign/dilithium3/avx2/{ntt.s => ntt.S} | 31 ++++++++-------- crypto_sign/dilithium3/avx2/pointwise.S | 17 +++++---- .../reduce.s => dilithium3/avx2/reduce.S} | 16 ++++---- crypto_sign/dilithium4/META.yml | 1 + crypto_sign/dilithium4/avx2/Makefile | 5 +-- crypto_sign/dilithium4/avx2/cdecl.inc | 18 +++++++++ .../invntt.s => dilithium4/avx2/invntt.S} | 37 ++++++++++--------- crypto_sign/dilithium4/avx2/{ntt.s => ntt.S} | 31 ++++++++-------- crypto_sign/dilithium4/avx2/pointwise.S | 17 +++++---- .../reduce.s => dilithium4/avx2/reduce.S} | 16 ++++---- 21 files changed, 219 insertions(+), 156 deletions(-) create mode 100644 crypto_sign/dilithium2/avx2/cdecl.inc rename crypto_sign/dilithium2/avx2/{invntt.s => invntt.S} (85%) rename crypto_sign/dilithium2/avx2/{ntt.s => ntt.S} (83%) rename crypto_sign/{dilithium3/avx2/reduce.s => dilithium2/avx2/reduce.S} (80%) create mode 100644 crypto_sign/dilithium3/avx2/cdecl.inc rename crypto_sign/{dilithium4/avx2/invntt.s => dilithium3/avx2/invntt.S} (85%) rename crypto_sign/dilithium3/avx2/{ntt.s => ntt.S} (83%) rename crypto_sign/{dilithium4/avx2/reduce.s => dilithium3/avx2/reduce.S} (80%) create mode 100644 crypto_sign/dilithium4/avx2/cdecl.inc rename crypto_sign/{dilithium3/avx2/invntt.s => dilithium4/avx2/invntt.S} (85%) rename crypto_sign/dilithium4/avx2/{ntt.s => ntt.S} (83%) rename crypto_sign/{dilithium2/avx2/reduce.s => dilithium4/avx2/reduce.S} (80%) diff --git a/crypto_sign/dilithium2/META.yml b/crypto_sign/dilithium2/META.yml index 6761d77d..e6af218d 100644 --- a/crypto_sign/dilithium2/META.yml +++ b/crypto_sign/dilithium2/META.yml @@ -23,6 +23,7 @@ implementations: supported_platforms: - architecture: x86_64 operating_systems: + - Darwin - Linux required_flags: - avx2 diff --git a/crypto_sign/dilithium2/avx2/Makefile b/crypto_sign/dilithium2/avx2/Makefile index 8308ea46..7471c4d1 100644 --- a/crypto_sign/dilithium2/avx2/Makefile +++ b/crypto_sign/dilithium2/avx2/Makefile @@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) $(AR) -r $@ $^ diff --git a/crypto_sign/dilithium2/avx2/cdecl.inc b/crypto_sign/dilithium2/avx2/cdecl.inc new file mode 100644 index 00000000..3e290d89 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/cdecl.inc @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL +#define PQCLEAN_DILITHIUM2_AVX2_CDECL + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_sign/dilithium2/avx2/invntt.s b/crypto_sign/dilithium2/avx2/invntt.S similarity index 85% rename from crypto_sign/dilithium2/avx2/invntt.s rename to crypto_sign/dilithium2/avx2/invntt.S index 3a943f62..17cf515a 100644 --- a/crypto_sign/dilithium2/avx2/invntt.s +++ b/crypto_sign/dilithium2/avx2/invntt.S @@ -1,4 +1,5 @@ .include "shuffle.inc" +#include "cdecl.inc" .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 vpaddd %ymm2,%ymm\l0,%ymm12 @@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2 vpsrlq $32,%ymm\h3,%ymm\h3 .endm -.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx -PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx: +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm6 @@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9 vpsrlq $32,%ymm11,%ymm11 level1: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) vpmovzxdq 64(%rdx),%ymm15 vpmovzxdq 80(%rdx),%ymm3 butterfly 4,5,8,9,6,7,10,11 level2: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) vpmovzxdq 96(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 @@ -130,7 +131,7 @@ shuffle4 8,9,6,9 shuffle4 10,11,8,11 level3: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 116(%rdx),%ymm15 vpblendd $0xF0,%ymm15,%ymm14,%ymm10 @@ -144,7 +145,7 @@ shuffle8 5,7,6,7 shuffle8 9,11,5,11 level4: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) vpbroadcastd 120(%rdx),%ymm9 butterfly 10,3,6,5,4,8,7,11,9,9 @@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx -PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx: +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9 vpsrlq $32,%ymm11,%ymm11 level6: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) vpbroadcastd 16(%rdx),%ymm15 vpbroadcastd 20(%rdx),%ymm3 butterfly 4,5,8,9,6,7,10,11 level7: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) vpbroadcastd 24(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3 vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm5,%ymm5 @@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 vpsrlq $32,%ymm7,%ymm7 #store -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_mask(%rip),%ymm3 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3 vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm6,%ymm3,%ymm6 diff --git a/crypto_sign/dilithium2/avx2/ntt.s b/crypto_sign/dilithium2/avx2/ntt.S similarity index 83% rename from crypto_sign/dilithium2/avx2/ntt.s rename to crypto_sign/dilithium2/avx2/ntt.S index ed329dd3..4110ef4a 100644 --- a/crypto_sign/dilithium2/avx2/ntt.s +++ b/crypto_sign/dilithium2/avx2/ntt.S @@ -1,4 +1,5 @@ .include "shuffle.inc" +#include "cdecl.inc" .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 #mul @@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2 vpsubd %ymm15,%ymm\rh3,%ymm\rh3 .endm -.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx -PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx: +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 level0: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#zetas vpbroadcastd (%rdx),%ymm3 #load @@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi) ret -.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx -PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx: +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10 vmovdqa 224(%rsi),%ymm11 level3: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#zetas vpbroadcastd (%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11 @@ -128,7 +129,7 @@ shuffle8 7,11,6,11 butterfly 3,8,4,9,5,10,6,11,12,12,12,12 level5: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#zetas vpmovzxdq 12(%rdx),%ymm12 shuffle4 3,5,7,5 @@ -139,14 +140,14 @@ shuffle4 9,11,4,11 butterfly 7,5,3,10,8,6,4,11,12,12,12,12 level6: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#zetas vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 44(%rdx),%ymm13 butterfly 7,5,8,6,3,10,4,11,12,12,13,13 level7: -#PQCLEAN_DILITHIUM2_AVX2_zetas +#zetas vpmovzxdq 60(%rdx),%ymm12 vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 92(%rdx),%ymm14 diff --git a/crypto_sign/dilithium2/avx2/pointwise.S b/crypto_sign/dilithium2/avx2/pointwise.S index fa2ab766..d0132791 100644 --- a/crypto_sign/dilithium2/avx2/pointwise.S +++ b/crypto_sign/dilithium2/avx2/pointwise.S @@ -1,10 +1,11 @@ #include "params.h" +#include "cdecl.inc" -.global PQCLEAN_DILITHIUM2_AVX2_pointwise_avx -PQCLEAN_DILITHIUM2_AVX2_pointwise_avx: +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop1: @@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4 vpaddq %ymm9,%ymm5,%ymm5 .endm -.global PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx -PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx: +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop2: diff --git a/crypto_sign/dilithium3/avx2/reduce.s b/crypto_sign/dilithium2/avx2/reduce.S similarity index 80% rename from crypto_sign/dilithium3/avx2/reduce.s rename to crypto_sign/dilithium2/avx2/reduce.S index ca14e432..c02d5973 100644 --- a/crypto_sign/dilithium3/avx2/reduce.s +++ b/crypto_sign/dilithium2/avx2/reduce.S @@ -1,7 +1,9 @@ -.global PQCLEAN_DILITHIUM3_AVX2_reduce_avx -PQCLEAN_DILITHIUM3_AVX2_reduce_avx: +#include "cdecl.inc" + +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x23ones(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0 xor %eax,%eax _looptop_rdc32: @@ -46,10 +48,10 @@ jb _looptop_rdc32 ret -.global PQCLEAN_DILITHIUM3_AVX2_csubq_avx -PQCLEAN_DILITHIUM3_AVX2_csubq_avx: +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0 xor %eax,%eax _looptop_csubq: @@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3 vmovdqa 64(%rdi),%ymm5 vmovdqa 96(%rdi),%ymm7 -#PQCLEAN_DILITHIUM3_AVX2_csubq +#cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq) vpsubd %ymm0,%ymm1,%ymm1 vpsubd %ymm0,%ymm3,%ymm3 vpsubd %ymm0,%ymm5,%ymm5 diff --git a/crypto_sign/dilithium3/META.yml b/crypto_sign/dilithium3/META.yml index 4e8548a4..f624d7ef 100644 --- a/crypto_sign/dilithium3/META.yml +++ b/crypto_sign/dilithium3/META.yml @@ -23,6 +23,7 @@ implementations: supported_platforms: - architecture: x86_64 operating_systems: + - Darwin - Linux required_flags: - avx2 diff --git a/crypto_sign/dilithium3/avx2/Makefile b/crypto_sign/dilithium3/avx2/Makefile index 3438ba44..1982bd7d 100644 --- a/crypto_sign/dilithium3/avx2/Makefile +++ b/crypto_sign/dilithium3/avx2/Makefile @@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) $(AR) -r $@ $^ diff --git a/crypto_sign/dilithium3/avx2/cdecl.inc b/crypto_sign/dilithium3/avx2/cdecl.inc new file mode 100644 index 00000000..6c9e5ac1 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/cdecl.inc @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL +#define PQCLEAN_DILITHIUM3_AVX2_CDECL + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_sign/dilithium4/avx2/invntt.s b/crypto_sign/dilithium3/avx2/invntt.S similarity index 85% rename from crypto_sign/dilithium4/avx2/invntt.s rename to crypto_sign/dilithium3/avx2/invntt.S index 8f69a004..6588e5ef 100644 --- a/crypto_sign/dilithium4/avx2/invntt.s +++ b/crypto_sign/dilithium3/avx2/invntt.S @@ -1,4 +1,5 @@ .include "shuffle.inc" +#include "cdecl.inc" .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 vpaddd %ymm2,%ymm\l0,%ymm12 @@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2 vpsrlq $32,%ymm\h3,%ymm\h3 .endm -.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx -PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx: +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm6 @@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9 vpsrlq $32,%ymm11,%ymm11 level1: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) vpmovzxdq 64(%rdx),%ymm15 vpmovzxdq 80(%rdx),%ymm3 butterfly 4,5,8,9,6,7,10,11 level2: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) vpmovzxdq 96(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 @@ -130,7 +131,7 @@ shuffle4 8,9,6,9 shuffle4 10,11,8,11 level3: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 116(%rdx),%ymm15 vpblendd $0xF0,%ymm15,%ymm14,%ymm10 @@ -144,7 +145,7 @@ shuffle8 5,7,6,7 shuffle8 9,11,5,11 level4: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) vpbroadcastd 120(%rdx),%ymm9 butterfly 10,3,6,5,4,8,7,11,9,9 @@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx -PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx: +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9 vpsrlq $32,%ymm11,%ymm11 level6: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) vpbroadcastd 16(%rdx),%ymm15 vpbroadcastd 20(%rdx),%ymm3 butterfly 4,5,8,9,6,7,10,11 level7: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) vpbroadcastd 24(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xdiv(%rip),%ymm3 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3 vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm5,%ymm5 @@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 vpsrlq $32,%ymm7,%ymm7 #store -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_mask(%rip),%ymm3 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3 vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm6,%ymm3,%ymm6 diff --git a/crypto_sign/dilithium3/avx2/ntt.s b/crypto_sign/dilithium3/avx2/ntt.S similarity index 83% rename from crypto_sign/dilithium3/avx2/ntt.s rename to crypto_sign/dilithium3/avx2/ntt.S index 6a17915d..db959478 100644 --- a/crypto_sign/dilithium3/avx2/ntt.s +++ b/crypto_sign/dilithium3/avx2/ntt.S @@ -1,4 +1,5 @@ .include "shuffle.inc" +#include "cdecl.inc" .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 #mul @@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2 vpsubd %ymm15,%ymm\rh3,%ymm\rh3 .endm -.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx -PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx: +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 level0: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#zetas vpbroadcastd (%rdx),%ymm3 #load @@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi) ret -.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx -PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx: +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10 vmovdqa 224(%rsi),%ymm11 level3: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#zetas vpbroadcastd (%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11 @@ -128,7 +129,7 @@ shuffle8 7,11,6,11 butterfly 3,8,4,9,5,10,6,11,12,12,12,12 level5: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#zetas vpmovzxdq 12(%rdx),%ymm12 shuffle4 3,5,7,5 @@ -139,14 +140,14 @@ shuffle4 9,11,4,11 butterfly 7,5,3,10,8,6,4,11,12,12,12,12 level6: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#zetas vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 44(%rdx),%ymm13 butterfly 7,5,8,6,3,10,4,11,12,12,13,13 level7: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#zetas vpmovzxdq 60(%rdx),%ymm12 vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 92(%rdx),%ymm14 diff --git a/crypto_sign/dilithium3/avx2/pointwise.S b/crypto_sign/dilithium3/avx2/pointwise.S index 320a91d8..4aca6373 100644 --- a/crypto_sign/dilithium3/avx2/pointwise.S +++ b/crypto_sign/dilithium3/avx2/pointwise.S @@ -1,10 +1,11 @@ #include "params.h" +#include "cdecl.inc" -.global PQCLEAN_DILITHIUM3_AVX2_pointwise_avx -PQCLEAN_DILITHIUM3_AVX2_pointwise_avx: +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop1: @@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4 vpaddq %ymm9,%ymm5,%ymm5 .endm -.global PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx -PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx: +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop2: diff --git a/crypto_sign/dilithium4/avx2/reduce.s b/crypto_sign/dilithium3/avx2/reduce.S similarity index 80% rename from crypto_sign/dilithium4/avx2/reduce.s rename to crypto_sign/dilithium3/avx2/reduce.S index b4c4a567..1847274f 100644 --- a/crypto_sign/dilithium4/avx2/reduce.s +++ b/crypto_sign/dilithium3/avx2/reduce.S @@ -1,7 +1,9 @@ -.global PQCLEAN_DILITHIUM4_AVX2_reduce_avx -PQCLEAN_DILITHIUM4_AVX2_reduce_avx: +#include "cdecl.inc" + +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x23ones(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0 xor %eax,%eax _looptop_rdc32: @@ -46,10 +48,10 @@ jb _looptop_rdc32 ret -.global PQCLEAN_DILITHIUM4_AVX2_csubq_avx -PQCLEAN_DILITHIUM4_AVX2_csubq_avx: +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0 xor %eax,%eax _looptop_csubq: @@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3 vmovdqa 64(%rdi),%ymm5 vmovdqa 96(%rdi),%ymm7 -#PQCLEAN_DILITHIUM4_AVX2_csubq +#cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq) vpsubd %ymm0,%ymm1,%ymm1 vpsubd %ymm0,%ymm3,%ymm3 vpsubd %ymm0,%ymm5,%ymm5 diff --git a/crypto_sign/dilithium4/META.yml b/crypto_sign/dilithium4/META.yml index 822003b8..c90fba93 100644 --- a/crypto_sign/dilithium4/META.yml +++ b/crypto_sign/dilithium4/META.yml @@ -23,6 +23,7 @@ implementations: supported_platforms: - architecture: x86_64 operating_systems: + - Darwin - Linux required_flags: - avx2 diff --git a/crypto_sign/dilithium4/avx2/Makefile b/crypto_sign/dilithium4/avx2/Makefile index c0023b57..a09c5c81 100644 --- a/crypto_sign/dilithium4/avx2/Makefile +++ b/crypto_sign/dilithium4/avx2/Makefile @@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) $(AR) -r $@ $^ diff --git a/crypto_sign/dilithium4/avx2/cdecl.inc b/crypto_sign/dilithium4/avx2/cdecl.inc new file mode 100644 index 00000000..01dc4734 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/cdecl.inc @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_CDECL +#define PQCLEAN_DILITHIUM4_AVX2_CDECL + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_sign/dilithium3/avx2/invntt.s b/crypto_sign/dilithium4/avx2/invntt.S similarity index 85% rename from crypto_sign/dilithium3/avx2/invntt.s rename to crypto_sign/dilithium4/avx2/invntt.S index a522abe1..2e8a4c02 100644 --- a/crypto_sign/dilithium3/avx2/invntt.s +++ b/crypto_sign/dilithium4/avx2/invntt.S @@ -1,4 +1,5 @@ .include "shuffle.inc" +#include "cdecl.inc" .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 vpaddd %ymm2,%ymm\l0,%ymm12 @@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2 vpsrlq $32,%ymm\h3,%ymm\h3 .endm -.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx -PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx: +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm6 @@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9 vpsrlq $32,%ymm11,%ymm11 level1: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) vpmovzxdq 64(%rdx),%ymm15 vpmovzxdq 80(%rdx),%ymm3 butterfly 4,5,8,9,6,7,10,11 level2: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) vpmovzxdq 96(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 @@ -130,7 +131,7 @@ shuffle4 8,9,6,9 shuffle4 10,11,8,11 level3: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) vpbroadcastd 112(%rdx),%ymm14 vpbroadcastd 116(%rdx),%ymm15 vpblendd $0xF0,%ymm15,%ymm14,%ymm10 @@ -144,7 +145,7 @@ shuffle8 5,7,6,7 shuffle8 9,11,5,11 level4: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) vpbroadcastd 120(%rdx),%ymm9 butterfly 10,3,6,5,4,8,7,11,9,9 @@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx -PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx: +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9 vpsrlq $32,%ymm11,%ymm11 level6: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) vpbroadcastd 16(%rdx),%ymm15 vpbroadcastd 20(%rdx),%ymm3 butterfly 4,5,8,9,6,7,10,11 level7: -#PQCLEAN_DILITHIUM3_AVX2_zetas +#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) vpbroadcastd 24(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 #consts -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3 vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm5,%ymm5 @@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 vpsrlq $32,%ymm7,%ymm7 #store -vmovdqa _PQCLEAN_DILITHIUM3_AVX2_mask(%rip),%ymm3 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3 vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm6,%ymm3,%ymm6 diff --git a/crypto_sign/dilithium4/avx2/ntt.s b/crypto_sign/dilithium4/avx2/ntt.S similarity index 83% rename from crypto_sign/dilithium4/avx2/ntt.s rename to crypto_sign/dilithium4/avx2/ntt.S index 692398c5..f76fc616 100644 --- a/crypto_sign/dilithium4/avx2/ntt.s +++ b/crypto_sign/dilithium4/avx2/ntt.S @@ -1,4 +1,5 @@ .include "shuffle.inc" +#include "cdecl.inc" .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 #mul @@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2 vpsubd %ymm15,%ymm\rh3,%ymm\rh3 .endm -.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx -PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx: +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 level0: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#zetas vpbroadcastd (%rdx),%ymm3 #load @@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi) ret -.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx -PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx: +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10 vmovdqa 224(%rsi),%ymm11 level3: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#zetas vpbroadcastd (%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11 @@ -128,7 +129,7 @@ shuffle8 7,11,6,11 butterfly 3,8,4,9,5,10,6,11,12,12,12,12 level5: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#zetas vpmovzxdq 12(%rdx),%ymm12 shuffle4 3,5,7,5 @@ -139,14 +140,14 @@ shuffle4 9,11,4,11 butterfly 7,5,3,10,8,6,4,11,12,12,12,12 level6: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#zetas vpmovzxdq 28(%rdx),%ymm12 vpmovzxdq 44(%rdx),%ymm13 butterfly 7,5,8,6,3,10,4,11,12,12,13,13 level7: -#PQCLEAN_DILITHIUM4_AVX2_zetas +#zetas vpmovzxdq 60(%rdx),%ymm12 vpmovzxdq 76(%rdx),%ymm13 vpmovzxdq 92(%rdx),%ymm14 diff --git a/crypto_sign/dilithium4/avx2/pointwise.S b/crypto_sign/dilithium4/avx2/pointwise.S index 672820c1..9f61dd58 100644 --- a/crypto_sign/dilithium4/avx2/pointwise.S +++ b/crypto_sign/dilithium4/avx2/pointwise.S @@ -1,10 +1,11 @@ #include "params.h" +#include "cdecl.inc" -.global PQCLEAN_DILITHIUM4_AVX2_pointwise_avx -PQCLEAN_DILITHIUM4_AVX2_pointwise_avx: +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop1: @@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4 vpaddq %ymm9,%ymm5,%ymm5 .endm -.global PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx -PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx: +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 -vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop2: diff --git a/crypto_sign/dilithium2/avx2/reduce.s b/crypto_sign/dilithium4/avx2/reduce.S similarity index 80% rename from crypto_sign/dilithium2/avx2/reduce.s rename to crypto_sign/dilithium4/avx2/reduce.S index 85a9eb1c..c6d226f5 100644 --- a/crypto_sign/dilithium2/avx2/reduce.s +++ b/crypto_sign/dilithium4/avx2/reduce.S @@ -1,7 +1,9 @@ -.global PQCLEAN_DILITHIUM2_AVX2_reduce_avx -PQCLEAN_DILITHIUM2_AVX2_reduce_avx: +#include "cdecl.inc" + +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x23ones(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0 xor %eax,%eax _looptop_rdc32: @@ -46,10 +48,10 @@ jb _looptop_rdc32 ret -.global PQCLEAN_DILITHIUM2_AVX2_csubq_avx -PQCLEAN_DILITHIUM2_AVX2_csubq_avx: +.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx) +cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx): #consts -vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm0 +vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0 xor %eax,%eax _looptop_csubq: @@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3 vmovdqa 64(%rdi),%ymm5 vmovdqa 96(%rdi),%ymm7 -#PQCLEAN_DILITHIUM2_AVX2_csubq +#cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq) vpsubd %ymm0,%ymm1,%ymm1 vpsubd %ymm0,%ymm3,%ymm3 vpsubd %ymm0,%ymm5,%ymm5