@@ -23,6 +23,7 @@ implementations: | |||||
supported_platforms: | supported_platforms: | ||||
- architecture: x86_64 | - architecture: x86_64 | ||||
operating_systems: | operating_systems: | ||||
- Darwin | |||||
- Linux | - Linux | ||||
required_flags: | required_flags: | ||||
- avx2 | - avx2 | ||||
@@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) | |||||
%.o: %.c $(HEADERS) | %.o: %.c $(HEADERS) | ||||
$(CC) $(CFLAGS) -c -o $@ $< | $(CC) $(CFLAGS) -c -o $@ $< | ||||
%.o: %.s $(HEADERS) | |||||
$(AS) -o $@ $< | |||||
%.o: %.S $(HEADERS) | %.o: %.S $(HEADERS) | ||||
$(AS) -c -o $@ $< | |||||
$(CC) -c -o $@ $< | |||||
$(LIB): $(OBJECTS) $(KECCAK4X) | $(LIB): $(OBJECTS) $(KECCAK4X) | ||||
$(AR) -r $@ $^ | $(AR) -r $@ $^ | ||||
@@ -0,0 +1,18 @@ | |||||
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL
#define PQCLEAN_DILITHIUM2_AVX2_CDECL

/*
 * cdecl(s): spell the symbol `s` the way the C compiler spells it at the
 * object-file level, so that assembly and C can name each other's symbols.
 *
 * On some targets (e.g. macOS) the C ABI exports every symbol with a
 * leading underscore. Without the same prefix, functions defined in
 * assembly cannot be found from C, and C objects referenced from assembly
 * (the constants in nttconsts.c) cannot be found either.
 *
 * GCC and Clang publish the target's prefix as __USER_LABEL_PREFIX__, so
 * prefer it: the decision then follows the actual target instead of a
 * hard-coded OS list (64-bit Windows toolchains, for instance, use no
 * prefix). Compilers that do not define it fall back to the OS check.
 *
 * Pass a plain symbol name; on the __USER_LABEL_PREFIX__ path the argument
 * is macro-expanded before it is pasted.
 */
#if defined(__USER_LABEL_PREFIX__)
/* Two-level paste so __USER_LABEL_PREFIX__ is expanded before `##`. */
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_PASTE_(p, s) p##s
#define PQCLEAN_DILITHIUM2_AVX2_CDECL_PASTE(p, s) PQCLEAN_DILITHIUM2_AVX2_CDECL_PASTE_(p, s)
#define cdecl(s) PQCLEAN_DILITHIUM2_AVX2_CDECL_PASTE(__USER_LABEL_PREFIX__, s)
#elif defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif
@@ -1,4 +1,5 @@ | |||||
.include "shuffle.inc" | .include "shuffle.inc" | ||||
#include "cdecl.inc" | |||||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | ||||
vpaddd %ymm2,%ymm\l0,%ymm12 | vpaddd %ymm2,%ymm\l0,%ymm12 | ||||
@@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2 | |||||
vpsrlq $32,%ymm\h3,%ymm\h3 | vpsrlq $32,%ymm\h3,%ymm\h3 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm6 | vmovdqa (%rsi),%ymm6 | ||||
@@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9 | |||||
vpsrlq $32,%ymm11,%ymm11 | vpsrlq $32,%ymm11,%ymm11 | ||||
level1: | level1: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas
vpmovzxdq 64(%rdx),%ymm15 | vpmovzxdq 64(%rdx),%ymm15 | ||||
vpmovzxdq 80(%rdx),%ymm3 | vpmovzxdq 80(%rdx),%ymm3 | ||||
butterfly 4,5,8,9,6,7,10,11 | butterfly 4,5,8,9,6,7,10,11 | ||||
level2: | level2: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas
vpmovzxdq 96(%rdx),%ymm3 | vpmovzxdq 96(%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11,3,3 | butterfly 4,5,6,7,8,9,10,11,3,3 | ||||
@@ -130,7 +131,7 @@ shuffle4 8,9,6,9 | |||||
shuffle4 10,11,8,11 | shuffle4 10,11,8,11 | ||||
level3: | level3: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas
vpbroadcastd 112(%rdx),%ymm14 | vpbroadcastd 112(%rdx),%ymm14 | ||||
vpbroadcastd 116(%rdx),%ymm15 | vpbroadcastd 116(%rdx),%ymm15 | ||||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | ||||
@@ -144,7 +145,7 @@ shuffle8 5,7,6,7 | |||||
shuffle8 9,11,5,11 | shuffle8 9,11,5,11 | ||||
level4: | level4: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas
vpbroadcastd 120(%rdx),%ymm9 | vpbroadcastd 120(%rdx),%ymm9 | ||||
butterfly 10,3,6,5,4,8,7,11,9,9 | butterfly 10,3,6,5,4,8,7,11,9,9 | ||||
@@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi) | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm4 | vmovdqa (%rsi),%ymm4 | ||||
@@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9 | |||||
vpsrlq $32,%ymm11,%ymm11 | vpsrlq $32,%ymm11,%ymm11 | ||||
level6: | level6: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas
vpbroadcastd 16(%rdx),%ymm15 | vpbroadcastd 16(%rdx),%ymm15 | ||||
vpbroadcastd 20(%rdx),%ymm3 | vpbroadcastd 20(%rdx),%ymm3 | ||||
butterfly 4,5,8,9,6,7,10,11 | butterfly 4,5,8,9,6,7,10,11 | ||||
level7: | level7: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas
vpbroadcastd 24(%rdx),%ymm3 | vpbroadcastd 24(%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11,3,3 | butterfly 4,5,6,7,8,9,10,11,3,3 | ||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3 | |||||
vpmuludq %ymm3,%ymm4,%ymm4 | vpmuludq %ymm3,%ymm4,%ymm4 | ||||
vpmuludq %ymm3,%ymm5,%ymm5 | vpmuludq %ymm3,%ymm5,%ymm5 | ||||
@@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 | |||||
vpsrlq $32,%ymm7,%ymm7 | vpsrlq $32,%ymm7,%ymm7 | ||||
#store | #store | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_mask(%rip),%ymm3 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3 | |||||
vpermd %ymm4,%ymm3,%ymm4 | vpermd %ymm4,%ymm3,%ymm4 | ||||
vpermd %ymm5,%ymm3,%ymm5 | vpermd %ymm5,%ymm3,%ymm5 | ||||
vpermd %ymm6,%ymm3,%ymm6 | vpermd %ymm6,%ymm3,%ymm6 |
@@ -1,4 +1,5 @@ | |||||
.include "shuffle.inc" | .include "shuffle.inc" | ||||
#include "cdecl.inc" | |||||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | ||||
#mul | #mul | ||||
@@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2 | |||||
vpsubd %ymm15,%ymm\rh3,%ymm\rh3 | vpsubd %ymm15,%ymm\rh3,%ymm\rh3 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 | |||||
level0: | level0: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas | |||||
vpbroadcastd (%rdx),%ymm3 | vpbroadcastd (%rdx),%ymm3 | ||||
#load | #load | ||||
@@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi) | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm4 | vmovdqa (%rsi),%ymm4 | ||||
@@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10 | |||||
vmovdqa 224(%rsi),%ymm11 | vmovdqa 224(%rsi),%ymm11 | ||||
level3: | level3: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas | |||||
vpbroadcastd (%rdx),%ymm3 | vpbroadcastd (%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11 | butterfly 4,5,6,7,8,9,10,11 | ||||
@@ -128,7 +129,7 @@ shuffle8 7,11,6,11 | |||||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | ||||
level5: | level5: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 12(%rdx),%ymm12 | vpmovzxdq 12(%rdx),%ymm12 | ||||
shuffle4 3,5,7,5 | shuffle4 3,5,7,5 | ||||
@@ -139,14 +140,14 @@ shuffle4 9,11,4,11 | |||||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | ||||
level6: | level6: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 28(%rdx),%ymm12 | vpmovzxdq 28(%rdx),%ymm12 | ||||
vpmovzxdq 44(%rdx),%ymm13 | vpmovzxdq 44(%rdx),%ymm13 | ||||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | ||||
level7: | level7: | ||||
#PQCLEAN_DILITHIUM2_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 60(%rdx),%ymm12 | vpmovzxdq 60(%rdx),%ymm12 | ||||
vpmovzxdq 76(%rdx),%ymm13 | vpmovzxdq 76(%rdx),%ymm13 | ||||
vpmovzxdq 92(%rdx),%ymm14 | vpmovzxdq 92(%rdx),%ymm14 |
@@ -1,10 +1,11 @@ | |||||
#include "params.h" | #include "params.h" | ||||
#include "cdecl.inc" | |||||
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_pointwise_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop1: | _looptop1: | ||||
@@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4 | |||||
vpaddq %ymm9,%ymm5,%ymm5 | vpaddq %ymm9,%ymm5,%ymm5 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop2: | _looptop2: | ||||
@@ -1,7 +1,9 @@ | |||||
.global PQCLEAN_DILITHIUM3_AVX2_reduce_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_reduce_avx: | |||||
#include "cdecl.inc" | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x23ones(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop_rdc32: | _looptop_rdc32: | ||||
@@ -46,10 +48,10 @@ jb _looptop_rdc32 | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM3_AVX2_csubq_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_csubq_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx) | |||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop_csubq: | _looptop_csubq: | ||||
@@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3 | |||||
vmovdqa 64(%rdi),%ymm5 | vmovdqa 64(%rdi),%ymm5 | ||||
vmovdqa 96(%rdi),%ymm7 | vmovdqa 96(%rdi),%ymm7 | ||||
#PQCLEAN_DILITHIUM3_AVX2_csubq | |||||
#csubq
vpsubd %ymm0,%ymm1,%ymm1 | vpsubd %ymm0,%ymm1,%ymm1 | ||||
vpsubd %ymm0,%ymm3,%ymm3 | vpsubd %ymm0,%ymm3,%ymm3 | ||||
vpsubd %ymm0,%ymm5,%ymm5 | vpsubd %ymm0,%ymm5,%ymm5 |
@@ -23,6 +23,7 @@ implementations: | |||||
supported_platforms: | supported_platforms: | ||||
- architecture: x86_64 | - architecture: x86_64 | ||||
operating_systems: | operating_systems: | ||||
- Darwin | |||||
- Linux | - Linux | ||||
required_flags: | required_flags: | ||||
- avx2 | - avx2 | ||||
@@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) | |||||
%.o: %.c $(HEADERS) | %.o: %.c $(HEADERS) | ||||
$(CC) $(CFLAGS) -c -o $@ $< | $(CC) $(CFLAGS) -c -o $@ $< | ||||
%.o: %.s $(HEADERS) | |||||
$(AS) -o $@ $< | |||||
%.o: %.S $(HEADERS) | %.o: %.S $(HEADERS) | ||||
$(AS) -c -o $@ $< | |||||
$(CC) -c -o $@ $< | |||||
$(LIB): $(OBJECTS) $(KECCAK4X) | $(LIB): $(OBJECTS) $(KECCAK4X) | ||||
$(AR) -r $@ $^ | $(AR) -r $@ $^ | ||||
@@ -0,0 +1,18 @@ | |||||
#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL
#define PQCLEAN_DILITHIUM3_AVX2_CDECL

/*
 * cdecl(s): spell the symbol `s` the way the C compiler spells it at the
 * object-file level, so that assembly and C can name each other's symbols.
 *
 * On some targets (e.g. macOS) the C ABI exports every symbol with a
 * leading underscore. Without the same prefix, functions defined in
 * assembly cannot be found from C, and C objects referenced from assembly
 * (the constants in nttconsts.c) cannot be found either.
 *
 * GCC and Clang publish the target's prefix as __USER_LABEL_PREFIX__, so
 * prefer it: the decision then follows the actual target instead of a
 * hard-coded OS list (64-bit Windows toolchains, for instance, use no
 * prefix). Compilers that do not define it fall back to the OS check.
 *
 * Pass a plain symbol name; on the __USER_LABEL_PREFIX__ path the argument
 * is macro-expanded before it is pasted.
 */
#if defined(__USER_LABEL_PREFIX__)
/* Two-level paste so __USER_LABEL_PREFIX__ is expanded before `##`. */
#define PQCLEAN_DILITHIUM3_AVX2_CDECL_PASTE_(p, s) p##s
#define PQCLEAN_DILITHIUM3_AVX2_CDECL_PASTE(p, s) PQCLEAN_DILITHIUM3_AVX2_CDECL_PASTE_(p, s)
#define cdecl(s) PQCLEAN_DILITHIUM3_AVX2_CDECL_PASTE(__USER_LABEL_PREFIX__, s)
#elif defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif
@@ -1,4 +1,5 @@ | |||||
.include "shuffle.inc" | .include "shuffle.inc" | ||||
#include "cdecl.inc" | |||||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | ||||
vpaddd %ymm2,%ymm\l0,%ymm12 | vpaddd %ymm2,%ymm\l0,%ymm12 | ||||
@@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2 | |||||
vpsrlq $32,%ymm\h3,%ymm\h3 | vpsrlq $32,%ymm\h3,%ymm\h3 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm6 | vmovdqa (%rsi),%ymm6 | ||||
@@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9 | |||||
vpsrlq $32,%ymm11,%ymm11 | vpsrlq $32,%ymm11,%ymm11 | ||||
level1: | level1: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas
vpmovzxdq 64(%rdx),%ymm15 | vpmovzxdq 64(%rdx),%ymm15 | ||||
vpmovzxdq 80(%rdx),%ymm3 | vpmovzxdq 80(%rdx),%ymm3 | ||||
butterfly 4,5,8,9,6,7,10,11 | butterfly 4,5,8,9,6,7,10,11 | ||||
level2: | level2: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas
vpmovzxdq 96(%rdx),%ymm3 | vpmovzxdq 96(%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11,3,3 | butterfly 4,5,6,7,8,9,10,11,3,3 | ||||
@@ -130,7 +131,7 @@ shuffle4 8,9,6,9 | |||||
shuffle4 10,11,8,11 | shuffle4 10,11,8,11 | ||||
level3: | level3: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas
vpbroadcastd 112(%rdx),%ymm14 | vpbroadcastd 112(%rdx),%ymm14 | ||||
vpbroadcastd 116(%rdx),%ymm15 | vpbroadcastd 116(%rdx),%ymm15 | ||||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | ||||
@@ -144,7 +145,7 @@ shuffle8 5,7,6,7 | |||||
shuffle8 9,11,5,11 | shuffle8 9,11,5,11 | ||||
level4: | level4: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas
vpbroadcastd 120(%rdx),%ymm9 | vpbroadcastd 120(%rdx),%ymm9 | ||||
butterfly 10,3,6,5,4,8,7,11,9,9 | butterfly 10,3,6,5,4,8,7,11,9,9 | ||||
@@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi) | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm4 | vmovdqa (%rsi),%ymm4 | ||||
@@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9 | |||||
vpsrlq $32,%ymm11,%ymm11 | vpsrlq $32,%ymm11,%ymm11 | ||||
level6: | level6: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas
vpbroadcastd 16(%rdx),%ymm15 | vpbroadcastd 16(%rdx),%ymm15 | ||||
vpbroadcastd 20(%rdx),%ymm3 | vpbroadcastd 20(%rdx),%ymm3 | ||||
butterfly 4,5,8,9,6,7,10,11 | butterfly 4,5,8,9,6,7,10,11 | ||||
level7: | level7: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas
vpbroadcastd 24(%rdx),%ymm3 | vpbroadcastd 24(%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11,3,3 | butterfly 4,5,6,7,8,9,10,11,3,3 | ||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xdiv(%rip),%ymm3 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3 | |||||
vpmuludq %ymm3,%ymm4,%ymm4 | vpmuludq %ymm3,%ymm4,%ymm4 | ||||
vpmuludq %ymm3,%ymm5,%ymm5 | vpmuludq %ymm3,%ymm5,%ymm5 | ||||
@@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 | |||||
vpsrlq $32,%ymm7,%ymm7 | vpsrlq $32,%ymm7,%ymm7 | ||||
#store | #store | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_mask(%rip),%ymm3 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3 | |||||
vpermd %ymm4,%ymm3,%ymm4 | vpermd %ymm4,%ymm3,%ymm4 | ||||
vpermd %ymm5,%ymm3,%ymm5 | vpermd %ymm5,%ymm3,%ymm5 | ||||
vpermd %ymm6,%ymm3,%ymm6 | vpermd %ymm6,%ymm3,%ymm6 |
@@ -1,4 +1,5 @@ | |||||
.include "shuffle.inc" | .include "shuffle.inc" | ||||
#include "cdecl.inc" | |||||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | ||||
#mul | #mul | ||||
@@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2 | |||||
vpsubd %ymm15,%ymm\rh3,%ymm\rh3 | vpsubd %ymm15,%ymm\rh3,%ymm\rh3 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 | |||||
level0: | level0: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas | |||||
vpbroadcastd (%rdx),%ymm3 | vpbroadcastd (%rdx),%ymm3 | ||||
#load | #load | ||||
@@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi) | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm4 | vmovdqa (%rsi),%ymm4 | ||||
@@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10 | |||||
vmovdqa 224(%rsi),%ymm11 | vmovdqa 224(%rsi),%ymm11 | ||||
level3: | level3: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas | |||||
vpbroadcastd (%rdx),%ymm3 | vpbroadcastd (%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11 | butterfly 4,5,6,7,8,9,10,11 | ||||
@@ -128,7 +129,7 @@ shuffle8 7,11,6,11 | |||||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | ||||
level5: | level5: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 12(%rdx),%ymm12 | vpmovzxdq 12(%rdx),%ymm12 | ||||
shuffle4 3,5,7,5 | shuffle4 3,5,7,5 | ||||
@@ -139,14 +140,14 @@ shuffle4 9,11,4,11 | |||||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | ||||
level6: | level6: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 28(%rdx),%ymm12 | vpmovzxdq 28(%rdx),%ymm12 | ||||
vpmovzxdq 44(%rdx),%ymm13 | vpmovzxdq 44(%rdx),%ymm13 | ||||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | ||||
level7: | level7: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 60(%rdx),%ymm12 | vpmovzxdq 60(%rdx),%ymm12 | ||||
vpmovzxdq 76(%rdx),%ymm13 | vpmovzxdq 76(%rdx),%ymm13 | ||||
vpmovzxdq 92(%rdx),%ymm14 | vpmovzxdq 92(%rdx),%ymm14 |
@@ -1,10 +1,11 @@ | |||||
#include "params.h" | #include "params.h" | ||||
#include "cdecl.inc" | |||||
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_pointwise_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop1: | _looptop1: | ||||
@@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4 | |||||
vpaddq %ymm9,%ymm5,%ymm5 | vpaddq %ymm9,%ymm5,%ymm5 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop2: | _looptop2: | ||||
@@ -1,7 +1,9 @@ | |||||
.global PQCLEAN_DILITHIUM4_AVX2_reduce_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_reduce_avx: | |||||
#include "cdecl.inc" | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x23ones(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop_rdc32: | _looptop_rdc32: | ||||
@@ -46,10 +48,10 @@ jb _looptop_rdc32 | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM4_AVX2_csubq_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_csubq_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx) | |||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop_csubq: | _looptop_csubq: | ||||
@@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3 | |||||
vmovdqa 64(%rdi),%ymm5 | vmovdqa 64(%rdi),%ymm5 | ||||
vmovdqa 96(%rdi),%ymm7 | vmovdqa 96(%rdi),%ymm7 | ||||
#PQCLEAN_DILITHIUM4_AVX2_csubq | |||||
#csubq
vpsubd %ymm0,%ymm1,%ymm1 | vpsubd %ymm0,%ymm1,%ymm1 | ||||
vpsubd %ymm0,%ymm3,%ymm3 | vpsubd %ymm0,%ymm3,%ymm3 | ||||
vpsubd %ymm0,%ymm5,%ymm5 | vpsubd %ymm0,%ymm5,%ymm5 |
@@ -23,6 +23,7 @@ implementations: | |||||
supported_platforms: | supported_platforms: | ||||
- architecture: x86_64 | - architecture: x86_64 | ||||
operating_systems: | operating_systems: | ||||
- Darwin | |||||
- Linux | - Linux | ||||
required_flags: | required_flags: | ||||
- avx2 | - avx2 | ||||
@@ -24,11 +24,8 @@ KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) | |||||
%.o: %.c $(HEADERS) | %.o: %.c $(HEADERS) | ||||
$(CC) $(CFLAGS) -c -o $@ $< | $(CC) $(CFLAGS) -c -o $@ $< | ||||
%.o: %.s $(HEADERS) | |||||
$(AS) -o $@ $< | |||||
%.o: %.S $(HEADERS) | %.o: %.S $(HEADERS) | ||||
$(AS) -c -o $@ $< | |||||
$(CC) -c -o $@ $< | |||||
$(LIB): $(OBJECTS) $(KECCAK4X) | $(LIB): $(OBJECTS) $(KECCAK4X) | ||||
$(AR) -r $@ $^ | $(AR) -r $@ $^ | ||||
@@ -0,0 +1,18 @@ | |||||
#ifndef PQCLEAN_DILITHIUM4_AVX2_CDECL
#define PQCLEAN_DILITHIUM4_AVX2_CDECL

/*
 * cdecl(s): spell the symbol `s` the way the C compiler spells it at the
 * object-file level, so that assembly and C can name each other's symbols.
 *
 * On some targets (e.g. macOS) the C ABI exports every symbol with a
 * leading underscore. Without the same prefix, functions defined in
 * assembly cannot be found from C, and C objects referenced from assembly
 * (the constants in nttconsts.c) cannot be found either.
 *
 * GCC and Clang publish the target's prefix as __USER_LABEL_PREFIX__, so
 * prefer it: the decision then follows the actual target instead of a
 * hard-coded OS list (64-bit Windows toolchains, for instance, use no
 * prefix). Compilers that do not define it fall back to the OS check.
 *
 * Pass a plain symbol name; on the __USER_LABEL_PREFIX__ path the argument
 * is macro-expanded before it is pasted.
 */
#if defined(__USER_LABEL_PREFIX__)
/* Two-level paste so __USER_LABEL_PREFIX__ is expanded before `##`. */
#define PQCLEAN_DILITHIUM4_AVX2_CDECL_PASTE_(p, s) p##s
#define PQCLEAN_DILITHIUM4_AVX2_CDECL_PASTE(p, s) PQCLEAN_DILITHIUM4_AVX2_CDECL_PASTE_(p, s)
#define cdecl(s) PQCLEAN_DILITHIUM4_AVX2_CDECL_PASTE(__USER_LABEL_PREFIX__, s)
#elif defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif
@@ -1,4 +1,5 @@ | |||||
.include "shuffle.inc" | .include "shuffle.inc" | ||||
#include "cdecl.inc" | |||||
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | .macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 | ||||
vpaddd %ymm2,%ymm\l0,%ymm12 | vpaddd %ymm2,%ymm\l0,%ymm12 | ||||
@@ -41,12 +42,12 @@ vpsrlq $32,%ymm\h2,%ymm\h2 | |||||
vpsrlq $32,%ymm\h3,%ymm\h3 | vpsrlq $32,%ymm\h3,%ymm\h3 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm6 | vmovdqa (%rsi),%ymm6 | ||||
@@ -111,14 +112,14 @@ vpsrlq $32,%ymm9,%ymm9 | |||||
vpsrlq $32,%ymm11,%ymm11 | vpsrlq $32,%ymm11,%ymm11 | ||||
level1: | level1: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas
vpmovzxdq 64(%rdx),%ymm15 | vpmovzxdq 64(%rdx),%ymm15 | ||||
vpmovzxdq 80(%rdx),%ymm3 | vpmovzxdq 80(%rdx),%ymm3 | ||||
butterfly 4,5,8,9,6,7,10,11 | butterfly 4,5,8,9,6,7,10,11 | ||||
level2: | level2: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas
vpmovzxdq 96(%rdx),%ymm3 | vpmovzxdq 96(%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11,3,3 | butterfly 4,5,6,7,8,9,10,11,3,3 | ||||
@@ -130,7 +131,7 @@ shuffle4 8,9,6,9 | |||||
shuffle4 10,11,8,11 | shuffle4 10,11,8,11 | ||||
level3: | level3: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas
vpbroadcastd 112(%rdx),%ymm14 | vpbroadcastd 112(%rdx),%ymm14 | ||||
vpbroadcastd 116(%rdx),%ymm15 | vpbroadcastd 116(%rdx),%ymm15 | ||||
vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | vpblendd $0xF0,%ymm15,%ymm14,%ymm10 | ||||
@@ -144,7 +145,7 @@ shuffle8 5,7,6,7 | |||||
shuffle8 9,11,5,11 | shuffle8 9,11,5,11 | ||||
level4: | level4: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas
vpbroadcastd 120(%rdx),%ymm9 | vpbroadcastd 120(%rdx),%ymm9 | ||||
butterfly 10,3,6,5,4,8,7,11,9,9 | butterfly 10,3,6,5,4,8,7,11,9,9 | ||||
@@ -161,12 +162,12 @@ vmovdqa %ymm11,224(%rdi) | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx | |||||
PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm4 | vmovdqa (%rsi),%ymm4 | ||||
@@ -223,20 +224,20 @@ vpsrlq $32,%ymm9,%ymm9 | |||||
vpsrlq $32,%ymm11,%ymm11 | vpsrlq $32,%ymm11,%ymm11 | ||||
level6: | level6: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas
vpbroadcastd 16(%rdx),%ymm15 | vpbroadcastd 16(%rdx),%ymm15 | ||||
vpbroadcastd 20(%rdx),%ymm3 | vpbroadcastd 20(%rdx),%ymm3 | ||||
butterfly 4,5,8,9,6,7,10,11 | butterfly 4,5,8,9,6,7,10,11 | ||||
level7: | level7: | ||||
#PQCLEAN_DILITHIUM3_AVX2_zetas | |||||
#zetas
vpbroadcastd 24(%rdx),%ymm3 | vpbroadcastd 24(%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11,3,3 | butterfly 4,5,6,7,8,9,10,11,3,3 | ||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3 | |||||
vpmuludq %ymm3,%ymm4,%ymm4 | vpmuludq %ymm3,%ymm4,%ymm4 | ||||
vpmuludq %ymm3,%ymm5,%ymm5 | vpmuludq %ymm3,%ymm5,%ymm5 | ||||
@@ -260,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 | |||||
vpsrlq $32,%ymm7,%ymm7 | vpsrlq $32,%ymm7,%ymm7 | ||||
#store | #store | ||||
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_mask(%rip),%ymm3 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3 | |||||
vpermd %ymm4,%ymm3,%ymm4 | vpermd %ymm4,%ymm3,%ymm4 | ||||
vpermd %ymm5,%ymm3,%ymm5 | vpermd %ymm5,%ymm3,%ymm5 | ||||
vpermd %ymm6,%ymm3,%ymm6 | vpermd %ymm6,%ymm3,%ymm6 |
@@ -1,4 +1,5 @@ | |||||
.include "shuffle.inc" | .include "shuffle.inc" | ||||
#include "cdecl.inc" | |||||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 | ||||
#mul | #mul | ||||
@@ -40,15 +41,15 @@ vpsubd %ymm14,%ymm\rh2,%ymm\rh2 | |||||
vpsubd %ymm15,%ymm\rh3,%ymm\rh3 | vpsubd %ymm15,%ymm\rh3,%ymm\rh3 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 | |||||
level0: | level0: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas | |||||
vpbroadcastd (%rdx),%ymm3 | vpbroadcastd (%rdx),%ymm3 | ||||
#load | #load | ||||
@@ -91,12 +92,12 @@ vmovdqa %ymm11,1792(%rdi) | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 | |||||
#load | #load | ||||
vmovdqa (%rsi),%ymm4 | vmovdqa (%rsi),%ymm4 | ||||
@@ -109,7 +110,7 @@ vmovdqa 192(%rsi),%ymm10 | |||||
vmovdqa 224(%rsi),%ymm11 | vmovdqa 224(%rsi),%ymm11 | ||||
level3: | level3: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas | |||||
vpbroadcastd (%rdx),%ymm3 | vpbroadcastd (%rdx),%ymm3 | ||||
butterfly 4,5,6,7,8,9,10,11 | butterfly 4,5,6,7,8,9,10,11 | ||||
@@ -128,7 +129,7 @@ shuffle8 7,11,6,11 | |||||
butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | butterfly 3,8,4,9,5,10,6,11,12,12,12,12 | ||||
level5: | level5: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 12(%rdx),%ymm12 | vpmovzxdq 12(%rdx),%ymm12 | ||||
shuffle4 3,5,7,5 | shuffle4 3,5,7,5 | ||||
@@ -139,14 +140,14 @@ shuffle4 9,11,4,11 | |||||
butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | butterfly 7,5,3,10,8,6,4,11,12,12,12,12 | ||||
level6: | level6: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 28(%rdx),%ymm12 | vpmovzxdq 28(%rdx),%ymm12 | ||||
vpmovzxdq 44(%rdx),%ymm13 | vpmovzxdq 44(%rdx),%ymm13 | ||||
butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | butterfly 7,5,8,6,3,10,4,11,12,12,13,13 | ||||
level7: | level7: | ||||
#PQCLEAN_DILITHIUM4_AVX2_zetas | |||||
#zetas | |||||
vpmovzxdq 60(%rdx),%ymm12 | vpmovzxdq 60(%rdx),%ymm12 | ||||
vpmovzxdq 76(%rdx),%ymm13 | vpmovzxdq 76(%rdx),%ymm13 | ||||
vpmovzxdq 92(%rdx),%ymm14 | vpmovzxdq 92(%rdx),%ymm14 |
@@ -1,10 +1,11 @@ | |||||
#include "params.h" | #include "params.h" | ||||
#include "cdecl.inc" | |||||
.global PQCLEAN_DILITHIUM4_AVX2_pointwise_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_pointwise_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop1: | _looptop1: | ||||
@@ -132,11 +133,11 @@ vpaddq %ymm8,%ymm4,%ymm4 | |||||
vpaddq %ymm9,%ymm5,%ymm5 | vpaddq %ymm9,%ymm5,%ymm5 | ||||
.endm | .endm | ||||
.global PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx | |||||
PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 | |||||
vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop2: | _looptop2: | ||||
@@ -1,7 +1,9 @@ | |||||
.global PQCLEAN_DILITHIUM2_AVX2_reduce_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_reduce_avx: | |||||
#include "cdecl.inc" | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x23ones(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop_rdc32: | _looptop_rdc32: | ||||
@@ -46,10 +48,10 @@ jb _looptop_rdc32 | |||||
ret | ret | ||||
.global PQCLEAN_DILITHIUM2_AVX2_csubq_avx | |||||
PQCLEAN_DILITHIUM2_AVX2_csubq_avx: | |||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx) | |||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx): | |||||
#consts | #consts | ||||
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm0 | |||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0 | |||||
xor %eax,%eax | xor %eax,%eax | ||||
_looptop_csubq: | _looptop_csubq: | ||||
@@ -59,7 +61,7 @@ vmovdqa 32(%rdi),%ymm3 | |||||
vmovdqa 64(%rdi),%ymm5 | vmovdqa 64(%rdi),%ymm5 | ||||
vmovdqa 96(%rdi),%ymm7 | vmovdqa 96(%rdi),%ymm7 | ||||
#PQCLEAN_DILITHIUM2_AVX2_csubq | |||||
#csubq | |||||
vpsubd %ymm0,%ymm1,%ymm1 | vpsubd %ymm0,%ymm1,%ymm1 | ||||
vpsubd %ymm0,%ymm3,%ymm3 | vpsubd %ymm0,%ymm3,%ymm3 | ||||
vpsubd %ymm0,%ymm5,%ymm5 | vpsubd %ymm0,%ymm5,%ymm5 |