diff --git a/crypto_sign/dilithium2/avx2/invntt.S b/crypto_sign/dilithium2/avx2/invntt.S index 17cf515a..5a5ee1ca 100644 --- a/crypto_sign/dilithium2/avx2/invntt.S +++ b/crypto_sign/dilithium2/avx2/invntt.S @@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm6 @@ -165,9 +165,9 @@ ret .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3 vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm5,%ymm5 @@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 vpsrlq $32,%ymm7,%ymm7 #store -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3 vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm6,%ymm3,%ymm6 diff --git a/crypto_sign/dilithium2/avx2/ntt.S b/crypto_sign/dilithium2/avx2/ntt.S index 4110ef4a..1e1f7754 100644 --- a/crypto_sign/dilithium2/avx2/ntt.S +++ b/crypto_sign/dilithium2/avx2/ntt.S @@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 level0: #zetas @@ -95,9 +95,9 @@ ret .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 diff --git a/crypto_sign/dilithium2/avx2/nttconsts.c b/crypto_sign/dilithium2/avx2/nttconsts.c index 7fff311d..a351ab32 100644 --- a/crypto_sign/dilithium2/avx2/nttconsts.c +++ b/crypto_sign/dilithium2/avx2/nttconsts.c @@ -5,19 +5,19 @@ #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) -const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, - 256 * Q - } - }; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, - 0x7FFFFF, 0x7FFFFF - } - }; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, + 256 * Q + } + }; +const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, + 0x7FFFFF, 0x7FFFFF + } + }; +const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; #undef QINV #undef MONT diff --git a/crypto_sign/dilithium2/avx2/nttconsts.h b/crypto_sign/dilithium2/avx2/nttconsts.h index c2dd6b51..107bdc87 100644 --- a/crypto_sign/dilithium2/avx2/nttconsts.h +++ b/crypto_sign/dilithium2/avx2/nttconsts.h @@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t; typedef ALIGNED_UINT32(N) aligned_uint32xN_t; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv; extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas; extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv; diff --git a/crypto_sign/dilithium2/avx2/pointwise.S b/crypto_sign/dilithium2/avx2/pointwise.S index d0132791..1f638b1e 100644 --- a/crypto_sign/dilithium2/avx2/pointwise.S +++ b/crypto_sign/dilithium2/avx2/pointwise.S @@ -4,8 +4,8 @@ .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop1: @@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop2: diff --git a/crypto_sign/dilithium2/avx2/poly.c b/crypto_sign/dilithium2/avx2/poly.c index 24a6ea06..a3f4dfc9 100644 --- a/crypto_sign/dilithium2/avx2/poly.c +++ b/crypto_sign/dilithium2/avx2/poly.c @@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) { **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { __m256i vec0, vec1; - const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec); + const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec); for (size_t i = 0; i < N / 8; i++) { vec0 = _mm256_load_si256(&a->coeffs_x8[i]); diff --git a/crypto_sign/dilithium2/avx2/reduce.S b/crypto_sign/dilithium2/avx2/reduce.S index c02d5973..941828d5 100644 --- a/crypto_sign/dilithium2/avx2/reduce.S +++ b/crypto_sign/dilithium2/avx2/reduce.S @@ -3,7 +3,7 @@ .global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0 xor %eax,%eax _looptop_rdc32: @@ -51,7 +51,7 @@ ret .global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0 xor %eax,%eax _looptop_csubq: diff --git a/crypto_sign/dilithium3/avx2/invntt.S b/crypto_sign/dilithium3/avx2/invntt.S index 6588e5ef..0dbfacf6 100644 --- a/crypto_sign/dilithium3/avx2/invntt.S +++ b/crypto_sign/dilithium3/avx2/invntt.S @@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm6 @@ -165,9 +165,9 @@ ret .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3 vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm5,%ymm5 @@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 vpsrlq $32,%ymm7,%ymm7 #store -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3 vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm6,%ymm3,%ymm6 diff --git a/crypto_sign/dilithium3/avx2/ntt.S b/crypto_sign/dilithium3/avx2/ntt.S index db959478..4cb18d8b 100644 --- a/crypto_sign/dilithium3/avx2/ntt.S +++ b/crypto_sign/dilithium3/avx2/ntt.S @@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 level0: #zetas @@ -95,9 +95,9 @@ ret .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 diff --git a/crypto_sign/dilithium3/avx2/nttconsts.c b/crypto_sign/dilithium3/avx2/nttconsts.c index 12d9ceac..2bd3b20b 100644 --- a/crypto_sign/dilithium3/avx2/nttconsts.c +++ b/crypto_sign/dilithium3/avx2/nttconsts.c @@ -5,19 +5,19 @@ #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) -const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, - 256 * Q - } - }; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, - 0x7FFFFF, 0x7FFFFF - } - }; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, + 256 * Q + } + }; +const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, + 0x7FFFFF, 0x7FFFFF + } + }; +const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; #undef QINV #undef MONT diff --git a/crypto_sign/dilithium3/avx2/nttconsts.h b/crypto_sign/dilithium3/avx2/nttconsts.h index ed8df189..caf6945d 100644 --- a/crypto_sign/dilithium3/avx2/nttconsts.h +++ b/crypto_sign/dilithium3/avx2/nttconsts.h @@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t; typedef ALIGNED_UINT32(N) aligned_uint32xN_t; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv; extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas; extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv; diff --git a/crypto_sign/dilithium3/avx2/pointwise.S b/crypto_sign/dilithium3/avx2/pointwise.S index 4aca6373..b0085373 100644 --- a/crypto_sign/dilithium3/avx2/pointwise.S +++ b/crypto_sign/dilithium3/avx2/pointwise.S @@ -4,8 +4,8 @@ .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop1: @@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop2: diff --git a/crypto_sign/dilithium3/avx2/poly.c b/crypto_sign/dilithium3/avx2/poly.c index 3f4223c3..f6876f81 100644 --- a/crypto_sign/dilithium3/avx2/poly.c +++ b/crypto_sign/dilithium3/avx2/poly.c @@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) { **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { __m256i vec0, vec1; - const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec); + const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec); for (size_t i = 0; i < N / 8; i++) { vec0 = _mm256_load_si256(&a->coeffs_x8[i]); diff --git a/crypto_sign/dilithium3/avx2/reduce.S b/crypto_sign/dilithium3/avx2/reduce.S index 1847274f..cef9a7a4 100644 --- a/crypto_sign/dilithium3/avx2/reduce.S +++ b/crypto_sign/dilithium3/avx2/reduce.S @@ -3,7 +3,7 @@ .global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0 xor %eax,%eax _looptop_rdc32: @@ -51,7 +51,7 @@ ret .global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0 xor %eax,%eax _looptop_csubq: diff --git a/crypto_sign/dilithium4/avx2/invntt.S b/crypto_sign/dilithium4/avx2/invntt.S index 2e8a4c02..6c94513b 100644 --- a/crypto_sign/dilithium4/avx2/invntt.S +++ b/crypto_sign/dilithium4/avx2/invntt.S @@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm6 @@ -165,9 +165,9 @@ ret .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 @@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3 butterfly 4,5,6,7,8,9,10,11,3,3 #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3 vpmuludq %ymm3,%ymm4,%ymm4 vpmuludq %ymm3,%ymm5,%ymm5 @@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6 vpsrlq $32,%ymm7,%ymm7 #store -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3 vpermd %ymm4,%ymm3,%ymm4 vpermd %ymm5,%ymm3,%ymm5 vpermd %ymm6,%ymm3,%ymm6 diff --git a/crypto_sign/dilithium4/avx2/ntt.S b/crypto_sign/dilithium4/avx2/ntt.S index f76fc616..b3f499bc 100644 --- a/crypto_sign/dilithium4/avx2/ntt.S +++ b/crypto_sign/dilithium4/avx2/ntt.S @@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 level0: #zetas @@ -95,9 +95,9 @@ ret .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 #load vmovdqa (%rsi),%ymm4 diff --git a/crypto_sign/dilithium4/avx2/nttconsts.c b/crypto_sign/dilithium4/avx2/nttconsts.c index 7bf71ea7..20aa2120 100644 --- a/crypto_sign/dilithium4/avx2/nttconsts.c +++ b/crypto_sign/dilithium4/avx2/nttconsts.c @@ -5,19 +5,19 @@ #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) -const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, - 256 * Q - } - }; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, - 0x7FFFFF, 0x7FFFFF - } - }; -const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, + 256 * Q + } + }; +const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; +const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, + 0x7FFFFF, 0x7FFFFF + } + }; +const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; #undef QINV #undef MONT diff --git a/crypto_sign/dilithium4/avx2/nttconsts.h b/crypto_sign/dilithium4/avx2/nttconsts.h index 1904a67b..5a340bbf 100644 --- a/crypto_sign/dilithium4/avx2/nttconsts.h +++ b/crypto_sign/dilithium4/avx2/nttconsts.h @@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t; typedef ALIGNED_UINT32(N) aligned_uint32xN_t; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones; -extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones; +extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv; extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas; extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv; diff --git a/crypto_sign/dilithium4/avx2/pointwise.S b/crypto_sign/dilithium4/avx2/pointwise.S index 9f61dd58..a9d3ddd3 100644 --- a/crypto_sign/dilithium4/avx2/pointwise.S +++ b/crypto_sign/dilithium4/avx2/pointwise.S @@ -4,8 +4,8 @@ .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop1: @@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 xor %eax,%eax _looptop2: diff --git a/crypto_sign/dilithium4/avx2/poly.c b/crypto_sign/dilithium4/avx2/poly.c index 6fde3258..f5d28f56 100644 --- a/crypto_sign/dilithium4/avx2/poly.c +++ b/crypto_sign/dilithium4/avx2/poly.c @@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) { **************************************************/ void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { __m256i vec0, vec1; - const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec); + const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec); for (size_t i = 0; i < N / 8; i++) { vec0 = _mm256_load_si256(&a->coeffs_x8[i]); diff --git a/crypto_sign/dilithium4/avx2/reduce.S b/crypto_sign/dilithium4/avx2/reduce.S index c6d226f5..ae1dcdad 100644 --- a/crypto_sign/dilithium4/avx2/reduce.S +++ b/crypto_sign/dilithium4/avx2/reduce.S @@ -3,7 +3,7 @@ .global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0 xor %eax,%eax _looptop_rdc32: @@ -51,7 +51,7 @@ ret .global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx) cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx): #consts -vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0 +vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0 xor %eax,%eax _looptop_csubq: