@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
 butterfly 4,5,6,7,8,9,10,11,3,3
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
 vpmuludq %ymm3,%ymm4,%ymm4
 vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
 vpsrlq $32,%ymm7,%ymm7
 #store
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
 vpermd %ymm4,%ymm3,%ymm4
 vpermd %ymm5,%ymm3,%ymm5
 vpermd %ymm6,%ymm3,%ymm6
@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
 level0:
 #zetas
@@ -95,9 +95,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm4
@@ -5,19 +5,19 @@
 #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
 256 * Q
 }
 };
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
 0x7FFFFF, 0x7FFFFF
 }
 };
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
 #undef QINV
 #undef MONT
@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
 typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;
@@ -4,8 +4,8 @@
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
 xor %eax,%eax
 _looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
 xor %eax,%eax
 _looptop2:
@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
 **************************************************/
 void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
 __m256i vec0, vec1;
-const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
+const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
 for (size_t i = 0; i < N / 8; i++) {
 vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
@@ -3,7 +3,7 @@
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
 xor %eax,%eax
 _looptop_rdc32:
@@ -51,7 +51,7 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
 xor %eax,%eax
 _looptop_csubq:
@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
 butterfly 4,5,6,7,8,9,10,11,3,3
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
 vpmuludq %ymm3,%ymm4,%ymm4
 vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
 vpsrlq $32,%ymm7,%ymm7
 #store
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
 vpermd %ymm4,%ymm3,%ymm4
 vpermd %ymm5,%ymm3,%ymm5
 vpermd %ymm6,%ymm3,%ymm6
@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
 level0:
 #zetas
@@ -95,9 +95,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm4
@@ -5,19 +5,19 @@
 #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
 256 * Q
 }
 };
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
 0x7FFFFF, 0x7FFFFF
 }
 };
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
 #undef QINV
 #undef MONT
@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
 typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv;
@@ -4,8 +4,8 @@
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
 xor %eax,%eax
 _looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
 xor %eax,%eax
 _looptop2:
@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
 **************************************************/
 void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
 __m256i vec0, vec1;
-const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
+const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
 for (size_t i = 0; i < N / 8; i++) {
 vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
@@ -3,7 +3,7 @@
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
 xor %eax,%eax
 _looptop_rdc32:
@@ -51,7 +51,7 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
 xor %eax,%eax
 _looptop_csubq:
@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
 butterfly 4,5,6,7,8,9,10,11,3,3
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
 vpmuludq %ymm3,%ymm4,%ymm4
 vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
 vpsrlq $32,%ymm7,%ymm7
 #store
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
 vpermd %ymm4,%ymm3,%ymm4
 vpermd %ymm5,%ymm3,%ymm5
 vpermd %ymm6,%ymm3,%ymm6
@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
 level0:
 #zetas
@@ -95,9 +95,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
 #load
 vmovdqa (%rsi),%ymm4
@@ -5,19 +5,19 @@
 #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
 256 * Q
 }
 };
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
 0x7FFFFF, 0x7FFFFF
 }
 };
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
 #undef QINV
 #undef MONT
@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
 typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv;
@@ -4,8 +4,8 @@
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
 xor %eax,%eax
 _looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
 xor %eax,%eax
 _looptop2:
@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
 **************************************************/
 void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
 __m256i vec0, vec1;
-const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
+const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
 for (size_t i = 0; i < N / 8; i++) {
 vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
@@ -3,7 +3,7 @@
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
 xor %eax,%eax
 _looptop_rdc32:
@@ -51,7 +51,7 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
 xor %eax,%eax
 _looptop_csubq: