mirror of
https://github.com/henrydcase/pqc.git
synced 2024-11-22 23:48:58 +00:00
dilithium: Remove leading underscore from some internal symbols
This commit is contained in:
parent
351d17ae70
commit
3db4fa4876
@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm6
|
vmovdqa (%rsi),%ymm6
|
||||||
@ -165,9 +165,9 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
|
|||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
|
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
|
||||||
|
|
||||||
vpmuludq %ymm3,%ymm4,%ymm4
|
vpmuludq %ymm3,%ymm4,%ymm4
|
||||||
vpmuludq %ymm3,%ymm5,%ymm5
|
vpmuludq %ymm3,%ymm5,%ymm5
|
||||||
@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
|
|||||||
vpsrlq $32,%ymm7,%ymm7
|
vpsrlq $32,%ymm7,%ymm7
|
||||||
|
|
||||||
#store
|
#store
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
|
||||||
vpermd %ymm4,%ymm3,%ymm4
|
vpermd %ymm4,%ymm3,%ymm4
|
||||||
vpermd %ymm5,%ymm3,%ymm5
|
vpermd %ymm5,%ymm3,%ymm5
|
||||||
vpermd %ymm6,%ymm3,%ymm6
|
vpermd %ymm6,%ymm3,%ymm6
|
||||||
|
@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
level0:
|
level0:
|
||||||
#zetas
|
#zetas
|
||||||
@ -95,9 +95,9 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
|
@ -5,19 +5,19 @@
|
|||||||
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
|
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
|
||||||
|
|
||||||
|
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
|
||||||
256 * Q
|
256 * Q
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
|
||||||
0x7FFFFF, 0x7FFFFF
|
0x7FFFFF, 0x7FFFFF
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
|
||||||
|
|
||||||
#undef QINV
|
#undef QINV
|
||||||
#undef MONT
|
#undef MONT
|
||||||
|
@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
|
|||||||
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
|
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
|
||||||
|
|
||||||
|
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv;
|
||||||
|
|
||||||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
|
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
|
||||||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;
|
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop1:
|
_looptop1:
|
||||||
@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop2:
|
_looptop2:
|
||||||
|
@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
|
|||||||
**************************************************/
|
**************************************************/
|
||||||
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
|
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
|
||||||
__m256i vec0, vec1;
|
__m256i vec0, vec1;
|
||||||
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
|
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
|
||||||
|
|
||||||
for (size_t i = 0; i < N / 8; i++) {
|
for (size_t i = 0; i < N / 8; i++) {
|
||||||
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
|
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_rdc32:
|
_looptop_rdc32:
|
||||||
@ -51,7 +51,7 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
|
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
|
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_csubq:
|
_looptop_csubq:
|
||||||
|
@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm6
|
vmovdqa (%rsi),%ymm6
|
||||||
@ -165,9 +165,9 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
|
|||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
|
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
|
||||||
|
|
||||||
vpmuludq %ymm3,%ymm4,%ymm4
|
vpmuludq %ymm3,%ymm4,%ymm4
|
||||||
vpmuludq %ymm3,%ymm5,%ymm5
|
vpmuludq %ymm3,%ymm5,%ymm5
|
||||||
@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
|
|||||||
vpsrlq $32,%ymm7,%ymm7
|
vpsrlq $32,%ymm7,%ymm7
|
||||||
|
|
||||||
#store
|
#store
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
|
||||||
vpermd %ymm4,%ymm3,%ymm4
|
vpermd %ymm4,%ymm3,%ymm4
|
||||||
vpermd %ymm5,%ymm3,%ymm5
|
vpermd %ymm5,%ymm3,%ymm5
|
||||||
vpermd %ymm6,%ymm3,%ymm6
|
vpermd %ymm6,%ymm3,%ymm6
|
||||||
|
@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
level0:
|
level0:
|
||||||
#zetas
|
#zetas
|
||||||
@ -95,9 +95,9 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
|
@ -5,19 +5,19 @@
|
|||||||
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
|
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
|
||||||
|
|
||||||
|
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
|
||||||
256 * Q
|
256 * Q
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
|
||||||
0x7FFFFF, 0x7FFFFF
|
0x7FFFFF, 0x7FFFFF
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
|
||||||
|
|
||||||
#undef QINV
|
#undef QINV
|
||||||
#undef MONT
|
#undef MONT
|
||||||
|
@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
|
|||||||
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
|
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
|
||||||
|
|
||||||
|
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv;
|
||||||
|
|
||||||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas;
|
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas;
|
||||||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv;
|
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv;
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop1:
|
_looptop1:
|
||||||
@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop2:
|
_looptop2:
|
||||||
|
@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
|
|||||||
**************************************************/
|
**************************************************/
|
||||||
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
|
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
|
||||||
__m256i vec0, vec1;
|
__m256i vec0, vec1;
|
||||||
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
|
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
|
||||||
|
|
||||||
for (size_t i = 0; i < N / 8; i++) {
|
for (size_t i = 0; i < N / 8; i++) {
|
||||||
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
|
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_rdc32:
|
_looptop_rdc32:
|
||||||
@ -51,7 +51,7 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
|
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
|
cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_csubq:
|
_looptop_csubq:
|
||||||
|
@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm6
|
vmovdqa (%rsi),%ymm6
|
||||||
@ -165,9 +165,9 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
|
|||||||
butterfly 4,5,6,7,8,9,10,11,3,3
|
butterfly 4,5,6,7,8,9,10,11,3,3
|
||||||
|
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
|
||||||
|
|
||||||
vpmuludq %ymm3,%ymm4,%ymm4
|
vpmuludq %ymm3,%ymm4,%ymm4
|
||||||
vpmuludq %ymm3,%ymm5,%ymm5
|
vpmuludq %ymm3,%ymm5,%ymm5
|
||||||
@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
|
|||||||
vpsrlq $32,%ymm7,%ymm7
|
vpsrlq $32,%ymm7,%ymm7
|
||||||
|
|
||||||
#store
|
#store
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
|
||||||
vpermd %ymm4,%ymm3,%ymm4
|
vpermd %ymm4,%ymm3,%ymm4
|
||||||
vpermd %ymm5,%ymm3,%ymm5
|
vpermd %ymm5,%ymm3,%ymm5
|
||||||
vpermd %ymm6,%ymm3,%ymm6
|
vpermd %ymm6,%ymm3,%ymm6
|
||||||
|
@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
level0:
|
level0:
|
||||||
#zetas
|
#zetas
|
||||||
@ -95,9 +95,9 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
|
||||||
|
|
||||||
#load
|
#load
|
||||||
vmovdqa (%rsi),%ymm4
|
vmovdqa (%rsi),%ymm4
|
||||||
|
@ -5,19 +5,19 @@
|
|||||||
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
|
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
|
||||||
|
|
||||||
|
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
|
||||||
256 * Q
|
256 * Q
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
|
||||||
0x7FFFFF, 0x7FFFFF
|
0x7FFFFF, 0x7FFFFF
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
|
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
|
||||||
|
|
||||||
#undef QINV
|
#undef QINV
|
||||||
#undef MONT
|
#undef MONT
|
||||||
|
@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
|
|||||||
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
|
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
|
||||||
|
|
||||||
|
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones;
|
||||||
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv;
|
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv;
|
||||||
|
|
||||||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas;
|
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas;
|
||||||
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv;
|
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv;
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop1:
|
_looptop1:
|
||||||
@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop2:
|
_looptop2:
|
||||||
|
@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
|
|||||||
**************************************************/
|
**************************************************/
|
||||||
void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
|
void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
|
||||||
__m256i vec0, vec1;
|
__m256i vec0, vec1;
|
||||||
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
|
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
|
||||||
|
|
||||||
for (size_t i = 0; i < N / 8; i++) {
|
for (size_t i = 0; i < N / 8; i++) {
|
||||||
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
|
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_rdc32:
|
_looptop_rdc32:
|
||||||
@ -51,7 +51,7 @@ ret
|
|||||||
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
|
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
|
||||||
cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
|
cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
|
||||||
#consts
|
#consts
|
||||||
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
|
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
_looptop_csubq:
|
_looptop_csubq:
|
||||||
|
Loading…
Reference in New Issue
Block a user