Merge pull request #335 from jschanck/dilithium
Fix dilithium namespacing issues
commit fda6416b35
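The diff below renames every exported Dilithium AVX2 constant from `_PQCLEAN_DILITHIUMx_AVX2_*` to `PQCLEAN_DILITHIUMx_AVX2_*` (dropping the leading underscore) in the assembly, the constant definitions, and the headers. A minimal C sketch of the naming pattern follows; it assumes the motivation is that every exported global must carry the `PQCLEAN_<PARAMSET>_<IMPL>_` namespace prefix (file-scope identifiers beginning with an underscore are also reserved in C). The type and value here are simplified stand-ins, not the repository's definitions.

/* Sketch only: simplified stand-in for the renamed constants, assuming the
 * goal is a clean PQCLEAN_DILITHIUM2_AVX2_ namespace for all globals. */
#include <stdint.h>

#define Q 8380417u /* Dilithium modulus */

typedef union {
    uint32_t as_arr[8];
} aligned_uint32x8_t;

/* old name (removed in the diff):  _PQCLEAN_DILITHIUM2_AVX2_8xq  */
/* new name (added in the diff):     PQCLEAN_DILITHIUM2_AVX2_8xq  */
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq =
    {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};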
@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
 butterfly 4,5,6,7,8,9,10,11,3,3

 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3

 vpmuludq %ymm3,%ymm4,%ymm4
 vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
 vpsrlq $32,%ymm7,%ymm7

 #store
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
 vpermd %ymm4,%ymm3,%ymm4
 vpermd %ymm5,%ymm3,%ymm5
 vpermd %ymm6,%ymm3,%ymm6
@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2

 level0:
 #zetas
@@ -95,9 +95,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm4
@@ -5,19 +5,19 @@
 #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
-        256 * Q
-    }
-};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
-        0x7FFFFF, 0x7FFFFF
-    }
-};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
+        256 * Q
+    }
+};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
+        0x7FFFFF, 0x7FFFFF
+    }
+};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

 #undef QINV
 #undef MONT
@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
 typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv;

 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;
@@ -4,8 +4,8 @@
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1

 xor %eax,%eax
 _looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1

 xor %eax,%eax
 _looptop2:
@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
 **************************************************/
 void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
     __m256i vec0, vec1;
-    const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
+    const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);

     for (size_t i = 0; i < N / 8; i++) {
         vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
@@ -3,7 +3,7 @@
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0

 xor %eax,%eax
 _looptop_rdc32:
@@ -51,7 +51,7 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
 cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0

 xor %eax,%eax
 _looptop_csubq:
@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
 butterfly 4,5,6,7,8,9,10,11,3,3

 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3

 vpmuludq %ymm3,%ymm4,%ymm4
 vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
 vpsrlq $32,%ymm7,%ymm7

 #store
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
 vpermd %ymm4,%ymm3,%ymm4
 vpermd %ymm5,%ymm3,%ymm5
 vpermd %ymm6,%ymm3,%ymm6
@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2

 level0:
 #zetas
@@ -95,9 +95,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm4
@@ -5,19 +5,19 @@
 #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
-        256 * Q
-    }
-};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
-        0x7FFFFF, 0x7FFFFF
-    }
-};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
+        256 * Q
+    }
+};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
+        0x7FFFFF, 0x7FFFFF
+    }
+};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

 #undef QINV
 #undef MONT
@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
 typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv;

 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv;
@@ -4,8 +4,8 @@
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1

 xor %eax,%eax
 _looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1

 xor %eax,%eax
 _looptop2:
@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
 **************************************************/
 void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
     __m256i vec0, vec1;
-    const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
+    const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);

     for (size_t i = 0; i < N / 8; i++) {
         vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
@@ -3,7 +3,7 @@
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0

 xor %eax,%eax
 _looptop_rdc32:
@@ -51,7 +51,7 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
 cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0

 xor %eax,%eax
 _looptop_csubq:
@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
 butterfly 4,5,6,7,8,9,10,11,3,3

 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3

 vpmuludq %ymm3,%ymm4,%ymm4
 vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
 vpsrlq $32,%ymm7,%ymm7

 #store
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
 vpermd %ymm4,%ymm3,%ymm4
 vpermd %ymm5,%ymm3,%ymm5
 vpermd %ymm6,%ymm3,%ymm6
@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2

 level0:
 #zetas
@@ -95,9 +95,9 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2

 #load
 vmovdqa (%rsi),%ymm4
@@ -5,19 +5,19 @@
 #define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
-        256 * Q
-    }
-};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
-        0x7FFFFF, 0x7FFFFF
-    }
-};
-const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
+        256 * Q
+    }
+};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
+        0x7FFFFF, 0x7FFFFF
+    }
+};
+const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

 #undef QINV
 #undef MONT
@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
 typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones;
-extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones;
+extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv;

 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas;
 extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv;
@@ -4,8 +4,8 @@
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1

 xor %eax,%eax
 _looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1

 xor %eax,%eax
 _looptop2:
@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
 **************************************************/
 void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
     __m256i vec0, vec1;
-    const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
+    const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);

     for (size_t i = 0; i < N / 8; i++) {
         vec0 = _mm256_load_si256(&a->coeffs_x8[i]);
@@ -3,7 +3,7 @@
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0

 xor %eax,%eax
 _looptop_rdc32:
@@ -51,7 +51,7 @@ ret
 .global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
 cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
 #consts
-vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
+vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0

 xor %eax,%eax
 _looptop_csubq: