1
1
의 미러 https://github.com/henrydcase/pqc.git synced 2024-11-25 17:01:22 +00:00

dilithium: Remove leading underscore from some internal symbols

This commit is contained in:
John M. Schanck 2020-10-08 09:56:15 -04:00 committed by Kris Kwiatkowski
부모 351d17ae70
커밋 3db4fa4876
21개의 변경된 파일105개의 추가작업 그리고 105개의 파일을 삭제

파일 보기

@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm6
@ -165,9 +165,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm4
@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7
#store
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6

파일 보기

@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
level0:
#zetas
@ -95,9 +95,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm4

파일 보기

@ -5,19 +5,19 @@
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
#undef QINV
#undef MONT

파일 보기

@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;

파일 보기

@ -4,8 +4,8 @@
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax
_looptop1:
@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax
_looptop2:

파일 보기

@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
__m256i vec0, vec1;
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
for (size_t i = 0; i < N / 8; i++) {
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);

파일 보기

@ -3,7 +3,7 @@
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
xor %eax,%eax
_looptop_rdc32:
@ -51,7 +51,7 @@ ret
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
xor %eax,%eax
_looptop_csubq:

파일 보기

@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm6
@ -165,9 +165,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm4
@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7
#store
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6

파일 보기

@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
level0:
#zetas
@ -95,9 +95,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm4

파일 보기

@ -5,19 +5,19 @@
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
#undef QINV
#undef MONT

파일 보기

@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv;

파일 보기

@ -4,8 +4,8 @@
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax
_looptop1:
@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax
_looptop2:

파일 보기

@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
__m256i vec0, vec1;
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
for (size_t i = 0; i < N / 8; i++) {
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);

파일 보기

@ -3,7 +3,7 @@
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
xor %eax,%eax
_looptop_rdc32:
@ -51,7 +51,7 @@ ret
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
xor %eax,%eax
_looptop_csubq:

파일 보기

@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm6
@ -165,9 +165,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm4
@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7
#store
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6

파일 보기

@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
level0:
#zetas
@ -95,9 +95,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
#load
vmovdqa (%rsi),%ymm4

파일 보기

@ -5,19 +5,19 @@
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
#undef QINV
#undef MONT

파일 보기

@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv;

파일 보기

@ -4,8 +4,8 @@
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax
_looptop1:
@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
xor %eax,%eax
_looptop2:

파일 보기

@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
**************************************************/
void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
__m256i vec0, vec1;
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
for (size_t i = 0; i < N / 8; i++) {
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);

파일 보기

@ -3,7 +3,7 @@
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
xor %eax,%eax
_looptop_rdc32:
@ -51,7 +51,7 @@ ret
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
xor %eax,%eax
_looptop_csubq: