Browse Source

Merge pull request #335 from jschanck/dilithium

Fix dilithium namespacing issues
master
Thom Wiggers 4 years ago
committed by GitHub
parent
commit
fda6416b35
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 123 additions and 123 deletions
  1. +8
    -8
      crypto_sign/dilithium2/avx2/invntt.S
  2. +6
    -6
      crypto_sign/dilithium2/avx2/ntt.S
  3. +13
    -13
      crypto_sign/dilithium2/avx2/nttconsts.c
  4. +7
    -7
      crypto_sign/dilithium2/avx2/nttconsts.h
  5. +4
    -4
      crypto_sign/dilithium2/avx2/pointwise.S
  6. +1
    -1
      crypto_sign/dilithium2/avx2/poly.c
  7. +2
    -2
      crypto_sign/dilithium2/avx2/reduce.S
  8. +8
    -8
      crypto_sign/dilithium3/avx2/invntt.S
  9. +6
    -6
      crypto_sign/dilithium3/avx2/ntt.S
  10. +13
    -13
      crypto_sign/dilithium3/avx2/nttconsts.c
  11. +7
    -7
      crypto_sign/dilithium3/avx2/nttconsts.h
  12. +4
    -4
      crypto_sign/dilithium3/avx2/pointwise.S
  13. +1
    -1
      crypto_sign/dilithium3/avx2/poly.c
  14. +2
    -2
      crypto_sign/dilithium3/avx2/reduce.S
  15. +8
    -8
      crypto_sign/dilithium4/avx2/invntt.S
  16. +6
    -6
      crypto_sign/dilithium4/avx2/ntt.S
  17. +13
    -13
      crypto_sign/dilithium4/avx2/nttconsts.c
  18. +7
    -7
      crypto_sign/dilithium4/avx2/nttconsts.h
  19. +4
    -4
      crypto_sign/dilithium4/avx2/pointwise.S
  20. +1
    -1
      crypto_sign/dilithium4/avx2/poly.c
  21. +2
    -2
      crypto_sign/dilithium4/avx2/reduce.S

+ 8
- 8
crypto_sign/dilithium2/avx2/invntt.S View File

@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3

#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3

vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7

#store
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6


+ 6
- 6
crypto_sign/dilithium2/avx2/ntt.S View File

@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2

level0:
#zetas
@@ -95,9 +95,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4


+ 13
- 13
crypto_sign/dilithium2/avx2/nttconsts.c View File

@@ -5,19 +5,19 @@
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

#undef QINV
#undef MONT


+ 7
- 7
crypto_sign/dilithium2/avx2/nttconsts.h View File

@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv;

extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;


+ 4
- 4
crypto_sign/dilithium2/avx2/pointwise.S View File

@@ -4,8 +4,8 @@
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1

xor %eax,%eax
_looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1

xor %eax,%eax
_looptop2:


+ 1
- 1
crypto_sign/dilithium2/avx2/poly.c View File

@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
__m256i vec0, vec1;
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);

for (size_t i = 0; i < N / 8; i++) {
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);


+ 2
- 2
crypto_sign/dilithium2/avx2/reduce.S View File

@@ -3,7 +3,7 @@
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0

xor %eax,%eax
_looptop_rdc32:
@@ -51,7 +51,7 @@ ret
.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0

xor %eax,%eax
_looptop_csubq:


+ 8
- 8
crypto_sign/dilithium3/avx2/invntt.S View File

@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3

#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3

vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7

#store
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6


+ 6
- 6
crypto_sign/dilithium3/avx2/ntt.S View File

@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2

level0:
#zetas
@@ -95,9 +95,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4


+ 13
- 13
crypto_sign/dilithium3/avx2/nttconsts.c View File

@@ -5,19 +5,19 @@
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

#undef QINV
#undef MONT


+ 7
- 7
crypto_sign/dilithium3/avx2/nttconsts.h View File

@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv;

extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv;


+ 4
- 4
crypto_sign/dilithium3/avx2/pointwise.S View File

@@ -4,8 +4,8 @@
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1

xor %eax,%eax
_looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1

xor %eax,%eax
_looptop2:


+ 1
- 1
crypto_sign/dilithium3/avx2/poly.c View File

@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
__m256i vec0, vec1;
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);

for (size_t i = 0; i < N / 8; i++) {
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);


+ 2
- 2
crypto_sign/dilithium3/avx2/reduce.S View File

@@ -3,7 +3,7 @@
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0

xor %eax,%eax
_looptop_rdc32:
@@ -51,7 +51,7 @@ ret
.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0

xor %eax,%eax
_looptop_csubq:


+ 8
- 8
crypto_sign/dilithium4/avx2/invntt.S View File

@@ -45,9 +45,9 @@ vpsrlq $32,%ymm\h3,%ymm\h3
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm6
@@ -165,9 +165,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
@@ -237,7 +237,7 @@ vpbroadcastd 24(%rdx),%ymm3
butterfly 4,5,6,7,8,9,10,11,3,3

#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3

vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
@@ -261,7 +261,7 @@ vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7

#store
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6


+ 6
- 6
crypto_sign/dilithium4/avx2/ntt.S View File

@@ -44,9 +44,9 @@ vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2

level0:
#zetas
@@ -95,9 +95,9 @@ ret
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4


+ 13
- 13
crypto_sign/dilithium4/avx2/nttconsts.c View File

@@ -5,19 +5,19 @@
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

#undef QINV
#undef MONT


+ 7
- 7
crypto_sign/dilithium4/avx2/nttconsts.h View File

@@ -12,13 +12,13 @@ typedef ALIGNED_UINT32(8) aligned_uint32x8_t;
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones;
extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv;

extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv;


+ 4
- 4
crypto_sign/dilithium4/avx2/pointwise.S View File

@@ -4,8 +4,8 @@
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1

xor %eax,%eax
_looptop1:
@@ -136,8 +136,8 @@ vpaddq %ymm9,%ymm5,%ymm5
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1

xor %eax,%eax
_looptop2:


+ 1
- 1
crypto_sign/dilithium4/avx2/poly.c View File

@@ -82,7 +82,7 @@ void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
**************************************************/
void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
__m256i vec0, vec1;
const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);
const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec);

for (size_t i = 0; i < N / 8; i++) {
vec0 = _mm256_load_si256(&a->coeffs_x8[i]);


+ 2
- 2
crypto_sign/dilithium4/avx2/reduce.S View File

@@ -3,7 +3,7 @@
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0

xor %eax,%eax
_looptop_rdc32:
@@ -51,7 +51,7 @@ ret
.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx)
cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx):
#consts
vmovdqa cdecl(_PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0
vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0

xor %eax,%eax
_looptop_csubq:


Loading…
Cancel
Save