浏览代码

dilithium/fips202x4: align _mm_storel_pd result using double on stack

master^2
John M. Schanck 3 年前
父节点
当前提交
52851284ab
共有 3 个文件被更改，包括 27 次插入和 12 次删除
  1. +9
    -4
      crypto_sign/dilithium2/avx2/fips202x4.c
  2. +9
    -4
      crypto_sign/dilithium3/avx2/fips202x4.c
  3. +9
    -4
      crypto_sign/dilithium5/avx2/fips202x4.c

+ 9
- 4
crypto_sign/dilithium2/avx2/fips202x4.c 查看文件

@@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0,
 unsigned int r,
 __m256i s[25]) {
 unsigned int i;
+double temp0, temp1;
 __m128d t;

 while (nblocks > 0) {
 PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants);
 for (i = 0; i < r / 8; ++i) {
 t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
-_mm_storel_pd((double *)&out0[8 * i], t);
-_mm_storeh_pd((double *)&out1[8 * i], t);
+_mm_storel_pd(&temp0, t);
+_mm_storeh_pd(&temp1, t);
+memmove(&out0[8 * i], &temp0, sizeof(double));
+memmove(&out1[8 * i], &temp1, sizeof(double));
 t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1));
-_mm_storel_pd((double *)&out2[8 * i], t);
-_mm_storeh_pd((double *)&out3[8 * i], t);
+_mm_storel_pd(&temp0, t);
+_mm_storeh_pd(&temp1, t);
+memmove(&out2[8 * i], &temp0, sizeof(double));
+memmove(&out3[8 * i], &temp1, sizeof(double));
 }

 out0 += r;


+ 9
- 4
crypto_sign/dilithium3/avx2/fips202x4.c 查看文件

@@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0,
 unsigned int r,
 __m256i s[25]) {
 unsigned int i;
+double temp0, temp1;
 __m128d t;

 while (nblocks > 0) {
 PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants);
 for (i = 0; i < r / 8; ++i) {
 t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
-_mm_storel_pd((double *)&out0[8 * i], t);
-_mm_storeh_pd((double *)&out1[8 * i], t);
+_mm_storel_pd(&temp0, t);
+_mm_storeh_pd(&temp1, t);
+memmove(&out0[8 * i], &temp0, sizeof(double));
+memmove(&out1[8 * i], &temp1, sizeof(double));
 t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1));
-_mm_storel_pd((double *)&out2[8 * i], t);
-_mm_storeh_pd((double *)&out3[8 * i], t);
+_mm_storel_pd(&temp0, t);
+_mm_storeh_pd(&temp1, t);
+memmove(&out2[8 * i], &temp0, sizeof(double));
+memmove(&out3[8 * i], &temp1, sizeof(double));
 }

 out0 += r;


+ 9
- 4
crypto_sign/dilithium5/avx2/fips202x4.c 查看文件

@@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0,
 unsigned int r,
 __m256i s[25]) {
 unsigned int i;
+double temp0, temp1;
 __m128d t;

 while (nblocks > 0) {
 PQCLEAN_DILITHIUM5_AVX2_f1600x4(s, KeccakF_RoundConstants);
 for (i = 0; i < r / 8; ++i) {
 t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
-_mm_storel_pd((double *)&out0[8 * i], t);
-_mm_storeh_pd((double *)&out1[8 * i], t);
+_mm_storel_pd(&temp0, t);
+_mm_storeh_pd(&temp1, t);
+memmove(&out0[8 * i], &temp0, sizeof(double));
+memmove(&out1[8 * i], &temp1, sizeof(double));
 t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1));
-_mm_storel_pd((double *)&out2[8 * i], t);
-_mm_storeh_pd((double *)&out3[8 * i], t);
+_mm_storel_pd(&temp0, t);
+_mm_storeh_pd(&temp1, t);
+memmove(&out2[8 * i], &temp0, sizeof(double));
+memmove(&out3[8 * i], &temp1, sizeof(double));
 }

 out0 += r;


正在加载...
取消
保存