From 52851284abfec5275602ca6a84bd2cbbf5de1651 Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Tue, 9 Feb 2021 11:07:00 -0500 Subject: [PATCH] dilithium/fips202x4: align _mm_storel_pd result using double on stack --- crypto_sign/dilithium2/avx2/fips202x4.c | 13 +++++++++---- crypto_sign/dilithium3/avx2/fips202x4.c | 13 +++++++++---- crypto_sign/dilithium5/avx2/fips202x4.c | 13 +++++++++---- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/crypto_sign/dilithium2/avx2/fips202x4.c b/crypto_sign/dilithium2/avx2/fips202x4.c index 53e72945..f923a043 100644 --- a/crypto_sign/dilithium2/avx2/fips202x4.c +++ b/crypto_sign/dilithium2/avx2/fips202x4.c @@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0, unsigned int r, __m256i s[25]) { unsigned int i; + double temp0, temp1; __m128d t; while (nblocks > 0) { PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); for (i = 0; i < r / 8; ++i) { t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); - _mm_storel_pd((double *)&out0[8 * i], t); - _mm_storeh_pd((double *)&out1[8 * i], t); + _mm_storel_pd(&temp0, t); + _mm_storeh_pd(&temp1, t); + memmove(&out0[8 * i], &temp0, sizeof(double)); + memmove(&out1[8 * i], &temp1, sizeof(double)); t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); - _mm_storel_pd((double *)&out2[8 * i], t); - _mm_storeh_pd((double *)&out3[8 * i], t); + _mm_storel_pd(&temp0, t); + _mm_storeh_pd(&temp1, t); + memmove(&out2[8 * i], &temp0, sizeof(double)); + memmove(&out3[8 * i], &temp1, sizeof(double)); } out0 += r; diff --git a/crypto_sign/dilithium3/avx2/fips202x4.c b/crypto_sign/dilithium3/avx2/fips202x4.c index bb4a3767..1f9eda81 100644 --- a/crypto_sign/dilithium3/avx2/fips202x4.c +++ b/crypto_sign/dilithium3/avx2/fips202x4.c @@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0, unsigned int r, __m256i s[25]) { unsigned int i; + double temp0, temp1; __m128d t; while (nblocks > 0) { PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants); for (i = 0; i < r / 8; ++i) { t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); - _mm_storel_pd((double *)&out0[8 * i], t); - _mm_storeh_pd((double *)&out1[8 * i], t); + _mm_storel_pd(&temp0, t); + _mm_storeh_pd(&temp1, t); + memmove(&out0[8 * i], &temp0, sizeof(double)); + memmove(&out1[8 * i], &temp1, sizeof(double)); t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); - _mm_storel_pd((double *)&out2[8 * i], t); - _mm_storeh_pd((double *)&out3[8 * i], t); + _mm_storel_pd(&temp0, t); + _mm_storeh_pd(&temp1, t); + memmove(&out2[8 * i], &temp0, sizeof(double)); + memmove(&out3[8 * i], &temp1, sizeof(double)); } out0 += r; diff --git a/crypto_sign/dilithium5/avx2/fips202x4.c b/crypto_sign/dilithium5/avx2/fips202x4.c index 6636b507..89abef7b 100644 --- a/crypto_sign/dilithium5/avx2/fips202x4.c +++ b/crypto_sign/dilithium5/avx2/fips202x4.c @@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0, unsigned int r, __m256i s[25]) { unsigned int i; + double temp0, temp1; __m128d t; while (nblocks > 0) { PQCLEAN_DILITHIUM5_AVX2_f1600x4(s, KeccakF_RoundConstants); for (i = 0; i < r / 8; ++i) { t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); - _mm_storel_pd((double *)&out0[8 * i], t); - _mm_storeh_pd((double *)&out1[8 * i], t); + _mm_storel_pd(&temp0, t); + _mm_storeh_pd(&temp1, t); + memmove(&out0[8 * i], &temp0, sizeof(double)); + memmove(&out1[8 * i], &temp1, sizeof(double)); t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); - _mm_storel_pd((double *)&out2[8 * i], t); - _mm_storeh_pd((double *)&out3[8 * i], t); + _mm_storel_pd(&temp0, t); + _mm_storeh_pd(&temp1, t); + memmove(&out2[8 * i], &temp0, sizeof(double)); + memmove(&out3[8 * i], &temp1, sizeof(double)); } out0 += r;