@@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0, | |||||
unsigned int r, | unsigned int r, | ||||
__m256i s[25]) { | __m256i s[25]) { | ||||
unsigned int i; | unsigned int i; | ||||
double temp0, temp1; | |||||
__m128d t; | __m128d t; | ||||
while (nblocks > 0) { | while (nblocks > 0) { | ||||
PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); | PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); | ||||
for (i = 0; i < r / 8; ++i) { | for (i = 0; i < r / 8; ++i) { | ||||
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | ||||
_mm_storel_pd((double *)&out0[8 * i], t); | |||||
_mm_storeh_pd((double *)&out1[8 * i], t); | |||||
_mm_storel_pd(&temp0, t); | |||||
_mm_storeh_pd(&temp1, t); | |||||
memmove(&out0[8 * i], &temp0, sizeof(double)); | |||||
memmove(&out1[8 * i], &temp1, sizeof(double)); | |||||
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | ||||
_mm_storel_pd((double *)&out2[8 * i], t); | |||||
_mm_storeh_pd((double *)&out3[8 * i], t); | |||||
_mm_storel_pd(&temp0, t); | |||||
_mm_storeh_pd(&temp1, t); | |||||
memmove(&out2[8 * i], &temp0, sizeof(double)); | |||||
memmove(&out3[8 * i], &temp1, sizeof(double)); | |||||
} | } | ||||
out0 += r; | out0 += r; | ||||
@@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0, | |||||
unsigned int r, | unsigned int r, | ||||
__m256i s[25]) { | __m256i s[25]) { | ||||
unsigned int i; | unsigned int i; | ||||
double temp0, temp1; | |||||
__m128d t; | __m128d t; | ||||
while (nblocks > 0) { | while (nblocks > 0) { | ||||
PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants); | PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants); | ||||
for (i = 0; i < r / 8; ++i) { | for (i = 0; i < r / 8; ++i) { | ||||
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | ||||
_mm_storel_pd((double *)&out0[8 * i], t); | |||||
_mm_storeh_pd((double *)&out1[8 * i], t); | |||||
_mm_storel_pd(&temp0, t); | |||||
_mm_storeh_pd(&temp1, t); | |||||
memmove(&out0[8 * i], &temp0, sizeof(double)); | |||||
memmove(&out1[8 * i], &temp1, sizeof(double)); | |||||
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | ||||
_mm_storel_pd((double *)&out2[8 * i], t); | |||||
_mm_storeh_pd((double *)&out3[8 * i], t); | |||||
_mm_storel_pd(&temp0, t); | |||||
_mm_storeh_pd(&temp1, t); | |||||
memmove(&out2[8 * i], &temp0, sizeof(double)); | |||||
memmove(&out3[8 * i], &temp1, sizeof(double)); | |||||
} | } | ||||
out0 += r; | out0 += r; | ||||
@@ -91,17 +91,22 @@ static void keccakx4_squeezeblocks(uint8_t *out0, | |||||
unsigned int r, | unsigned int r, | ||||
__m256i s[25]) { | __m256i s[25]) { | ||||
unsigned int i; | unsigned int i; | ||||
double temp0, temp1; | |||||
__m128d t; | __m128d t; | ||||
while (nblocks > 0) { | while (nblocks > 0) { | ||||
PQCLEAN_DILITHIUM5_AVX2_f1600x4(s, KeccakF_RoundConstants); | PQCLEAN_DILITHIUM5_AVX2_f1600x4(s, KeccakF_RoundConstants); | ||||
for (i = 0; i < r / 8; ++i) { | for (i = 0; i < r / 8; ++i) { | ||||
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | ||||
_mm_storel_pd((double *)&out0[8 * i], t); | |||||
_mm_storeh_pd((double *)&out1[8 * i], t); | |||||
_mm_storel_pd(&temp0, t); | |||||
_mm_storeh_pd(&temp1, t); | |||||
memmove(&out0[8 * i], &temp0, sizeof(double)); | |||||
memmove(&out1[8 * i], &temp1, sizeof(double)); | |||||
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | ||||
_mm_storel_pd((double *)&out2[8 * i], t); | |||||
_mm_storeh_pd((double *)&out3[8 * i], t); | |||||
_mm_storel_pd(&temp0, t); | |||||
_mm_storeh_pd(&temp1, t); | |||||
memmove(&out2[8 * i], &temp0, sizeof(double)); | |||||
memmove(&out3[8 * i], &temp1, sizeof(double)); | |||||
} | } | ||||
out0 += r; | out0 += r; | ||||