From dd00b7fbd89d373286ea8e474ffdbb171580da8d Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Fri, 30 Oct 2020 18:01:44 -0400 Subject: [PATCH] slightly faster avx2 schoolbook multiplications --- crypto_kem/firesaber/META.yml | 4 +- crypto_kem/firesaber/avx2/poly_mul.c | 1336 ++++++++++++------------- crypto_kem/lightsaber/META.yml | 4 +- crypto_kem/lightsaber/avx2/poly_mul.c | 1336 ++++++++++++------------- crypto_kem/saber/META.yml | 4 +- crypto_kem/saber/avx2/poly_mul.c | 1336 ++++++++++++------------- 6 files changed, 1968 insertions(+), 2052 deletions(-) diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index 24363a85..9e067250 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/firesaber/avx2/poly_mul.c b/crypto_kem/firesaber/avx2/poly_mul.c index d4e37d59..4d4ec959 100644 --- a/crypto_kem/firesaber/avx2/poly_mul.c +++ b/crypto_kem/firesaber/avx2/poly_mul.c @@ -4,701 +4,673 @@ #define L (SABER_N / 64) -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - -static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; +/* 16 word parallel multiply */ +#define mul(a, b) _mm256_mullo_epi16((a), (b)) +/* 16 word parallel multiply and accumulate */ +#define mac(a, b, c) _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c)) +static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = mul_add(a0, b0, c[0]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[1] = _mm256_add_epi16(temp, c[1]); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - temp = mul_add(a2, b0, temp); - c[2] = _mm256_add_epi16(temp, c[2]); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp = mul_add(a3, b0, temp); - c[3] = _mm256_add_epi16(temp, c[3]); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp = mul_add(a2, b2, temp); - c[4] = _mm256_add_epi16(temp, c[4]); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp = mul_add(a5, b0, temp); - c[5] = _mm256_add_epi16(temp, c[5]); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a4, b2, temp); - c[6] = _mm256_add_epi16(temp, c[6]); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a5, b2, temp); - c[7] = _mm256_add_epi16(temp, c[7]); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a6, b2, temp); - c[8] = _mm256_add_epi16(temp, c[8]); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a7, b2, temp); - c[9] = _mm256_add_epi16(temp, c[9]); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a[8], b2, temp); - c[10] = _mm256_add_epi16(temp, c[10]); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - temp = mul_add(a[9], b2, temp); - c[11] = _mm256_add_epi16(temp, c[11]); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - temp = mul_add(a[10], b2, temp); - c[12] = _mm256_add_epi16(temp, c[12]); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - temp = mul_add(a[11], b2, temp); - c[13] = _mm256_add_epi16(temp, c[13]); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - temp = mul_add(a[12], b2, temp); - c[14] = _mm256_add_epi16(temp, c[14]); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - temp = mul_add(a[13], b2, temp); - c[15] = _mm256_add_epi16(temp, c[15]); - - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - temp = mul_add(a1, b[1], temp); - c[16] = _mm256_add_epi16(temp, c[16]); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - temp = mul_add(a1, b[2], temp); - c[17] = _mm256_add_epi16(temp, c[17]); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - temp = mul_add(a1, b[3], temp); - c[18] = _mm256_add_epi16(temp, c[18]); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - temp = mul_add(a1, b[4], temp); - c[19] = _mm256_add_epi16(temp, c[19]); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - temp = mul_add(a1, b[5], temp); - c[20] = _mm256_add_epi16(temp, c[20]); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - temp = mul_add(a1, b[6], temp); - c[21] = _mm256_add_epi16(temp, c[21]); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - temp = mul_add(a1, b[7], temp); - c[22] = _mm256_add_epi16(temp, c[22]); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - temp = mul_add(a1, b7, temp); - c[23] = _mm256_add_epi16(temp, c[23]); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - temp = mul_add(a1, b6, temp); - c[24] = _mm256_add_epi16(temp, c[24]); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - temp = mul_add(a1, b5, temp); - c[25] = _mm256_add_epi16(temp, c[25]); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - temp = mul_add(a1, b4, temp); - c[26] = _mm256_add_epi16(temp, c[26]); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - temp = mul_add(a1, b3, temp); - c[27] = _mm256_add_epi16(temp, c[27]); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - temp = mul_add(a1, b2, temp); - c[28] = _mm256_add_epi16(temp, c[28]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[29] = _mm256_add_epi16(temp, c[29]); - - c[30] = mul_add(a1, b1, c[30]); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mul(a0, b0); + t0 = mul(a0, b1); + c[1] = mac(a1, b0, t0); + t0 = mul(a0, b2); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[5] = mac(a3, b2, t0); + c[6] = mul(a3, b3); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[9] = mac(a3, b2, t0); + c[10] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[13] = mac(a3, b2, t0); + c[14] = mul(a3, b3); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[17] = mac(a3, b2, t0); + c[18] = mul(a3, b3); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[21] = mac(a3, b2, t0); + c[22] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[25] = mac(a3, b2, t0); + c[26] = mul(a3, b3); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[29] = mac(a3, b2, t0); + c[30] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + c[31] = _mm256_setzero_si256(); } - -static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - +static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = _mm256_mullo_epi16(a0, b0); - - temp = _mm256_mullo_epi16(a0, b1); - c[1] = mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - c[2] = mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c[3] = mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c[4] = mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - c[7] = mul_add(a5, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - c[8] = mul_add(a6, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - c[9] = mul_add(a7, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - c[10] = mul_add(a[8], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - c[11] = mul_add(a[9], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - c[12] = mul_add(a[10], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - c[13] = mul_add(a[11], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - c[14] = mul_add(a[12], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - c[15] = mul_add(a[13], b2, temp); - - // unrolled second triangle - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - c[16] = mul_add(a1, b[1], temp); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - c[17] = mul_add(a1, b[2], temp); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - c[18] = mul_add(a1, b[3], temp); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - c[19] = mul_add(a1, b[4], temp); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - c[20] = mul_add(a1, b[5], temp); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - c[21] = mul_add(a1, b[6], temp); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - c[22] = mul_add(a1, b[7], temp); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - c[23] = mul_add(a1, b7, temp); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - c[24] = mul_add(a1, b6, temp); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - c[25] = mul_add(a1, b5, temp); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - c[26] = mul_add(a1, b4, temp); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - c[27] = mul_add(a1, b3, temp); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - c[28] = mul_add(a1, b2, temp); - - temp = _mm256_mullo_epi16(a0, b1); - c[29] = mul_add(a1, b0, temp); - - c[30] = _mm256_mullo_epi16(a1, b1); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mac(a0, b0, c[0]); + t0 = mac(a0, b1, c[1]); + c[1] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[2]); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[3]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[4]); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[5]); + c[5] = mac(a3, b2, t0); + c[6] = mac(a3, b3, c[6]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[27]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[28]); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[29]); + c[29] = mac(a3, b2, t0); + c[30] = mac(a3, b3, c[30]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); } + static void transpose(__m256i *M) { __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; __m256i temp, temp0, temp1, temp2; @@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co //-----------------Forward transposes ends--------------------------------- if (accumulate == 0) { - schoolbook_avx(vc, va, vb); - schoolbook_avx(vc + 32, va + 16, vb + 16); - schoolbook_avx(vc + 64, va + 32, vb + 32); - schoolbook_avx(vc + 96, va + 48, vb + 48); + schoolbook16x16(vc, va, vb); + schoolbook16x16(vc + 32, va + 16, vb + 16); + schoolbook16x16(vc + 64, va + 32, vb + 32); + schoolbook16x16(vc + 96, va + 48, vb + 48); } else { - schoolbook_avx_acc(vc, va, vb); - schoolbook_avx_acc(vc + 32, va + 16, vb + 16); - schoolbook_avx_acc(vc + 64, va + 32, vb + 32); - schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + schoolbook16x16_acc(vc, va, vb); + schoolbook16x16_acc(vc + 32, va + 16, vb + 16); + schoolbook16x16_acc(vc + 64, va + 32, vb + 32); + schoolbook16x16_acc(vc + 96, va + 48, vb + 48); } } diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index ad9d6acc..ec0f7517 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/lightsaber/avx2/poly_mul.c b/crypto_kem/lightsaber/avx2/poly_mul.c index 9ae8de05..51504491 100644 --- a/crypto_kem/lightsaber/avx2/poly_mul.c +++ b/crypto_kem/lightsaber/avx2/poly_mul.c @@ -4,701 +4,673 @@ #define L (SABER_N / 64) -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - -static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; +/* 16 word parallel multiply */ +#define mul(a, b) _mm256_mullo_epi16((a), (b)) +/* 16 word parallel multiply and accumulate */ +#define mac(a, b, c) _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c)) +static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = mul_add(a0, b0, c[0]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[1] = _mm256_add_epi16(temp, c[1]); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - temp = mul_add(a2, b0, temp); - c[2] = _mm256_add_epi16(temp, c[2]); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp = mul_add(a3, b0, temp); - c[3] = _mm256_add_epi16(temp, c[3]); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp = mul_add(a2, b2, temp); - c[4] = _mm256_add_epi16(temp, c[4]); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp = mul_add(a5, b0, temp); - c[5] = _mm256_add_epi16(temp, c[5]); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a4, b2, temp); - c[6] = _mm256_add_epi16(temp, c[6]); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a5, b2, temp); - c[7] = _mm256_add_epi16(temp, c[7]); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a6, b2, temp); - c[8] = _mm256_add_epi16(temp, c[8]); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a7, b2, temp); - c[9] = _mm256_add_epi16(temp, c[9]); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a[8], b2, temp); - c[10] = _mm256_add_epi16(temp, c[10]); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - temp = mul_add(a[9], b2, temp); - c[11] = _mm256_add_epi16(temp, c[11]); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - temp = mul_add(a[10], b2, temp); - c[12] = _mm256_add_epi16(temp, c[12]); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - temp = mul_add(a[11], b2, temp); - c[13] = _mm256_add_epi16(temp, c[13]); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - temp = mul_add(a[12], b2, temp); - c[14] = _mm256_add_epi16(temp, c[14]); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - temp = mul_add(a[13], b2, temp); - c[15] = _mm256_add_epi16(temp, c[15]); - - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - temp = mul_add(a1, b[1], temp); - c[16] = _mm256_add_epi16(temp, c[16]); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - temp = mul_add(a1, b[2], temp); - c[17] = _mm256_add_epi16(temp, c[17]); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - temp = mul_add(a1, b[3], temp); - c[18] = _mm256_add_epi16(temp, c[18]); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - temp = mul_add(a1, b[4], temp); - c[19] = _mm256_add_epi16(temp, c[19]); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - temp = mul_add(a1, b[5], temp); - c[20] = _mm256_add_epi16(temp, c[20]); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - temp = mul_add(a1, b[6], temp); - c[21] = _mm256_add_epi16(temp, c[21]); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - temp = mul_add(a1, b[7], temp); - c[22] = _mm256_add_epi16(temp, c[22]); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - temp = mul_add(a1, b7, temp); - c[23] = _mm256_add_epi16(temp, c[23]); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - temp = mul_add(a1, b6, temp); - c[24] = _mm256_add_epi16(temp, c[24]); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - temp = mul_add(a1, b5, temp); - c[25] = _mm256_add_epi16(temp, c[25]); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - temp = mul_add(a1, b4, temp); - c[26] = _mm256_add_epi16(temp, c[26]); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - temp = mul_add(a1, b3, temp); - c[27] = _mm256_add_epi16(temp, c[27]); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - temp = mul_add(a1, b2, temp); - c[28] = _mm256_add_epi16(temp, c[28]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[29] = _mm256_add_epi16(temp, c[29]); - - c[30] = mul_add(a1, b1, c[30]); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mul(a0, b0); + t0 = mul(a0, b1); + c[1] = mac(a1, b0, t0); + t0 = mul(a0, b2); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[5] = mac(a3, b2, t0); + c[6] = mul(a3, b3); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[9] = mac(a3, b2, t0); + c[10] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[13] = mac(a3, b2, t0); + c[14] = mul(a3, b3); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[17] = mac(a3, b2, t0); + c[18] = mul(a3, b3); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[21] = mac(a3, b2, t0); + c[22] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[25] = mac(a3, b2, t0); + c[26] = mul(a3, b3); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[29] = mac(a3, b2, t0); + c[30] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + c[31] = _mm256_setzero_si256(); } - -static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - +static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = _mm256_mullo_epi16(a0, b0); - - temp = _mm256_mullo_epi16(a0, b1); - c[1] = mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - c[2] = mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c[3] = mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c[4] = mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - c[7] = mul_add(a5, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - c[8] = mul_add(a6, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - c[9] = mul_add(a7, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - c[10] = mul_add(a[8], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - c[11] = mul_add(a[9], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - c[12] = mul_add(a[10], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - c[13] = mul_add(a[11], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - c[14] = mul_add(a[12], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - c[15] = mul_add(a[13], b2, temp); - - // unrolled second triangle - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - c[16] = mul_add(a1, b[1], temp); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - c[17] = mul_add(a1, b[2], temp); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - c[18] = mul_add(a1, b[3], temp); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - c[19] = mul_add(a1, b[4], temp); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - c[20] = mul_add(a1, b[5], temp); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - c[21] = mul_add(a1, b[6], temp); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - c[22] = mul_add(a1, b[7], temp); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - c[23] = mul_add(a1, b7, temp); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - c[24] = mul_add(a1, b6, temp); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - c[25] = mul_add(a1, b5, temp); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - c[26] = mul_add(a1, b4, temp); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - c[27] = mul_add(a1, b3, temp); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - c[28] = mul_add(a1, b2, temp); - - temp = _mm256_mullo_epi16(a0, b1); - c[29] = mul_add(a1, b0, temp); - - c[30] = _mm256_mullo_epi16(a1, b1); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mac(a0, b0, c[0]); + t0 = mac(a0, b1, c[1]); + c[1] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[2]); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[3]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[4]); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[5]); + c[5] = mac(a3, b2, t0); + c[6] = mac(a3, b3, c[6]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[27]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[28]); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[29]); + c[29] = mac(a3, b2, t0); + c[30] = mac(a3, b3, c[30]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); } + static void transpose(__m256i *M) { __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; __m256i temp, temp0, temp1, temp2; @@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co //-----------------Forward transposes ends--------------------------------- if (accumulate == 0) { - schoolbook_avx(vc, va, vb); - schoolbook_avx(vc + 32, va + 16, vb + 16); - schoolbook_avx(vc + 64, va + 32, vb + 32); - schoolbook_avx(vc + 96, va + 48, vb + 48); + schoolbook16x16(vc, va, vb); + schoolbook16x16(vc + 32, va + 16, vb + 16); + schoolbook16x16(vc + 64, va + 32, vb + 32); + schoolbook16x16(vc + 96, va + 48, vb + 48); } else { - schoolbook_avx_acc(vc, va, vb); - schoolbook_avx_acc(vc + 32, va + 16, vb + 16); - schoolbook_avx_acc(vc + 64, va + 32, vb + 32); - schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + schoolbook16x16_acc(vc, va, vb); + schoolbook16x16_acc(vc + 32, va + 16, vb + 16); + schoolbook16x16_acc(vc + 64, va + 32, vb + 32); + schoolbook16x16_acc(vc + 96, va + 48, vb + 48); } } diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index f6375c71..742d77c5 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/saber/avx2/poly_mul.c b/crypto_kem/saber/avx2/poly_mul.c index 5ec0aa73..2090e64f 100644 --- a/crypto_kem/saber/avx2/poly_mul.c +++ b/crypto_kem/saber/avx2/poly_mul.c @@ -4,701 +4,673 @@ #define L (SABER_N / 64) -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - -static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; +/* 16 word parallel multiply */ +#define mul(a, b) _mm256_mullo_epi16((a), (b)) +/* 16 word parallel multiply and accumulate */ +#define mac(a, b, c) _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c)) +static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = mul_add(a0, b0, c[0]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[1] = _mm256_add_epi16(temp, c[1]); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - temp = mul_add(a2, b0, temp); - c[2] = _mm256_add_epi16(temp, c[2]); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp = mul_add(a3, b0, temp); - c[3] = _mm256_add_epi16(temp, c[3]); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp = mul_add(a2, b2, temp); - c[4] = _mm256_add_epi16(temp, c[4]); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp = mul_add(a5, b0, temp); - c[5] = _mm256_add_epi16(temp, c[5]); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a4, b2, temp); - c[6] = _mm256_add_epi16(temp, c[6]); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a5, b2, temp); - c[7] = _mm256_add_epi16(temp, c[7]); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a6, b2, temp); - c[8] = _mm256_add_epi16(temp, c[8]); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a7, b2, temp); - c[9] = _mm256_add_epi16(temp, c[9]); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a[8], b2, temp); - c[10] = _mm256_add_epi16(temp, c[10]); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - temp = mul_add(a[9], b2, temp); - c[11] = _mm256_add_epi16(temp, c[11]); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - temp = mul_add(a[10], b2, temp); - c[12] = _mm256_add_epi16(temp, c[12]); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - temp = mul_add(a[11], b2, temp); - c[13] = _mm256_add_epi16(temp, c[13]); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - temp = mul_add(a[12], b2, temp); - c[14] = _mm256_add_epi16(temp, c[14]); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - temp = mul_add(a[13], b2, temp); - c[15] = _mm256_add_epi16(temp, c[15]); - - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - temp = mul_add(a1, b[1], temp); - c[16] = _mm256_add_epi16(temp, c[16]); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - temp = mul_add(a1, b[2], temp); - c[17] = _mm256_add_epi16(temp, c[17]); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - temp = mul_add(a1, b[3], temp); - c[18] = _mm256_add_epi16(temp, c[18]); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - temp = mul_add(a1, b[4], temp); - c[19] = _mm256_add_epi16(temp, c[19]); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - temp = mul_add(a1, b[5], temp); - c[20] = _mm256_add_epi16(temp, c[20]); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - temp = mul_add(a1, b[6], temp); - c[21] = _mm256_add_epi16(temp, c[21]); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - temp = mul_add(a1, b[7], temp); - c[22] = _mm256_add_epi16(temp, c[22]); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - temp = mul_add(a1, b7, temp); - c[23] = _mm256_add_epi16(temp, c[23]); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - temp = mul_add(a1, b6, temp); - c[24] = _mm256_add_epi16(temp, c[24]); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - temp = mul_add(a1, b5, temp); - c[25] = _mm256_add_epi16(temp, c[25]); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - temp = mul_add(a1, b4, temp); - c[26] = _mm256_add_epi16(temp, c[26]); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - temp = mul_add(a1, b3, temp); - c[27] = _mm256_add_epi16(temp, c[27]); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - temp = mul_add(a1, b2, temp); - c[28] = _mm256_add_epi16(temp, c[28]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[29] = _mm256_add_epi16(temp, c[29]); - - c[30] = mul_add(a1, b1, c[30]); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mul(a0, b0); + t0 = mul(a0, b1); + c[1] = mac(a1, b0, t0); + t0 = mul(a0, b2); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[5] = mac(a3, b2, t0); + c[6] = mul(a3, b3); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[9] = mac(a3, b2, t0); + c[10] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[13] = mac(a3, b2, t0); + c[14] = mul(a3, b3); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[17] = mac(a3, b2, t0); + c[18] = mul(a3, b3); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[21] = mac(a3, b2, t0); + c[22] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[25] = mac(a3, b2, t0); + c[26] = mul(a3, b3); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[29] = mac(a3, b2, t0); + c[30] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + c[31] = _mm256_setzero_si256(); } - -static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - +static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = _mm256_mullo_epi16(a0, b0); - - temp = _mm256_mullo_epi16(a0, b1); - c[1] = mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - c[2] = mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c[3] = mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c[4] = mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - c[7] = mul_add(a5, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - c[8] = mul_add(a6, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - c[9] = mul_add(a7, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - c[10] = mul_add(a[8], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - c[11] = mul_add(a[9], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - c[12] = mul_add(a[10], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - c[13] = mul_add(a[11], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - c[14] = mul_add(a[12], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - c[15] = mul_add(a[13], b2, temp); - - // unrolled second triangle - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - c[16] = mul_add(a1, b[1], temp); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - c[17] = mul_add(a1, b[2], temp); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - c[18] = mul_add(a1, b[3], temp); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - c[19] = mul_add(a1, b[4], temp); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - c[20] = mul_add(a1, b[5], temp); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - c[21] = mul_add(a1, b[6], temp); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - c[22] = mul_add(a1, b[7], temp); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - c[23] = mul_add(a1, b7, temp); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - c[24] = mul_add(a1, b6, temp); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - c[25] = mul_add(a1, b5, temp); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - c[26] = mul_add(a1, b4, temp); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - c[27] = mul_add(a1, b3, temp); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - c[28] = mul_add(a1, b2, temp); - - temp = _mm256_mullo_epi16(a0, b1); - c[29] = mul_add(a1, b0, temp); - - c[30] = _mm256_mullo_epi16(a1, b1); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mac(a0, b0, c[0]); + t0 = mac(a0, b1, c[1]); + c[1] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[2]); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[3]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[4]); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[5]); + c[5] = mac(a3, b2, t0); + c[6] = mac(a3, b3, c[6]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[27]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[28]); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[29]); + c[29] = mac(a3, b2, t0); + c[30] = mac(a3, b3, c[30]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); } + static void transpose(__m256i *M) { __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; __m256i temp, temp0, temp1, temp2; @@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co //-----------------Forward transposes ends--------------------------------- if (accumulate == 0) { - schoolbook_avx(vc, va, vb); - schoolbook_avx(vc + 32, va + 16, vb + 16); - schoolbook_avx(vc + 64, va + 32, vb + 32); - schoolbook_avx(vc + 96, va + 48, vb + 48); + schoolbook16x16(vc, va, vb); + schoolbook16x16(vc + 32, va + 16, vb + 16); + schoolbook16x16(vc + 64, va + 32, vb + 32); + schoolbook16x16(vc + 96, va + 48, vb + 48); } else { - schoolbook_avx_acc(vc, va, vb); - schoolbook_avx_acc(vc + 32, va + 16, vb + 16); - schoolbook_avx_acc(vc + 64, va + 32, vb + 32); - schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + schoolbook16x16_acc(vc, va, vb); + schoolbook16x16_acc(vc + 32, va + 16, vb + 16); + schoolbook16x16_acc(vc + 64, va + 32, vb + 32); + schoolbook16x16_acc(vc + 96, va + 48, vb + 48); } }