1497 lines
50 KiB
C
1497 lines
50 KiB
C
#include "SABER_params.h"
|
|
#include "poly.h"
|
|
|
|
|
|
#define L (SABER_N / 64)
|
|
|
|
/* 16 word parallel multiply */
|
|
#define mul(a, b) _mm256_mullo_epi16((a), (b))
|
|
/* 16 word parallel multiply and accumulate */
|
|
#define mac(a, b, c) _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c))
|
|
|
|
static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) {
|
|
__m256i a0, a1, a2, a3;
|
|
__m256i b0, b1, b2, b3;
|
|
__m256i t0;
|
|
a0 = a[0];
|
|
a1 = a[1];
|
|
a2 = a[2];
|
|
a3 = a[3];
|
|
b0 = b[0];
|
|
b1 = b[1];
|
|
b2 = b[2];
|
|
b3 = b[3];
|
|
c[0] = mul(a0, b0);
|
|
t0 = mul(a0, b1);
|
|
c[1] = mac(a1, b0, t0);
|
|
t0 = mul(a0, b2);
|
|
t0 = mac(a1, b1, t0);
|
|
c[2] = mac(a2, b0, t0);
|
|
t0 = mul(a0, b3);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[3] = mac(a3, b0, t0);
|
|
t0 = mul(a1, b3);
|
|
t0 = mac(a2, b2, t0);
|
|
c[4] = mac(a3, b1, t0);
|
|
t0 = mul(a2, b3);
|
|
c[5] = mac(a3, b2, t0);
|
|
c[6] = mul(a3, b3);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[4] = mac(a0, b0, c[4]);
|
|
t0 = mac(a0, b1, c[5]);
|
|
c[5] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[6]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[6] = mac(a2, b0, t0);
|
|
t0 = mul(a0, b3);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[7] = mac(a3, b0, t0);
|
|
t0 = mul(a1, b3);
|
|
t0 = mac(a2, b2, t0);
|
|
c[8] = mac(a3, b1, t0);
|
|
t0 = mul(a2, b3);
|
|
c[9] = mac(a3, b2, t0);
|
|
c[10] = mul(a3, b3);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[8] = mac(a0, b0, c[8]);
|
|
t0 = mac(a0, b1, c[9]);
|
|
c[9] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[10]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[10] = mac(a2, b0, t0);
|
|
t0 = mul(a0, b3);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[11] = mac(a3, b0, t0);
|
|
t0 = mul(a1, b3);
|
|
t0 = mac(a2, b2, t0);
|
|
c[12] = mac(a3, b1, t0);
|
|
t0 = mul(a2, b3);
|
|
c[13] = mac(a3, b2, t0);
|
|
c[14] = mul(a3, b3);
|
|
b0 = b[12];
|
|
b1 = b[13];
|
|
b2 = b[14];
|
|
b3 = b[15];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mul(a0, b3);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mul(a1, b3);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mul(a2, b3);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mul(a3, b3);
|
|
a0 = a[4];
|
|
a1 = a[5];
|
|
a2 = a[6];
|
|
a3 = a[7];
|
|
c[16] = mac(a0, b0, c[16]);
|
|
t0 = mac(a0, b1, c[17]);
|
|
c[17] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[18]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[18] = mac(a2, b0, t0);
|
|
t0 = mul(a0, b3);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[19] = mac(a3, b0, t0);
|
|
t0 = mul(a1, b3);
|
|
t0 = mac(a2, b2, t0);
|
|
c[20] = mac(a3, b1, t0);
|
|
t0 = mul(a2, b3);
|
|
c[21] = mac(a3, b2, t0);
|
|
c[22] = mul(a3, b3);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[15]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[16]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[17]);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mac(a3, b3, c[18]);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[8] = mac(a0, b0, c[8]);
|
|
t0 = mac(a0, b1, c[9]);
|
|
c[9] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[10]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[10] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[11]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[11] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[12]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[12] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[13]);
|
|
c[13] = mac(a3, b2, t0);
|
|
c[14] = mac(a3, b3, c[14]);
|
|
b0 = b[0];
|
|
b1 = b[1];
|
|
b2 = b[2];
|
|
b3 = b[3];
|
|
c[4] = mac(a0, b0, c[4]);
|
|
t0 = mac(a0, b1, c[5]);
|
|
c[5] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[6]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[6] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[7]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[7] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[8]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[8] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[9]);
|
|
c[9] = mac(a3, b2, t0);
|
|
c[10] = mac(a3, b3, c[10]);
|
|
a0 = a[8];
|
|
a1 = a[9];
|
|
a2 = a[10];
|
|
a3 = a[11];
|
|
c[8] = mac(a0, b0, c[8]);
|
|
t0 = mac(a0, b1, c[9]);
|
|
c[9] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[10]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[10] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[11]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[11] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[12]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[12] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[13]);
|
|
c[13] = mac(a3, b2, t0);
|
|
c[14] = mac(a3, b3, c[14]);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[15]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[16]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[17]);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mac(a3, b3, c[18]);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[16] = mac(a0, b0, c[16]);
|
|
t0 = mac(a0, b1, c[17]);
|
|
c[17] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[18]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[18] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[19]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[19] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[20]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[20] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[21]);
|
|
c[21] = mac(a3, b2, t0);
|
|
c[22] = mac(a3, b3, c[22]);
|
|
b0 = b[12];
|
|
b1 = b[13];
|
|
b2 = b[14];
|
|
b3 = b[15];
|
|
c[20] = mac(a0, b0, c[20]);
|
|
t0 = mac(a0, b1, c[21]);
|
|
c[21] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[22]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[22] = mac(a2, b0, t0);
|
|
t0 = mul(a0, b3);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[23] = mac(a3, b0, t0);
|
|
t0 = mul(a1, b3);
|
|
t0 = mac(a2, b2, t0);
|
|
c[24] = mac(a3, b1, t0);
|
|
t0 = mul(a2, b3);
|
|
c[25] = mac(a3, b2, t0);
|
|
c[26] = mul(a3, b3);
|
|
a0 = a[12];
|
|
a1 = a[13];
|
|
a2 = a[14];
|
|
a3 = a[15];
|
|
c[24] = mac(a0, b0, c[24]);
|
|
t0 = mac(a0, b1, c[25]);
|
|
c[25] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[26]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[26] = mac(a2, b0, t0);
|
|
t0 = mul(a0, b3);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[27] = mac(a3, b0, t0);
|
|
t0 = mul(a1, b3);
|
|
t0 = mac(a2, b2, t0);
|
|
c[28] = mac(a3, b1, t0);
|
|
t0 = mul(a2, b3);
|
|
c[29] = mac(a3, b2, t0);
|
|
c[30] = mul(a3, b3);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[20] = mac(a0, b0, c[20]);
|
|
t0 = mac(a0, b1, c[21]);
|
|
c[21] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[22]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[22] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[23]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[23] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[24]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[24] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[25]);
|
|
c[25] = mac(a3, b2, t0);
|
|
c[26] = mac(a3, b3, c[26]);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[16] = mac(a0, b0, c[16]);
|
|
t0 = mac(a0, b1, c[17]);
|
|
c[17] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[18]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[18] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[19]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[19] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[20]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[20] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[21]);
|
|
c[21] = mac(a3, b2, t0);
|
|
c[22] = mac(a3, b3, c[22]);
|
|
b0 = b[0];
|
|
b1 = b[1];
|
|
b2 = b[2];
|
|
b3 = b[3];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[15]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[16]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[17]);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mac(a3, b3, c[18]);
|
|
c[31] = _mm256_setzero_si256();
|
|
}
|
|
|
|
static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) {
|
|
__m256i a0, a1, a2, a3;
|
|
__m256i b0, b1, b2, b3;
|
|
__m256i t0;
|
|
a0 = a[0];
|
|
a1 = a[1];
|
|
a2 = a[2];
|
|
a3 = a[3];
|
|
b0 = b[0];
|
|
b1 = b[1];
|
|
b2 = b[2];
|
|
b3 = b[3];
|
|
c[0] = mac(a0, b0, c[0]);
|
|
t0 = mac(a0, b1, c[1]);
|
|
c[1] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[2]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[2] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[3]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[3] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[4]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[4] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[5]);
|
|
c[5] = mac(a3, b2, t0);
|
|
c[6] = mac(a3, b3, c[6]);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[4] = mac(a0, b0, c[4]);
|
|
t0 = mac(a0, b1, c[5]);
|
|
c[5] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[6]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[6] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[7]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[7] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[8]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[8] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[9]);
|
|
c[9] = mac(a3, b2, t0);
|
|
c[10] = mac(a3, b3, c[10]);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[8] = mac(a0, b0, c[8]);
|
|
t0 = mac(a0, b1, c[9]);
|
|
c[9] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[10]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[10] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[11]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[11] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[12]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[12] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[13]);
|
|
c[13] = mac(a3, b2, t0);
|
|
c[14] = mac(a3, b3, c[14]);
|
|
b0 = b[12];
|
|
b1 = b[13];
|
|
b2 = b[14];
|
|
b3 = b[15];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[15]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[16]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[17]);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mac(a3, b3, c[18]);
|
|
a0 = a[4];
|
|
a1 = a[5];
|
|
a2 = a[6];
|
|
a3 = a[7];
|
|
c[16] = mac(a0, b0, c[16]);
|
|
t0 = mac(a0, b1, c[17]);
|
|
c[17] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[18]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[18] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[19]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[19] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[20]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[20] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[21]);
|
|
c[21] = mac(a3, b2, t0);
|
|
c[22] = mac(a3, b3, c[22]);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[15]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[16]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[17]);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mac(a3, b3, c[18]);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[8] = mac(a0, b0, c[8]);
|
|
t0 = mac(a0, b1, c[9]);
|
|
c[9] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[10]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[10] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[11]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[11] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[12]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[12] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[13]);
|
|
c[13] = mac(a3, b2, t0);
|
|
c[14] = mac(a3, b3, c[14]);
|
|
b0 = b[0];
|
|
b1 = b[1];
|
|
b2 = b[2];
|
|
b3 = b[3];
|
|
c[4] = mac(a0, b0, c[4]);
|
|
t0 = mac(a0, b1, c[5]);
|
|
c[5] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[6]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[6] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[7]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[7] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[8]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[8] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[9]);
|
|
c[9] = mac(a3, b2, t0);
|
|
c[10] = mac(a3, b3, c[10]);
|
|
a0 = a[8];
|
|
a1 = a[9];
|
|
a2 = a[10];
|
|
a3 = a[11];
|
|
c[8] = mac(a0, b0, c[8]);
|
|
t0 = mac(a0, b1, c[9]);
|
|
c[9] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[10]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[10] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[11]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[11] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[12]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[12] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[13]);
|
|
c[13] = mac(a3, b2, t0);
|
|
c[14] = mac(a3, b3, c[14]);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[15]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[16]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[17]);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mac(a3, b3, c[18]);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[16] = mac(a0, b0, c[16]);
|
|
t0 = mac(a0, b1, c[17]);
|
|
c[17] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[18]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[18] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[19]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[19] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[20]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[20] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[21]);
|
|
c[21] = mac(a3, b2, t0);
|
|
c[22] = mac(a3, b3, c[22]);
|
|
b0 = b[12];
|
|
b1 = b[13];
|
|
b2 = b[14];
|
|
b3 = b[15];
|
|
c[20] = mac(a0, b0, c[20]);
|
|
t0 = mac(a0, b1, c[21]);
|
|
c[21] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[22]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[22] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[23]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[23] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[24]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[24] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[25]);
|
|
c[25] = mac(a3, b2, t0);
|
|
c[26] = mac(a3, b3, c[26]);
|
|
a0 = a[12];
|
|
a1 = a[13];
|
|
a2 = a[14];
|
|
a3 = a[15];
|
|
c[24] = mac(a0, b0, c[24]);
|
|
t0 = mac(a0, b1, c[25]);
|
|
c[25] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[26]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[26] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[27]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[27] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[28]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[28] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[29]);
|
|
c[29] = mac(a3, b2, t0);
|
|
c[30] = mac(a3, b3, c[30]);
|
|
b0 = b[8];
|
|
b1 = b[9];
|
|
b2 = b[10];
|
|
b3 = b[11];
|
|
c[20] = mac(a0, b0, c[20]);
|
|
t0 = mac(a0, b1, c[21]);
|
|
c[21] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[22]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[22] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[23]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[23] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[24]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[24] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[25]);
|
|
c[25] = mac(a3, b2, t0);
|
|
c[26] = mac(a3, b3, c[26]);
|
|
b0 = b[4];
|
|
b1 = b[5];
|
|
b2 = b[6];
|
|
b3 = b[7];
|
|
c[16] = mac(a0, b0, c[16]);
|
|
t0 = mac(a0, b1, c[17]);
|
|
c[17] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[18]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[18] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[19]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[19] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[20]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[20] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[21]);
|
|
c[21] = mac(a3, b2, t0);
|
|
c[22] = mac(a3, b3, c[22]);
|
|
b0 = b[0];
|
|
b1 = b[1];
|
|
b2 = b[2];
|
|
b3 = b[3];
|
|
c[12] = mac(a0, b0, c[12]);
|
|
t0 = mac(a0, b1, c[13]);
|
|
c[13] = mac(a1, b0, t0);
|
|
t0 = mac(a0, b2, c[14]);
|
|
t0 = mac(a1, b1, t0);
|
|
c[14] = mac(a2, b0, t0);
|
|
t0 = mac(a0, b3, c[15]);
|
|
t0 = mac(a1, b2, t0);
|
|
t0 = mac(a2, b1, t0);
|
|
c[15] = mac(a3, b0, t0);
|
|
t0 = mac(a1, b3, c[16]);
|
|
t0 = mac(a2, b2, t0);
|
|
c[16] = mac(a3, b1, t0);
|
|
t0 = mac(a2, b3, c[17]);
|
|
c[17] = mac(a3, b2, t0);
|
|
c[18] = mac(a3, b3, c[18]);
|
|
}
|
|
|
|
|
|
static void transpose(__m256i *M) {
|
|
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
|
|
__m256i temp, temp0, temp1, temp2;
|
|
|
|
r0 = _mm256_unpacklo_epi16(M[0], M[1]);
|
|
r1 = _mm256_unpacklo_epi16(M[2], M[3]);
|
|
r2 = _mm256_unpacklo_epi16(M[4], M[5]);
|
|
r3 = _mm256_unpacklo_epi16(M[6], M[7]);
|
|
r4 = _mm256_unpacklo_epi16(M[8], M[9]);
|
|
r5 = _mm256_unpacklo_epi16(M[10], M[11]);
|
|
r6 = _mm256_unpacklo_epi16(M[12], M[13]);
|
|
r7 = _mm256_unpacklo_epi16(M[14], M[15]);
|
|
|
|
temp = _mm256_unpacklo_epi32(r0, r1);
|
|
temp0 = _mm256_unpacklo_epi32(r2, r3);
|
|
temp1 = _mm256_unpacklo_epi32(r4, r5);
|
|
temp2 = _mm256_unpacklo_epi32(r6, r7);
|
|
|
|
r8 = _mm256_unpackhi_epi32(r0, r1);
|
|
r9 = _mm256_unpackhi_epi32(r2, r3);
|
|
r10 = _mm256_unpackhi_epi32(r4, r5);
|
|
r11 = _mm256_unpackhi_epi32(r6, r7);
|
|
|
|
r0 = _mm256_unpacklo_epi64(temp, temp0);
|
|
r2 = _mm256_unpackhi_epi64(temp, temp0);
|
|
r1 = _mm256_unpacklo_epi64(temp1, temp2);
|
|
r3 = _mm256_unpackhi_epi64(temp1, temp2);
|
|
|
|
temp = _mm256_unpackhi_epi16(M[0], M[1]);
|
|
temp0 = _mm256_unpackhi_epi16(M[2], M[3]);
|
|
temp1 = _mm256_unpackhi_epi16(M[4], M[5]);
|
|
temp2 = _mm256_unpackhi_epi16(M[6], M[7]);
|
|
|
|
r4 = _mm256_unpackhi_epi16(M[8], M[9]);
|
|
M[0] = _mm256_permute2f128_si256(r0, r1, 0x20);
|
|
M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
|
|
M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
|
|
M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
|
|
r5 = _mm256_unpackhi_epi16(M[10], M[11]);
|
|
r6 = _mm256_unpackhi_epi16(M[12], M[13]);
|
|
r7 = _mm256_unpackhi_epi16(M[14], M[15]);
|
|
|
|
r0 = _mm256_unpacklo_epi64(r8, r9);
|
|
r1 = _mm256_unpacklo_epi64(r10, r11);
|
|
r2 = _mm256_unpackhi_epi64(r8, r9);
|
|
r3 = _mm256_unpackhi_epi64(r10, r11);
|
|
|
|
M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
|
|
M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
|
|
M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
|
|
M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
|
|
|
|
r0 = _mm256_unpacklo_epi32(temp, temp0);
|
|
r1 = _mm256_unpacklo_epi32(temp1, temp2);
|
|
r2 = _mm256_unpacklo_epi32(r4, r5);
|
|
r3 = _mm256_unpacklo_epi32(r6, r7);
|
|
|
|
r8 = _mm256_unpacklo_epi64(r0, r1);
|
|
r10 = _mm256_unpackhi_epi64(r0, r1);
|
|
r9 = _mm256_unpacklo_epi64(r2, r3);
|
|
r11 = _mm256_unpackhi_epi64(r2, r3);
|
|
|
|
M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
|
|
M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
|
|
M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
|
|
M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
|
|
|
|
r0 = _mm256_unpackhi_epi32(temp, temp0);
|
|
r1 = _mm256_unpackhi_epi32(temp1, temp2);
|
|
r2 = _mm256_unpackhi_epi32(r4, r5);
|
|
r3 = _mm256_unpackhi_epi32(r6, r7);
|
|
|
|
r4 = _mm256_unpacklo_epi64(r0, r1);
|
|
r6 = _mm256_unpackhi_epi64(r0, r1);
|
|
r5 = _mm256_unpacklo_epi64(r2, r3);
|
|
r7 = _mm256_unpackhi_epi64(r2, r3);
|
|
|
|
M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
|
|
M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
|
|
M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
|
|
M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
|
|
}
|
|
|
|
static void batch_64coefficient_multiplications(toom4_points_product *c_eval, const __m256i *a, const toom4_points *b_eval, int accumulate) {
|
|
toom4_points a_eval;// Holds evaluation (a & b) for 7 Karatsuba at a time
|
|
__m256i r0_avx, r1_avx, r2_avx, r3_avx;
|
|
__m256i *va = (__m256i *)a_eval.coeffs;
|
|
__m256i *vb = (__m256i *)b_eval->coeffs;
|
|
__m256i *vc = (__m256i *)c_eval->coeffs;
|
|
|
|
//------------------AVX evaluation for 1st poly-----------------------
|
|
r0_avx = a[0 * L + 0];
|
|
r1_avx = a[0 * L + 1];
|
|
r2_avx = a[0 * L + 2];
|
|
r3_avx = a[0 * L + 3];
|
|
|
|
va[0] = r0_avx;
|
|
va[1] = r1_avx;
|
|
va[2] = r2_avx;
|
|
va[3] = r3_avx;
|
|
va[4] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
va[5] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
va[6] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
va[7] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
va[8] = _mm256_add_epi16(va[6], va[7]);
|
|
//------------------AVX evaluation for 1st poly ends------------------
|
|
|
|
//------------------AVX evaluation for 2nd poly-----------------------
|
|
r0_avx = a[1 * L + 0];
|
|
r1_avx = a[1 * L + 1];
|
|
r2_avx = a[1 * L + 2];
|
|
r3_avx = a[1 * L + 3];
|
|
|
|
va[0 + 9] = r0_avx;
|
|
va[1 + 9] = r1_avx;
|
|
va[2 + 9] = r2_avx;
|
|
va[3 + 9] = r3_avx;
|
|
va[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
va[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
va[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
va[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
va[8 + 9] = _mm256_add_epi16(va[6 + 9], va[7 + 9]);
|
|
//------------------AVX evaluation for 2nd poly ends------------------
|
|
|
|
//------------------AVX evaluation for 3rd poly-----------------------
|
|
r0_avx = a[2 * L + 0];
|
|
r1_avx = a[2 * L + 1];
|
|
r2_avx = a[2 * L + 2];
|
|
r3_avx = a[2 * L + 3];
|
|
|
|
va[0 + 18] = r0_avx;
|
|
va[1 + 18] = r1_avx;
|
|
va[2 + 18] = r2_avx;
|
|
va[3 + 18] = r3_avx;
|
|
va[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
va[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
va[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
va[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
va[8 + 18] = _mm256_add_epi16(va[6 + 18], va[7 + 18]);
|
|
//------------------AVX evaluation for 3rd poly ends------------------
|
|
|
|
//------------------AVX evaluation for 4th poly-----------------------
|
|
r0_avx = a[3 * L + 0];
|
|
r1_avx = a[3 * L + 1];
|
|
r2_avx = a[3 * L + 2];
|
|
r3_avx = a[3 * L + 3];
|
|
|
|
va[0 + 27] = r0_avx;
|
|
va[1 + 27] = r1_avx;
|
|
va[2 + 27] = r2_avx;
|
|
va[3 + 27] = r3_avx;
|
|
va[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
va[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
va[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
va[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
va[8 + 27] = _mm256_add_epi16(va[6 + 27], va[7 + 27]);
|
|
//------------------AVX evaluation for 4th poly ends------------------
|
|
|
|
//------------------AVX evaluation for 5th poly-----------------------
|
|
r0_avx = a[4 * L + 0];
|
|
r1_avx = a[4 * L + 1];
|
|
r2_avx = a[4 * L + 2];
|
|
r3_avx = a[4 * L + 3];
|
|
|
|
va[0 + 36] = r0_avx;
|
|
va[1 + 36] = r1_avx;
|
|
va[2 + 36] = r2_avx;
|
|
va[3 + 36] = r3_avx;
|
|
va[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
va[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
va[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
va[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
va[8 + 36] = _mm256_add_epi16(va[6 + 36], va[7 + 36]);
|
|
//------------------AVX evaluation for 5th poly ends------------------
|
|
|
|
//------------------AVX evaluation for 6th poly-----------------------
|
|
r0_avx = a[5 * L + 0];
|
|
r1_avx = a[5 * L + 1];
|
|
r2_avx = a[5 * L + 2];
|
|
r3_avx = a[5 * L + 3];
|
|
|
|
va[0 + 45] = r0_avx;
|
|
va[1 + 45] = r1_avx;
|
|
va[2 + 45] = r2_avx;
|
|
va[3 + 45] = r3_avx;
|
|
va[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
va[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
va[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
va[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
va[8 + 45] = _mm256_add_epi16(va[6 + 45], va[7 + 45]);
|
|
//------------------AVX evaluation for 6th poly ends------------------
|
|
|
|
//------------------AVX evaluation for 7th poly-----------------------
|
|
r0_avx = a[6 * L + 0];
|
|
r1_avx = a[6 * L + 1];
|
|
r2_avx = a[6 * L + 2];
|
|
r3_avx = a[6 * L + 3];
|
|
|
|
va[0 + 54] = r0_avx;
|
|
va[1 + 54] = r1_avx;
|
|
va[2 + 54] = r2_avx;
|
|
va[3 + 54] = r3_avx;
|
|
va[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
va[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
va[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
va[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
va[8 + 54] = _mm256_add_epi16(va[6 + 54], va[7 + 54]);
|
|
//------------------AVX evaluation for 7th poly ends------------------
|
|
|
|
//-----------------Forward transposes--------------------------------------
|
|
transpose(va);
|
|
transpose(va + 16);
|
|
transpose(va + 32);
|
|
transpose(va + 48);
|
|
//-----------------Forward transposes ends---------------------------------
|
|
|
|
if (accumulate == 0) {
|
|
schoolbook16x16(vc, va, vb);
|
|
schoolbook16x16(vc + 32, va + 16, vb + 16);
|
|
schoolbook16x16(vc + 64, va + 32, vb + 32);
|
|
schoolbook16x16(vc + 96, va + 48, vb + 48);
|
|
} else {
|
|
schoolbook16x16_acc(vc, va, vb);
|
|
schoolbook16x16_acc(vc + 32, va + 16, vb + 16);
|
|
schoolbook16x16_acc(vc + 64, va + 32, vb + 32);
|
|
schoolbook16x16_acc(vc + 96, va + 48, vb + 48);
|
|
}
|
|
}
|
|
|
|
static void karatsuba_eval(__m256i *b_eval, const __m256i *b) {
|
|
__m256i r0_avx, r1_avx, r2_avx, r3_avx;
|
|
|
|
//-------1st poly----------------------------------------------------
|
|
r0_avx = b[0 * L + 0];
|
|
r1_avx = b[0 * L + 1];
|
|
r2_avx = b[0 * L + 2];
|
|
r3_avx = b[0 * L + 3];
|
|
|
|
b_eval[0] = r0_avx;
|
|
b_eval[1] = r1_avx;
|
|
b_eval[2] = r2_avx;
|
|
b_eval[3] = r3_avx;
|
|
b_eval[4] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
b_eval[5] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
b_eval[6] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
b_eval[7] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
b_eval[8] = _mm256_add_epi16(b_eval[6], b_eval[7]);
|
|
|
|
//-------2nd poly----------------------------------------------------
|
|
r0_avx = b[1 * L + 0];
|
|
r1_avx = b[1 * L + 1];
|
|
r2_avx = b[1 * L + 2];
|
|
r3_avx = b[1 * L + 3];
|
|
|
|
b_eval[0 + 9] = r0_avx;
|
|
b_eval[1 + 9] = r1_avx;
|
|
b_eval[2 + 9] = r2_avx;
|
|
b_eval[3 + 9] = r3_avx;
|
|
b_eval[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
b_eval[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
b_eval[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
b_eval[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
b_eval[8 + 9] = _mm256_add_epi16(b_eval[6 + 9], b_eval[7 + 9]);
|
|
|
|
//-------3rd poly----------------------------------------------------
|
|
r0_avx = b[2 * L + 0];
|
|
r1_avx = b[2 * L + 1];
|
|
r2_avx = b[2 * L + 2];
|
|
r3_avx = b[2 * L + 3];
|
|
|
|
b_eval[0 + 18] = r0_avx;
|
|
b_eval[1 + 18] = r1_avx;
|
|
b_eval[2 + 18] = r2_avx;
|
|
b_eval[3 + 18] = r3_avx;
|
|
b_eval[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
b_eval[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
b_eval[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
b_eval[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
b_eval[8 + 18] = _mm256_add_epi16(b_eval[6 + 18], b_eval[7 + 18]);
|
|
|
|
//-------4th poly----------------------------------------------------
|
|
r0_avx = b[3 * L + 0];
|
|
r1_avx = b[3 * L + 1];
|
|
r2_avx = b[3 * L + 2];
|
|
r3_avx = b[3 * L + 3];
|
|
|
|
b_eval[0 + 27] = r0_avx;
|
|
b_eval[1 + 27] = r1_avx;
|
|
b_eval[2 + 27] = r2_avx;
|
|
b_eval[3 + 27] = r3_avx;
|
|
b_eval[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
b_eval[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
b_eval[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
b_eval[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
b_eval[8 + 27] = _mm256_add_epi16(b_eval[6 + 27], b_eval[7 + 27]);
|
|
|
|
//-------5th poly----------------------------------------------------
|
|
r0_avx = b[4 * L + 0];
|
|
r1_avx = b[4 * L + 1];
|
|
r2_avx = b[4 * L + 2];
|
|
r3_avx = b[4 * L + 3];
|
|
|
|
b_eval[0 + 36] = r0_avx;
|
|
b_eval[1 + 36] = r1_avx;
|
|
b_eval[2 + 36] = r2_avx;
|
|
b_eval[3 + 36] = r3_avx;
|
|
b_eval[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
b_eval[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
b_eval[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
b_eval[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
b_eval[8 + 36] = _mm256_add_epi16(b_eval[6 + 36], b_eval[7 + 36]);
|
|
|
|
//-------6th poly----------------------------------------------------
|
|
r0_avx = b[5 * L + 0];
|
|
r1_avx = b[5 * L + 1];
|
|
r2_avx = b[5 * L + 2];
|
|
r3_avx = b[5 * L + 3];
|
|
|
|
b_eval[0 + 45] = r0_avx;
|
|
b_eval[1 + 45] = r1_avx;
|
|
b_eval[2 + 45] = r2_avx;
|
|
b_eval[3 + 45] = r3_avx;
|
|
b_eval[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
b_eval[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
b_eval[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
b_eval[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
b_eval[8 + 45] = _mm256_add_epi16(b_eval[6 + 45], b_eval[7 + 45]);
|
|
|
|
//-------7th poly----------------------------------------------------
|
|
r0_avx = b[6 * L + 0];
|
|
r1_avx = b[6 * L + 1];
|
|
r2_avx = b[6 * L + 2];
|
|
r3_avx = b[6 * L + 3];
|
|
|
|
b_eval[0 + 54] = r0_avx;
|
|
b_eval[1 + 54] = r1_avx;
|
|
b_eval[2 + 54] = r2_avx;
|
|
b_eval[3 + 54] = r3_avx;
|
|
b_eval[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
|
|
b_eval[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
|
|
b_eval[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
|
|
b_eval[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
|
|
b_eval[8 + 54] = _mm256_add_epi16(b_eval[6 + 54], b_eval[7 + 54]);
|
|
|
|
//--------------Evaluating B poly ends-------------------------------
|
|
transpose(b_eval);
|
|
transpose(b_eval + 16);
|
|
transpose(b_eval + 32);
|
|
transpose(b_eval + 48);
|
|
}
|
|
|
|
static void karatsuba_interp(__m256i *result_final0, __m256i *result_final1, __m256i *result_final2, __m256i *result_final3, __m256i *result_final4, __m256i *result_final5, __m256i *result_final6, const __m256i *c_eval) {
|
|
__m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
|
|
__m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
|
|
|
|
//------------------------AVX interpolation for 1st poly external-------------------
|
|
res_avx0 = c_eval[0];
|
|
res_avx2 = c_eval[1];
|
|
res_avx4 = c_eval[2];
|
|
res_avx6 = c_eval[3];
|
|
c6_avx = c_eval[6];
|
|
c7_avx = c_eval[7];
|
|
|
|
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[8], c6_avx), c7_avx);
|
|
|
|
res_avx1 = c_eval[16];
|
|
res_avx3 = c_eval[17];
|
|
res_avx5 = c_eval[18];
|
|
res_avx7 = c_eval[19];
|
|
c22_avx = c_eval[22];
|
|
c23_avx = c_eval[23];
|
|
|
|
c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[21], res_avx5), res_avx7);
|
|
c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[24], c22_avx), c23_avx);
|
|
c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[20], res_avx1), res_avx3);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[5], res_avx4), res_avx6);
|
|
res_avx5 = _mm256_add_epi16(res_avx5, temp);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[4], res_avx0), res_avx2);
|
|
res_avx1 = _mm256_add_epi16(res_avx1, temp);
|
|
c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
|
|
res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
|
|
res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
|
|
c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
|
|
c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
|
|
c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
|
|
c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
|
|
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
|
|
|
|
result_final0[0] = res_avx0;
|
|
result_final0[1] = res_avx1;
|
|
result_final0[2] = _mm256_add_epi16(res_avx2, c6_avx);
|
|
result_final0[3] = _mm256_add_epi16(res_avx3, c22_avx);
|
|
result_final0[4] = _mm256_add_epi16(res_avx4, c7_avx);
|
|
result_final0[5] = _mm256_add_epi16(res_avx5, c23_avx);
|
|
result_final0[6] = res_avx6;
|
|
result_final0[7] = res_avx7;
|
|
//------------------------AVX interpolation for 1st poly ends--------------
|
|
|
|
|
|
//------------------------AVX interpolation for 2nd poly external-------------------
|
|
res_avx0 = c_eval[9]; //c_eval0
|
|
res_avx2 = c_eval[10]; //c_eval1
|
|
res_avx4 = c_eval[11]; //c_eval2
|
|
res_avx6 = c_eval[12]; //c_eval3
|
|
c6_avx = c_eval[15]; //c_eval6
|
|
c7_avx = c_eval[32]; //c_eval7
|
|
|
|
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[33], c6_avx), c7_avx);
|
|
|
|
res_avx1 = c_eval[25]; //c_eval0
|
|
res_avx3 = c_eval[26]; //c_eval1
|
|
res_avx5 = c_eval[27]; //c_eval2
|
|
res_avx7 = c_eval[28]; //c_eval3
|
|
c22_avx = c_eval[31];
|
|
c23_avx = c_eval[48];
|
|
|
|
c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[30], res_avx5), res_avx7);
|
|
c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[49], c22_avx), c23_avx);
|
|
c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[29], res_avx1), res_avx3);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[14], res_avx4), res_avx6);
|
|
res_avx5 = _mm256_add_epi16(res_avx5, temp);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[13], res_avx0), res_avx2);
|
|
res_avx1 = _mm256_add_epi16(res_avx1, temp);
|
|
c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
|
|
res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
|
|
res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
|
|
c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
|
|
c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
|
|
c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
|
|
c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
|
|
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
|
|
|
|
result_final1[0] = res_avx0;
|
|
result_final1[1] = res_avx1;
|
|
result_final1[2] = _mm256_add_epi16(res_avx2, c6_avx);
|
|
result_final1[3] = _mm256_add_epi16(res_avx3, c22_avx);
|
|
result_final1[4] = _mm256_add_epi16(res_avx4, c7_avx);
|
|
result_final1[5] = _mm256_add_epi16(res_avx5, c23_avx);
|
|
result_final1[6] = res_avx6;
|
|
result_final1[7] = res_avx7;
|
|
//------------------------AVX interpolation for 2nd poly ends--------------
|
|
|
|
//------------------------AVX interpolation for 3rd poly external-------------------
|
|
res_avx0 = c_eval[34]; //c_eval0
|
|
res_avx2 = c_eval[35]; //c_eval1
|
|
res_avx4 = c_eval[36];
|
|
res_avx6 = c_eval[37];
|
|
c6_avx = c_eval[40];
|
|
c7_avx = c_eval[41];
|
|
|
|
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[42], c6_avx), c7_avx);
|
|
|
|
res_avx1 = c_eval[50]; //c_eval0
|
|
res_avx3 = c_eval[51]; //c_eval1
|
|
res_avx5 = c_eval[52];
|
|
res_avx7 = c_eval[53];
|
|
c22_avx = c_eval[56];
|
|
c23_avx = c_eval[57];
|
|
|
|
c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[55], res_avx5), res_avx7);
|
|
c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[58], c22_avx), c23_avx);
|
|
c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[54], res_avx1), res_avx3);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[39], res_avx4), res_avx6);
|
|
res_avx5 = _mm256_add_epi16(res_avx5, temp);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[38], res_avx0), res_avx2);
|
|
res_avx1 = _mm256_add_epi16(res_avx1, temp);
|
|
c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
|
|
res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
|
|
res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
|
|
c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
|
|
c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
|
|
c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
|
|
c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
|
|
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
|
|
|
|
result_final2[0] = res_avx0;
|
|
result_final2[1] = res_avx1;
|
|
result_final2[2] = _mm256_add_epi16(res_avx2, c6_avx);
|
|
result_final2[3] = _mm256_add_epi16(res_avx3, c22_avx);
|
|
result_final2[4] = _mm256_add_epi16(res_avx4, c7_avx);
|
|
result_final2[5] = _mm256_add_epi16(res_avx5, c23_avx);
|
|
result_final2[6] = res_avx6;
|
|
result_final2[7] = res_avx7;
|
|
//------------------------AVX interpolation for 3rd poly ends--------------
|
|
|
|
//------------------------AVX interpolation for 4th poly external-------------------
|
|
res_avx0 = c_eval[43];
|
|
res_avx2 = c_eval[44];
|
|
res_avx4 = c_eval[45];
|
|
res_avx6 = c_eval[46];
|
|
c6_avx = c_eval[65];
|
|
c7_avx = c_eval[66];
|
|
|
|
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[67], c6_avx), c7_avx);
|
|
|
|
res_avx1 = c_eval[59];
|
|
res_avx3 = c_eval[60];
|
|
res_avx5 = c_eval[61];
|
|
res_avx7 = c_eval[62];
|
|
c22_avx = c_eval[81];
|
|
c23_avx = c_eval[82];
|
|
|
|
c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[80], res_avx5), res_avx7);
|
|
c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[83], c22_avx), c23_avx);
|
|
c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[63], res_avx1), res_avx3);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[64], res_avx4), res_avx6);
|
|
res_avx5 = _mm256_add_epi16(res_avx5, temp);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[47], res_avx0), res_avx2);
|
|
res_avx1 = _mm256_add_epi16(res_avx1, temp);
|
|
c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
|
|
res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
|
|
res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
|
|
c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
|
|
c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
|
|
c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
|
|
c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
|
|
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
|
|
|
|
result_final3[0] = res_avx0;
|
|
result_final3[1] = res_avx1;
|
|
result_final3[2] = _mm256_add_epi16(res_avx2, c6_avx);
|
|
result_final3[3] = _mm256_add_epi16(res_avx3, c22_avx);
|
|
result_final3[4] = _mm256_add_epi16(res_avx4, c7_avx);
|
|
result_final3[5] = _mm256_add_epi16(res_avx5, c23_avx);
|
|
result_final3[6] = res_avx6;
|
|
result_final3[7] = res_avx7;
|
|
//------------------------AVX interpolation for 4th poly ends--------------
|
|
|
|
//------------------------AVX interpolation for 5th poly external-------------------
|
|
res_avx0 = c_eval[68];
|
|
res_avx2 = c_eval[69];
|
|
res_avx4 = c_eval[70];
|
|
res_avx6 = c_eval[71];
|
|
c6_avx = c_eval[74];
|
|
c7_avx = c_eval[75];
|
|
|
|
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[76], c6_avx), c7_avx);
|
|
|
|
res_avx1 = c_eval[84];
|
|
res_avx3 = c_eval[85];
|
|
res_avx5 = c_eval[86];
|
|
res_avx7 = c_eval[87];
|
|
c22_avx = c_eval[90];
|
|
c23_avx = c_eval[91];
|
|
|
|
c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[89], res_avx5), res_avx7);
|
|
c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[92], c22_avx), c23_avx);
|
|
c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[88], res_avx1), res_avx3);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[73], res_avx4), res_avx6);
|
|
res_avx5 = _mm256_add_epi16(res_avx5, temp);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[72], res_avx0), res_avx2);
|
|
res_avx1 = _mm256_add_epi16(res_avx1, temp);
|
|
c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
|
|
res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
|
|
res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
|
|
c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
|
|
c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
|
|
c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
|
|
c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
|
|
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
|
|
|
|
result_final4[0] = res_avx0;
|
|
result_final4[1] = res_avx1;
|
|
result_final4[2] = _mm256_add_epi16(res_avx2, c6_avx);
|
|
result_final4[3] = _mm256_add_epi16(res_avx3, c22_avx);
|
|
result_final4[4] = _mm256_add_epi16(res_avx4, c7_avx);
|
|
result_final4[5] = _mm256_add_epi16(res_avx5, c23_avx);
|
|
result_final4[6] = res_avx6;
|
|
result_final4[7] = res_avx7;
|
|
//------------------------AVX interpolation for 5th poly ends--------------
|
|
|
|
//------------------------AVX interpolation for 6th poly external-------------------
|
|
res_avx0 = c_eval[77];
|
|
res_avx2 = c_eval[78];
|
|
res_avx4 = c_eval[79];
|
|
res_avx6 = c_eval[96];
|
|
c6_avx = c_eval[99];
|
|
c7_avx = c_eval[100];
|
|
|
|
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[101], c6_avx), c7_avx);
|
|
|
|
res_avx1 = c_eval[93];
|
|
res_avx3 = c_eval[94];
|
|
res_avx5 = c_eval[95];
|
|
res_avx7 = c_eval[112];
|
|
c22_avx = c_eval[115];
|
|
c23_avx = c_eval[116];
|
|
|
|
c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[114], res_avx5), res_avx7);
|
|
c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[117], c22_avx), c23_avx);
|
|
c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[113], res_avx1), res_avx3);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[98], res_avx4), res_avx6);
|
|
res_avx5 = _mm256_add_epi16(res_avx5, temp);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[97], res_avx0), res_avx2);
|
|
res_avx1 = _mm256_add_epi16(res_avx1, temp);
|
|
c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
|
|
res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
|
|
res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
|
|
c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
|
|
c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
|
|
c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
|
|
c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
|
|
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
|
|
|
|
result_final5[0] = res_avx0;
|
|
result_final5[1] = res_avx1;
|
|
result_final5[2] = _mm256_add_epi16(res_avx2, c6_avx);
|
|
result_final5[3] = _mm256_add_epi16(res_avx3, c22_avx);
|
|
result_final5[4] = _mm256_add_epi16(res_avx4, c7_avx);
|
|
result_final5[5] = _mm256_add_epi16(res_avx5, c23_avx);
|
|
result_final5[6] = res_avx6;
|
|
result_final5[7] = res_avx7;
|
|
//------------------------AVX interpolation for 6th poly ends--------------
|
|
|
|
//------------------------AVX interpolation for 7th poly external-------------------
|
|
res_avx0 = c_eval[102];
|
|
res_avx2 = c_eval[103];
|
|
res_avx4 = c_eval[104];
|
|
res_avx6 = c_eval[105];
|
|
c6_avx = c_eval[108];
|
|
c7_avx = c_eval[109];
|
|
|
|
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[110], c6_avx), c7_avx);
|
|
|
|
res_avx1 = c_eval[118];
|
|
res_avx3 = c_eval[119];
|
|
res_avx5 = c_eval[120];
|
|
res_avx7 = c_eval[121];
|
|
c22_avx = c_eval[124];
|
|
c23_avx = c_eval[125];
|
|
|
|
c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[123], res_avx5), res_avx7);
|
|
c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[126], c22_avx), c23_avx);
|
|
c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[122], res_avx1), res_avx3);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[107], res_avx4), res_avx6);
|
|
res_avx5 = _mm256_add_epi16(res_avx5, temp);
|
|
temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[106], res_avx0), res_avx2);
|
|
res_avx1 = _mm256_add_epi16(res_avx1, temp);
|
|
c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
|
|
res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
|
|
res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
|
|
c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
|
|
c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
|
|
c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
|
|
c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
|
|
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
|
|
|
|
result_final6[0] = res_avx0;
|
|
result_final6[1] = res_avx1;
|
|
result_final6[2] = _mm256_add_epi16(res_avx2, c6_avx);
|
|
result_final6[3] = _mm256_add_epi16(res_avx3, c22_avx);
|
|
result_final6[4] = _mm256_add_epi16(res_avx4, c7_avx);
|
|
result_final6[5] = _mm256_add_epi16(res_avx5, c23_avx);
|
|
result_final6[6] = res_avx6;
|
|
result_final6[7] = res_avx7;
|
|
//------------------------AVX interpolation for 7th poly ends--------------
|
|
}
|
|
|
|
void PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a, const toom4_points *b_eval, int accumulate) {
|
|
size_t i;
|
|
__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
|
|
__m256i aw_avx[7 * L];
|
|
__m256i *va = (__m256i *)a->coeffs;
|
|
|
|
for (i = 0; i < L; i++) {
|
|
r0_avx = va[0 * L + i];
|
|
r1_avx = va[1 * L + i];
|
|
r2_avx = va[2 * L + i];
|
|
r3_avx = va[3 * L + i];
|
|
r4_avx = _mm256_add_epi16(r0_avx, r2_avx);
|
|
r5_avx = _mm256_add_epi16(r1_avx, r3_avx);
|
|
aw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx);
|
|
aw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx);
|
|
r4_avx = _mm256_slli_epi16(r0_avx, 2);
|
|
r4_avx = _mm256_add_epi16(r4_avx, r2_avx);
|
|
r4_avx = _mm256_slli_epi16(r4_avx, 1);
|
|
r5_avx = _mm256_slli_epi16(r1_avx, 2);
|
|
r5_avx = _mm256_add_epi16(r5_avx, r3_avx);
|
|
aw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx);
|
|
aw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx);
|
|
r4_avx = _mm256_slli_epi16(r3_avx, 3);
|
|
r6_avx = _mm256_slli_epi16(r2_avx, 2);
|
|
r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
|
|
r6_avx = _mm256_slli_epi16(r1_avx, 1);
|
|
r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
|
|
aw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx);
|
|
aw_avx[6 * L + i] = r0_avx;
|
|
aw_avx[0 * L + i] = r3_avx;
|
|
}
|
|
|
|
batch_64coefficient_multiplications(c_eval, aw_avx, b_eval, accumulate);
|
|
}
|
|
|
|
void PQCLEAN_LIGHTSABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b) {
|
|
size_t i;
|
|
__m256i bw_avx[7 * L];
|
|
__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
|
|
__m256i *vb = (__m256i *)b->coeffs;
|
|
__m256i *vb_eval = (__m256i *)b_eval->coeffs;
|
|
|
|
for (i = 0; i < L; i++) {
|
|
r0_avx = vb[0 * L + i];
|
|
r1_avx = vb[1 * L + i];
|
|
r2_avx = vb[2 * L + i];
|
|
r3_avx = vb[3 * L + i];
|
|
r4_avx = _mm256_add_epi16(r0_avx, r2_avx);
|
|
r5_avx = _mm256_add_epi16(r1_avx, r3_avx);
|
|
bw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx);
|
|
bw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx);
|
|
r4_avx = _mm256_slli_epi16(r0_avx, 2);
|
|
r4_avx = _mm256_add_epi16(r4_avx, r2_avx);
|
|
r4_avx = _mm256_slli_epi16(r4_avx, 1);
|
|
r5_avx = _mm256_slli_epi16(r1_avx, 2);
|
|
r5_avx = _mm256_add_epi16(r5_avx, r3_avx);
|
|
bw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx);
|
|
bw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx);
|
|
r4_avx = _mm256_slli_epi16(r3_avx, 3);
|
|
r6_avx = _mm256_slli_epi16(r2_avx, 2);
|
|
r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
|
|
r6_avx = _mm256_slli_epi16(r1_avx, 1);
|
|
r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
|
|
bw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx);
|
|
bw_avx[6 * L + i] = r0_avx;
|
|
bw_avx[0 * L + i] = r3_avx;
|
|
}
|
|
|
|
karatsuba_eval(vb_eval, bw_avx);
|
|
}
|
|
|
|
|
|
void PQCLEAN_LIGHTSABER_AVX2_toom4_interp(poly *res, const toom4_points_product *c_eval) {
|
|
size_t i;
|
|
__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
|
|
__m256i w1_avx[2 * L], w2_avx[2 * L], w3_avx[2 * L], w4_avx[2 * L], w5_avx[2 * L], w6_avx[2 * L], w7_avx[2 * L];
|
|
__m256i res_full[32];
|
|
__m256i *vc = (__m256i *)c_eval->coeffs;
|
|
__m256i *vres = (__m256i *)res->coeffs;
|
|
|
|
transpose(vc);
|
|
transpose(vc + 16);
|
|
transpose(vc + 32);
|
|
transpose(vc + 48);
|
|
transpose(vc + 64);
|
|
transpose(vc + 80);
|
|
transpose(vc + 96);
|
|
transpose(vc + 112);
|
|
|
|
karatsuba_interp(w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx, vc);
|
|
|
|
for (i = 0; i < 2 * L; i++) {
|
|
r0_avx = w1_avx[i];
|
|
r1_avx = w2_avx[i];
|
|
r2_avx = w3_avx[i];
|
|
r3_avx = w4_avx[i];
|
|
r4_avx = w5_avx[i];
|
|
r5_avx = w6_avx[i];
|
|
r6_avx = w7_avx[i];
|
|
|
|
r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
|
|
r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
|
|
r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
|
|
r3_avx = _mm256_srli_epi16(r3_avx, 1);
|
|
r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
|
|
temp_avx = _mm256_slli_epi16(r6_avx, 6);
|
|
|
|
r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
|
|
r4_avx = _mm256_slli_epi16(r4_avx, 1);
|
|
r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
|
|
r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
|
|
temp_avx = _mm256_slli_epi16(r2_avx, 6);
|
|
|
|
r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
|
|
r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
|
|
r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
|
|
r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
|
|
temp_avx = _mm256_mullo_epi16(r2_avx, _mm256_set1_epi16(45));
|
|
|
|
r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
|
|
temp_avx = _mm256_slli_epi16(r2_avx, 3);
|
|
|
|
r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
|
|
r4_avx = _mm256_mullo_epi16(r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
|
|
r4_avx = _mm256_srli_epi16(r4_avx, 3);
|
|
r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
|
|
temp_avx = _mm256_slli_epi16(r3_avx, 4);
|
|
|
|
r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
|
|
r1_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
|
|
r1_avx = _mm256_srli_epi16(r1_avx, 1);
|
|
r3_avx = _mm256_add_epi16(r1_avx, r3_avx);
|
|
r3_avx = _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx);
|
|
temp_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(30));
|
|
temp_avx = _mm256_sub_epi16(temp_avx, r5_avx);
|
|
temp_avx = _mm256_mullo_epi16(temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
|
|
|
|
r5_avx = _mm256_srli_epi16(temp_avx, 2);
|
|
r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
|
|
r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
|
|
|
|
if (i < L) {
|
|
res_full[0 * L + i] = r6_avx;
|
|
res_full[1 * L + i] = r5_avx;
|
|
res_full[2 * L + i] = r4_avx;
|
|
res_full[3 * L + i] = r3_avx;
|
|
res_full[4 * L + i] = r2_avx;
|
|
res_full[5 * L + i] = r1_avx;
|
|
res_full[6 * L + i] = r0_avx;
|
|
} else {
|
|
res_full[0 * L + i] = _mm256_add_epi16(res_full[0 * L + i], r6_avx);
|
|
res_full[1 * L + i] = _mm256_add_epi16(res_full[1 * L + i], r5_avx);
|
|
res_full[2 * L + i] = _mm256_add_epi16(res_full[2 * L + i], r4_avx);
|
|
res_full[3 * L + i] = _mm256_add_epi16(res_full[3 * L + i], r3_avx);
|
|
res_full[4 * L + i] = _mm256_add_epi16(res_full[4 * L + i], r2_avx);
|
|
res_full[5 * L + i] = _mm256_add_epi16(res_full[5 * L + i], r1_avx);
|
|
res_full[6 * L + i] = r0_avx;
|
|
}
|
|
}
|
|
|
|
// Reduction by X^256 + 1
|
|
for (i = 0; i < 16; i++) {
|
|
vres[i] = _mm256_sub_epi16(res_full[i], res_full[i + 16]);
|
|
}
|
|
}
|