pqc/common/keccak4x/KeccakP-1600-times4-SIMD256.c
Thom Wiggers f4bd312180 Adds AVX2 variants of Kyber512, Kyber768, Kyber1024 (#225)
* Integrate Kyber-AVX2 into PQClean

* Fix types and formatting in Kyber

* Workaround a valgrind crash

* Remove comment in shuffle.s

* Remove some extraneous truncations

* fixup! Fix types and formatting in Kyber
2019-09-10 11:45:01 +02:00

1036 lines
39 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
For more information, feedback or questions, please refer to our website:
https://keccak.team/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
---
This file implements Keccak-p[1600]×4 in a PlSnP-compatible way.
Please refer to PlSnP-documentation.h for more details.
This implementation comes with KeccakP-1600-times4-SnP.h in the same folder.
Please refer to LowLevel.build for the exact list of other files it must be combined with.
*/
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "align.h"
#include "KeccakP-1600-times4-SnP.h"
#include "SIMD256-config.h"
#include "brg_endian.h"
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif
typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;
//#define UseGatherScatter
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
#if defined(KeccakP1600times4_useAVX2)
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
#define CONST256(a) _mm256_load_si256((const V256 *)&(a))
#define CONST256_64(a) _mm256_set1_epi64x(a)
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128i(&(ah), &(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) _mm256_permute2f128_si256((a), (b), c)
#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c))
#define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
lanesH01 = UNPACKH( lanes0, lanes1 ), \
lanesL23 = UNPACKL( lanes2, lanes3 ), \
lanesH23 = UNPACKH( lanes2, lanes3 ), \
lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
#define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
#endif
#define SnP_laneLengthInBytes 8
void KeccakP1600times4_InitializeAll(void *states)
{
memset(states, 0, KeccakP1600times4_statesSizeInBytes);
}
void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
unsigned int sizeLeft = length;
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
const unsigned char *curData = data;
UINT64 *statesAsLanes = (UINT64 *)states;
if ((sizeLeft > 0) && (offsetInLane != 0)) {
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
UINT64 lane = 0;
if (bytesInLane > sizeLeft)
bytesInLane = sizeLeft;
memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
sizeLeft -= bytesInLane;
lanePosition++;
curData += bytesInLane;
}
while(sizeLeft >= SnP_laneLengthInBytes) {
UINT64 lane = *((const UINT64*)curData);
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
sizeLeft -= SnP_laneLengthInBytes;
lanePosition++;
curData += SnP_laneLengthInBytes;
}
if (sizeLeft > 0) {
UINT64 lane = 0;
memcpy(&lane, curData, sizeLeft);
statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
}
}
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
V256 *stateAsLanes = (V256 *)states;
unsigned int i;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
lanes1 = LOAD256u( curData1[argIndex]),\
lanes2 = LOAD256u( curData2[argIndex]),\
lanes3 = LOAD256u( curData3[argIndex]),\
INTLEAVE(),\
XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
XOReq256( stateAsLanes[argIndex+3], lanes3 )
if ( laneCount >= 16 ) {
Xor_In4( 0 );
Xor_In4( 4 );
Xor_In4( 8 );
Xor_In4( 12 );
if ( laneCount >= 20 ) {
Xor_In4( 16 );
for(i=20; i<laneCount; i++)
Xor_In( i );
}
else {
for(i=16; i<laneCount; i++)
Xor_In( i );
}
}
else {
for(i=0; i<laneCount; i++)
Xor_In( i );
}
#undef Xor_In
#undef Xor_In4
}
void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
unsigned int sizeLeft = length;
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
const unsigned char *curData = data;
UINT64 *statesAsLanes = (UINT64 *)states;
if ((sizeLeft > 0) && (offsetInLane != 0)) {
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
if (bytesInLane > sizeLeft)
bytesInLane = sizeLeft;
memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
sizeLeft -= bytesInLane;
lanePosition++;
curData += bytesInLane;
}
while(sizeLeft >= SnP_laneLengthInBytes) {
UINT64 lane = *((const UINT64*)curData);
statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
sizeLeft -= SnP_laneLengthInBytes;
lanePosition++;
curData += SnP_laneLengthInBytes;
}
if (sizeLeft > 0) {
memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
}
}
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
V256 *stateAsLanes = (V256 *)states;
unsigned int i;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
lanes1 = LOAD256u( curData1[argIndex]),\
lanes2 = LOAD256u( curData2[argIndex]),\
lanes3 = LOAD256u( curData3[argIndex]),\
INTLEAVE(),\
STORE256( stateAsLanes[argIndex+0], lanes0 ),\
STORE256( stateAsLanes[argIndex+1], lanes1 ),\
STORE256( stateAsLanes[argIndex+2], lanes2 ),\
STORE256( stateAsLanes[argIndex+3], lanes3 )
if ( laneCount >= 16 ) {
OverWr4( 0 );
OverWr4( 4 );
OverWr4( 8 );
OverWr4( 12 );
if ( laneCount >= 20 ) {
OverWr4( 16 );
for(i=20; i<laneCount; i++)
OverWr( i );
}
else {
for(i=16; i<laneCount; i++)
OverWr( i );
}
}
else {
for(i=0; i<laneCount; i++)
OverWr( i );
}
#undef OverWr
#undef OverWr4
}
void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
{
unsigned int sizeLeft = byteCount;
unsigned int lanePosition = 0;
UINT64 *statesAsLanes = (UINT64 *)states;
while(sizeLeft >= SnP_laneLengthInBytes) {
statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
sizeLeft -= SnP_laneLengthInBytes;
lanePosition++;
}
if (sizeLeft > 0) {
memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
}
}
void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
{
unsigned int sizeLeft = length;
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
unsigned char *curData = data;
const UINT64 *statesAsLanes = (const UINT64 *)states;
if ((sizeLeft > 0) && (offsetInLane != 0)) {
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
if (bytesInLane > sizeLeft)
bytesInLane = sizeLeft;
memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
sizeLeft -= bytesInLane;
lanePosition++;
curData += bytesInLane;
}
while(sizeLeft >= SnP_laneLengthInBytes) {
*(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
sizeLeft -= SnP_laneLengthInBytes;
lanePosition++;
curData += SnP_laneLengthInBytes;
}
if (sizeLeft > 0) {
memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
}
}
void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
UINT64 *curData0 = (UINT64 *)data;
UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
const V256 *stateAsLanes = (const V256 *)states;
const UINT64 *stateAsLanes64 = (const UINT64*)states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
unsigned int i;
#define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]
#define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
UNINTLEAVE(), \
STORE256u( curData0[argIndex], lanes0 ), \
STORE256u( curData1[argIndex], lanes1 ), \
STORE256u( curData2[argIndex], lanes2 ), \
STORE256u( curData3[argIndex], lanes3 )
if ( laneCount >= 16 ) {
Extr4( 0 );
Extr4( 4 );
Extr4( 8 );
Extr4( 12 );
if ( laneCount >= 20 ) {
Extr4( 16 );
for(i=20; i<laneCount; i++)
Extr( i );
}
else {
for(i=16; i<laneCount; i++)
Extr( i );
}
}
else {
for(i=0; i<laneCount; i++)
Extr( i );
}
#undef Extr
#undef Extr4
}
void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
unsigned int sizeLeft = length;
unsigned int lanePosition = offset/SnP_laneLengthInBytes;
unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
const unsigned char *curInput = input;
unsigned char *curOutput = output;
const UINT64 *statesAsLanes = (const UINT64 *)states;
if ((sizeLeft > 0) && (offsetInLane != 0)) {
unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
if (bytesInLane > sizeLeft)
bytesInLane = sizeLeft;
sizeLeft -= bytesInLane;
do {
*(curOutput++) = *(curInput++) ^ (unsigned char)lane;
lane >>= 8;
} while ( --bytesInLane != 0);
lanePosition++;
}
while(sizeLeft >= SnP_laneLengthInBytes) {
*((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
sizeLeft -= SnP_laneLengthInBytes;
lanePosition++;
curInput += SnP_laneLengthInBytes;
curOutput += SnP_laneLengthInBytes;
}
if (sizeLeft != 0) {
UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
do {
*(curOutput++) = *(curInput++) ^ (unsigned char)lane;
lane >>= 8;
} while ( --sizeLeft != 0);
}
}
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
const UINT64 *curInput0 = (UINT64 *)input;
const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
UINT64 *curOutput0 = (UINT64 *)output;
UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
const V256 *stateAsLanes = (const V256 *)states;
const UINT64 *stateAsLanes64 = (const UINT64*)states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
unsigned int i;
#define ExtrXor( argIndex ) \
curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
#define ExtrXor4( argIndex ) \
lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
UNINTLEAVE(),\
lanesL01 = LOAD256u( curInput0[argIndex]),\
lanesH01 = LOAD256u( curInput1[argIndex]),\
lanesL23 = LOAD256u( curInput2[argIndex]),\
lanesH23 = LOAD256u( curInput3[argIndex]),\
XOReq256( lanes0, lanesL01 ),\
XOReq256( lanes1, lanesH01 ),\
XOReq256( lanes2, lanesL23 ),\
XOReq256( lanes3, lanesH23 ),\
STORE256u( curOutput0[argIndex], lanes0 ),\
STORE256u( curOutput1[argIndex], lanes1 ),\
STORE256u( curOutput2[argIndex], lanes2 ),\
STORE256u( curOutput3[argIndex], lanes3 )
if ( laneCount >= 16 ) {
ExtrXor4( 0 );
ExtrXor4( 4 );
ExtrXor4( 8 );
ExtrXor4( 12 );
if ( laneCount >= 20 ) {
ExtrXor4( 16 );
for(i=20; i<laneCount; i++)
ExtrXor( i );
}
else {
for(i=16; i<laneCount; i++)
ExtrXor( i );
}
}
else {
for(i=0; i<laneCount; i++)
ExtrXor( i );
}
#undef ExtrXor
#undef ExtrXor4
}
#define declareABCDE \
V256 Aba, Abe, Abi, Abo, Abu; \
V256 Aga, Age, Agi, Ago, Agu; \
V256 Aka, Ake, Aki, Ako, Aku; \
V256 Ama, Ame, Ami, Amo, Amu; \
V256 Asa, Ase, Asi, Aso, Asu; \
V256 Bba, Bbe, Bbi, Bbo, Bbu; \
V256 Bga, Bge, Bgi, Bgo, Bgu; \
V256 Bka, Bke, Bki, Bko, Bku; \
V256 Bma, Bme, Bmi, Bmo, Bmu; \
V256 Bsa, Bse, Bsi, Bso, Bsu; \
V256 Ca, Ce, Ci, Co, Cu; \
V256 Ca1, Ce1, Ci1, Co1, Cu1; \
V256 Da, De, Di, Do, Du; \
V256 Eba, Ebe, Ebi, Ebo, Ebu; \
V256 Ega, Ege, Egi, Ego, Egu; \
V256 Eka, Eke, Eki, Eko, Eku; \
V256 Ema, Eme, Emi, Emo, Emu; \
V256 Esa, Ese, Esi, Eso, Esu; \
#define prepareTheta \
Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
ROL64in256(Ce1, Ce, 1); \
Da = XOR256(Cu, Ce1); \
ROL64in256(Ci1, Ci, 1); \
De = XOR256(Ca, Ci1); \
ROL64in256(Co1, Co, 1); \
Di = XOR256(Ce, Co1); \
ROL64in256(Cu1, Cu, 1); \
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
ROL64in256(Bbe, A##ge, 44); \
XOReq256(A##ki, Di); \
ROL64in256(Bbi, A##ki, 43); \
E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
Ca = E##ba; \
XOReq256(A##mo, Do); \
ROL64in256(Bbo, A##mo, 21); \
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
Ce = E##be; \
XOReq256(A##su, Du); \
ROL64in256(Bbu, A##su, 14); \
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
Ci = E##bi; \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
Co = E##bo; \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
Cu = E##bu; \
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
ROL64in256(Bge, A##gu, 20); \
XOReq256(A##ka, Da); \
ROL64in256(Bgi, A##ka, 3); \
E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
XOReq256(Ca, E##ga); \
XOReq256(A##me, De); \
ROL64in256(Bgo, A##me, 45); \
E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
XOReq256(Ce, E##ge); \
XOReq256(A##si, Di); \
ROL64in256(Bgu, A##si, 61); \
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
XOReq256(Ci, E##gi); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
XOReq256(Co, E##go); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
XOReq256(Cu, E##gu); \
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
ROL64in256(Bke, A##gi, 6); \
XOReq256(A##ko, Do); \
ROL64in256(Bki, A##ko, 25); \
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
XOReq256(Ca, E##ka); \
XOReq256(A##mu, Du); \
ROL64in256_8(Bko, A##mu); \
E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
XOReq256(Ce, E##ke); \
XOReq256(A##sa, Da); \
ROL64in256(Bku, A##sa, 18); \
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
XOReq256(Ci, E##ki); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
XOReq256(Co, E##ko); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
XOReq256(Cu, E##ku); \
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
ROL64in256(Bme, A##ga, 36); \
XOReq256(A##ke, De); \
ROL64in256(Bmi, A##ke, 10); \
E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
XOReq256(Ca, E##ma); \
XOReq256(A##mi, Di); \
ROL64in256(Bmo, A##mi, 15); \
E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
XOReq256(Ce, E##me); \
XOReq256(A##so, Do); \
ROL64in256_56(Bmu, A##so); \
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
XOReq256(Ci, E##mi); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
XOReq256(Co, E##mo); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
XOReq256(Cu, E##mu); \
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
ROL64in256(Bse, A##go, 55); \
XOReq256(A##ku, Du); \
ROL64in256(Bsi, A##ku, 39); \
E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
XOReq256(Ca, E##sa); \
XOReq256(A##ma, Da); \
ROL64in256(Bso, A##ma, 41); \
E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
XOReq256(Ce, E##se); \
XOReq256(A##se, De); \
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
XOReq256(Ci, E##si); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
XOReq256(Co, E##so); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
XOReq256(Cu, E##su); \
\
/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIota(i, A, E) \
ROL64in256(Ce1, Ce, 1); \
Da = XOR256(Cu, Ce1); \
ROL64in256(Ci1, Ci, 1); \
De = XOR256(Ca, Ci1); \
ROL64in256(Co1, Co, 1); \
Di = XOR256(Ce, Co1); \
ROL64in256(Cu1, Cu, 1); \
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
ROL64in256(Bbe, A##ge, 44); \
XOReq256(A##ki, Di); \
ROL64in256(Bbi, A##ki, 43); \
E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
XOReq256(A##mo, Do); \
ROL64in256(Bbo, A##mo, 21); \
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
XOReq256(A##su, Du); \
ROL64in256(Bbu, A##su, 14); \
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
ROL64in256(Bge, A##gu, 20); \
XOReq256(A##ka, Da); \
ROL64in256(Bgi, A##ka, 3); \
E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
XOReq256(A##me, De); \
ROL64in256(Bgo, A##me, 45); \
E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
XOReq256(A##si, Di); \
ROL64in256(Bgu, A##si, 61); \
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
ROL64in256(Bke, A##gi, 6); \
XOReq256(A##ko, Do); \
ROL64in256(Bki, A##ko, 25); \
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
XOReq256(A##mu, Du); \
ROL64in256_8(Bko, A##mu); \
E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
XOReq256(A##sa, Da); \
ROL64in256(Bku, A##sa, 18); \
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
ROL64in256(Bme, A##ga, 36); \
XOReq256(A##ke, De); \
ROL64in256(Bmi, A##ke, 10); \
E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
XOReq256(A##mi, Di); \
ROL64in256(Bmo, A##mi, 15); \
E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
XOReq256(A##so, Do); \
ROL64in256_56(Bmu, A##so); \
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
ROL64in256(Bse, A##go, 55); \
XOReq256(A##ku, Du); \
ROL64in256(Bsi, A##ku, 39); \
E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
XOReq256(A##ma, Da); \
ROL64in256(Bso, A##ma, 41); \
E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
XOReq256(A##se, De); \
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
\
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
0x0000000000000001ULL,
0x0000000000008082ULL,
0x800000000000808aULL,
0x8000000080008000ULL,
0x000000000000808bULL,
0x0000000080000001ULL,
0x8000000080008081ULL,
0x8000000000008009ULL,
0x000000000000008aULL,
0x0000000000000088ULL,
0x0000000080008009ULL,
0x000000008000000aULL,
0x000000008000808bULL,
0x800000000000008bULL,
0x8000000000008089ULL,
0x8000000000008003ULL,
0x8000000000008002ULL,
0x8000000000000080ULL,
0x000000000000800aULL,
0x800000008000000aULL,
0x8000000080008081ULL,
0x8000000000008080ULL,
0x0000000080000001ULL,
0x8000000080008008ULL};
#define copyFromState(X, state) \
X##ba = LOAD256(state[ 0]); \
X##be = LOAD256(state[ 1]); \
X##bi = LOAD256(state[ 2]); \
X##bo = LOAD256(state[ 3]); \
X##bu = LOAD256(state[ 4]); \
X##ga = LOAD256(state[ 5]); \
X##ge = LOAD256(state[ 6]); \
X##gi = LOAD256(state[ 7]); \
X##go = LOAD256(state[ 8]); \
X##gu = LOAD256(state[ 9]); \
X##ka = LOAD256(state[10]); \
X##ke = LOAD256(state[11]); \
X##ki = LOAD256(state[12]); \
X##ko = LOAD256(state[13]); \
X##ku = LOAD256(state[14]); \
X##ma = LOAD256(state[15]); \
X##me = LOAD256(state[16]); \
X##mi = LOAD256(state[17]); \
X##mo = LOAD256(state[18]); \
X##mu = LOAD256(state[19]); \
X##sa = LOAD256(state[20]); \
X##se = LOAD256(state[21]); \
X##si = LOAD256(state[22]); \
X##so = LOAD256(state[23]); \
X##su = LOAD256(state[24]); \
#define copyToState(state, X) \
STORE256(state[ 0], X##ba); \
STORE256(state[ 1], X##be); \
STORE256(state[ 2], X##bi); \
STORE256(state[ 3], X##bo); \
STORE256(state[ 4], X##bu); \
STORE256(state[ 5], X##ga); \
STORE256(state[ 6], X##ge); \
STORE256(state[ 7], X##gi); \
STORE256(state[ 8], X##go); \
STORE256(state[ 9], X##gu); \
STORE256(state[10], X##ka); \
STORE256(state[11], X##ke); \
STORE256(state[12], X##ki); \
STORE256(state[13], X##ko); \
STORE256(state[14], X##ku); \
STORE256(state[15], X##ma); \
STORE256(state[16], X##me); \
STORE256(state[17], X##mi); \
STORE256(state[18], X##mo); \
STORE256(state[19], X##mu); \
STORE256(state[20], X##sa); \
STORE256(state[21], X##se); \
STORE256(state[22], X##si); \
STORE256(state[23], X##so); \
STORE256(state[24], X##su); \
#define copyStateVariables(X, Y) \
X##ba = Y##ba; \
X##be = Y##be; \
X##bi = Y##bi; \
X##bo = Y##bo; \
X##bu = Y##bu; \
X##ga = Y##ga; \
X##ge = Y##ge; \
X##gi = Y##gi; \
X##go = Y##go; \
X##gu = Y##gu; \
X##ka = Y##ka; \
X##ke = Y##ke; \
X##ki = Y##ki; \
X##ko = Y##ko; \
X##ku = Y##ku; \
X##ma = Y##ma; \
X##me = Y##me; \
X##mi = Y##mi; \
X##mo = Y##mo; \
X##mu = Y##mu; \
X##sa = Y##sa; \
X##se = Y##se; \
X##si = Y##si; \
X##so = Y##so; \
X##su = Y##su; \
#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros"
void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
unsigned int i;
#endif
copyFromState(A, statesAsLanes)
rounds24
copyToState(statesAsLanes, A)
}
void KeccakP1600times4_PermuteAll_12rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
unsigned int i;
#endif
copyFromState(A, statesAsLanes)
rounds12
copyToState(statesAsLanes, A)
}
size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
if (laneCount == 21) {
#if 0
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
V256 *stateAsLanes = (V256 *)states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define Xor_In( argIndex ) \
XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define Xor_In4( argIndex ) \
lanes0 = LOAD256u( curData0[argIndex]),\
lanes1 = LOAD256u( curData1[argIndex]),\
lanes2 = LOAD256u( curData2[argIndex]),\
lanes3 = LOAD256u( curData3[argIndex]),\
INTLEAVE(),\
XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
XOReq256( stateAsLanes[argIndex+3], lanes3 )
Xor_In4( 0 );
Xor_In4( 4 );
Xor_In4( 8 );
Xor_In4( 12 );
Xor_In4( 16 );
Xor_In( 20 );
#undef Xor_In
#undef Xor_In4
KeccakP1600times4_PermuteAll_24rounds(states);
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
return (const unsigned char *)curData0 - dataStart;
#else
// unsigned int i;
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
V256 *statesAsLanes = (V256 *)states;
declareABCDE
copyFromState(A, statesAsLanes)
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
#define XOR_In( Xxx, argIndex ) \
XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
XOR_In( Aba, 0 );
XOR_In( Abe, 1 );
XOR_In( Abi, 2 );
XOR_In( Abo, 3 );
XOR_In( Abu, 4 );
XOR_In( Aga, 5 );
XOR_In( Age, 6 );
XOR_In( Agi, 7 );
XOR_In( Ago, 8 );
XOR_In( Agu, 9 );
XOR_In( Aka, 10 );
XOR_In( Ake, 11 );
XOR_In( Aki, 12 );
XOR_In( Ako, 13 );
XOR_In( Aku, 14 );
XOR_In( Ama, 15 );
XOR_In( Ame, 16 );
XOR_In( Ami, 17 );
XOR_In( Amo, 18 );
XOR_In( Amu, 19 );
XOR_In( Asa, 20 );
#undef XOR_In
rounds24
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
copyToState(statesAsLanes, A)
return (const unsigned char *)curData0 - dataStart;
#endif
}
else {
// unsigned int i;
const unsigned char *dataStart = data;
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
KeccakP1600times4_PermuteAll_24rounds(states);
data += laneOffsetSerial*8;
dataByteLen -= laneOffsetSerial*8;
}
return data - dataStart;
}
}
size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
if (laneCount == 21) {
#if 0
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
V256 *stateAsLanes = states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define Xor_In( argIndex ) \
XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define Xor_In4( argIndex ) \
lanes0 = LOAD256u( curData0[argIndex]),\
lanes1 = LOAD256u( curData1[argIndex]),\
lanes2 = LOAD256u( curData2[argIndex]),\
lanes3 = LOAD256u( curData3[argIndex]),\
INTLEAVE(),\
XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
XOReq256( stateAsLanes[argIndex+3], lanes3 )
Xor_In4( 0 );
Xor_In4( 4 );
Xor_In4( 8 );
Xor_In4( 12 );
Xor_In4( 16 );
Xor_In( 20 );
#undef Xor_In
#undef Xor_In4
KeccakP1600times4_PermuteAll_12rounds(states);
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
return (const unsigned char *)curData0 - dataStart;
#else
// unsigned int i;
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
V256 *statesAsLanes = states;
declareABCDE
copyFromState(A, statesAsLanes)
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
#define XOR_In( Xxx, argIndex ) \
XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
XOR_In( Aba, 0 );
XOR_In( Abe, 1 );
XOR_In( Abi, 2 );
XOR_In( Abo, 3 );
XOR_In( Abu, 4 );
XOR_In( Aga, 5 );
XOR_In( Age, 6 );
XOR_In( Agi, 7 );
XOR_In( Ago, 8 );
XOR_In( Agu, 9 );
XOR_In( Aka, 10 );
XOR_In( Ake, 11 );
XOR_In( Aki, 12 );
XOR_In( Ako, 13 );
XOR_In( Aku, 14 );
XOR_In( Ama, 15 );
XOR_In( Ame, 16 );
XOR_In( Ami, 17 );
XOR_In( Amo, 18 );
XOR_In( Amu, 19 );
XOR_In( Asa, 20 );
#undef XOR_In
rounds12
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
copyToState(statesAsLanes, A)
return (const unsigned char *)curData0 - dataStart;
#endif
}
else {
// unsigned int i;
const unsigned char *dataStart = data;
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
KeccakP1600times4_PermuteAll_12rounds(states);
data += laneOffsetSerial*8;
dataByteLen -= laneOffsetSerial*8;
}
return data - dataStart;
}
}