/*
Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".

For more information, feedback or questions, please refer to our website:
https://keccak.team/

To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/

---

This file implements Keccak-p[1600]×4 in a PlSnP-compatible way.
Please refer to PlSnP-documentation.h for more details.

This implementation comes with KeccakP-1600-times4-SnP.h in the same folder.
Please refer to LowLevel.build for the exact list of other files it must be combined with.
*/

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "align.h"
#include "KeccakP-1600-times4-SnP.h"
#include "SIMD256-config.h"

#include "brg_endian.h"
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif

typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;

//#define UseGatherScatter
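/* The four parallel states are stored interleaved, lane by lane: the 64-bit lane at
   position lanePosition of instance instanceIndex lives at 64-bit word index
   lanePosition*4 + instanceIndex, so one lane position of all four instances occupies
   a single 256-bit word. */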
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)

#if defined(KeccakP1600times4_useAVX2)
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
#define CONST256(a) _mm256_load_si256((const V256 *)&(a))
#define CONST256_64(a) _mm256_set1_epi64x(a)
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128i(&(ah), &(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) _mm256_permute2f128_si256((a), (b), c)
#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c))
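/* INTLEAVE() turns four 256-bit registers, each holding four consecutive lanes of one
   instance, into the interleaved layout (one register per lane position, holding that
   lane of all four instances); UNINTLEAVE() is the inverse. Both use only 64-bit
   unpacks, 128-bit lane permutes and 64-bit shuffles. */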
#define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
                     lanesH01 = UNPACKH( lanes0, lanes1 ), \
                     lanesL23 = UNPACKL( lanes2, lanes3 ), \
                     lanesH23 = UNPACKH( lanes2, lanes3 ), \
                     lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
                     lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
                     lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
                     lanes3 = PERM128( lanesH01, lanesH23, 0x31 )

#define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
                   lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
                   lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
                   lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
                   lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
                   lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
                   lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
                   lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )

#endif

#define SnP_laneLengthInBytes 8

void KeccakP1600times4_InitializeAll(void *states)
{
    memset(states, 0, KeccakP1600times4_statesSizeInBytes);
}
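/* The single-instance, byte-oriented functions below address the state selected by
   instanceIndex; each handles a possibly unaligned head (offset not a multiple of 8),
   then whole 8-byte lanes, then a partial trailing lane. */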
void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = 0;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane = *((const UINT64*)curData);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        UINT64 lane = 0;
        memcpy(&lane, curData, sizeLeft);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
    }
}
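/* The *LanesAll functions process whole lanes of all four instances at once; the data
   for instance n starts at data + n*laneOffset*8, i.e. laneOffset is the stride, in
   lanes, between the four streams. Groups of four lane positions are handled with
   256-bit loads plus INTLEAVE()/UNINTLEAVE(); leftover positions fall back to
   gathering one 64-bit lane per instance. */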
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
                                lanes1 = LOAD256u( curData1[argIndex]),\
                                lanes2 = LOAD256u( curData2[argIndex]),\
                                lanes3 = LOAD256u( curData3[argIndex]),\
                                INTLEAVE(),\
                                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                                XOReq256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 ) {
        Xor_In4( 0 );
        Xor_In4( 4 );
        Xor_In4( 8 );
        Xor_In4( 12 );
        if ( laneCount >= 20 ) {
            Xor_In4( 16 );
            for(i=20; i<laneCount; i++)
                Xor_In( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Xor_In( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Xor_In( i );
    }
    #undef Xor_In
    #undef Xor_In4
}
void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane = *((const UINT64*)curData);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
    }
}
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
                                lanes1 = LOAD256u( curData1[argIndex]),\
                                lanes2 = LOAD256u( curData2[argIndex]),\
                                lanes3 = LOAD256u( curData3[argIndex]),\
                                INTLEAVE(),\
                                STORE256( stateAsLanes[argIndex+0], lanes0 ),\
                                STORE256( stateAsLanes[argIndex+1], lanes1 ),\
                                STORE256( stateAsLanes[argIndex+2], lanes2 ),\
                                STORE256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 ) {
        OverWr4( 0 );
        OverWr4( 4 );
        OverWr4( 8 );
        OverWr4( 12 );
        if ( laneCount >= 20 ) {
            OverWr4( 16 );
            for(i=20; i<laneCount; i++)
                OverWr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                OverWr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            OverWr( i );
    }
    #undef OverWr
    #undef OverWr4
}
void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
{
    unsigned int sizeLeft = byteCount;
    unsigned int lanePosition = 0;
    UINT64 *statesAsLanes = (UINT64 *)states;

    while(sizeLeft >= SnP_laneLengthInBytes) {
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
    }

    if (sizeLeft > 0) {
        memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
    }
}
void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    unsigned char *curData = data;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
    }
}
void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    UINT64 *curData0 = (UINT64 *)data;
    UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);

    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
                             curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
                             curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
                             curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]

    #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
                              lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
                              lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
                              lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
                              UNINTLEAVE(), \
                              STORE256u( curData0[argIndex], lanes0 ), \
                              STORE256u( curData1[argIndex], lanes1 ), \
                              STORE256u( curData2[argIndex], lanes2 ), \
                              STORE256u( curData3[argIndex], lanes3 )

    if ( laneCount >= 16 ) {
        Extr4( 0 );
        Extr4( 4 );
        Extr4( 8 );
        Extr4( 12 );
        if ( laneCount >= 20 ) {
            Extr4( 16 );
            for(i=20; i<laneCount; i++)
                Extr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Extr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Extr( i );
    }
    #undef Extr
    #undef Extr4
}
void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curInput = input;
    unsigned char *curOutput = output;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        sizeLeft -= bytesInLane;
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --bytesInLane != 0);
        lanePosition++;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curInput += SnP_laneLengthInBytes;
        curOutput += SnP_laneLengthInBytes;
    }

    if (sizeLeft != 0) {
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --sizeLeft != 0);
    }
}
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
    const UINT64 *curInput0 = (UINT64 *)input;
    const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
    const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
    UINT64 *curOutput0 = (UINT64 *)output;
    UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);

    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define ExtrXor( argIndex ) \
        curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
        curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
        curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
        curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]

    #define ExtrXor4( argIndex ) \
        lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
        lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
        lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
        lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
        UNINTLEAVE(),\
        lanesL01 = LOAD256u( curInput0[argIndex]),\
        lanesH01 = LOAD256u( curInput1[argIndex]),\
        lanesL23 = LOAD256u( curInput2[argIndex]),\
        lanesH23 = LOAD256u( curInput3[argIndex]),\
        XOReq256( lanes0, lanesL01 ),\
        XOReq256( lanes1, lanesH01 ),\
        XOReq256( lanes2, lanesL23 ),\
        XOReq256( lanes3, lanesH23 ),\
        STORE256u( curOutput0[argIndex], lanes0 ),\
        STORE256u( curOutput1[argIndex], lanes1 ),\
        STORE256u( curOutput2[argIndex], lanes2 ),\
        STORE256u( curOutput3[argIndex], lanes3 )

    if ( laneCount >= 16 ) {
        ExtrXor4( 0 );
        ExtrXor4( 4 );
        ExtrXor4( 8 );
        ExtrXor4( 12 );
        if ( laneCount >= 20 ) {
            ExtrXor4( 16 );
            for(i=20; i<laneCount; i++)
                ExtrXor( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                ExtrXor( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            ExtrXor( i );
    }
    #undef ExtrXor
    #undef ExtrXor4
}
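/* Register naming, as in the Keccak reference implementations: A holds the current
   state, B the lanes after rho and pi, C the column parities, D the theta effect,
   E the state after the round; the two-letter suffix identifies the lane. */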
#define declareABCDE \
    V256 Aba, Abe, Abi, Abo, Abu; \
    V256 Aga, Age, Agi, Ago, Agu; \
    V256 Aka, Ake, Aki, Ako, Aku; \
    V256 Ama, Ame, Ami, Amo, Amu; \
    V256 Asa, Ase, Asi, Aso, Asu; \
    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
    V256 Bga, Bge, Bgi, Bgo, Bgu; \
    V256 Bka, Bke, Bki, Bko, Bku; \
    V256 Bma, Bme, Bmi, Bmo, Bmu; \
    V256 Bsa, Bse, Bsi, Bso, Bsu; \
    V256 Ca, Ce, Ci, Co, Cu; \
    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
    V256 Da, De, Di, Do, Du; \
    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
    V256 Ega, Ege, Egi, Ego, Egu; \
    V256 Eka, Eke, Eki, Eko, Eku; \
    V256 Ema, Eme, Emi, Emo, Emu; \
    V256 Esa, Ese, Esi, Eso, Esu; \

#define prepareTheta \
    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \

/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
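/* One complete round on all four interleaved instances. Besides computing E from A,
   this variant also accumulates the column parities Ca..Cu of the new state, so the
   next round's theta step needs no separate prepareTheta pass; thetaRhoPiChiIota
   further below is the same round without that bookkeeping, intended for the final
   round of the unrolled sequence. */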
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    ROL64in256(Ce1, Ce, 1); \
    Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); \
    De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); \
    Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); \
    Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); \
    Du = XOR256(Co, Ca1); \
\
    XOReq256(A##ba, Da); \
    Bba = A##ba; \
    XOReq256(A##ge, De); \
    ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); \
    ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq256(A##mo, Do); \
    ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    Ce = E##be; \
    XOReq256(A##su, Du); \
    ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    Ci = E##bi; \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    Co = E##bo; \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
    Cu = E##bu; \
\
    XOReq256(A##bo, Do); \
    ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); \
    ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); \
    ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(Ca, E##ga); \
    XOReq256(A##me, De); \
    ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(Ce, E##ge); \
    XOReq256(A##si, Di); \
    ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    XOReq256(Ci, E##gi); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    XOReq256(Co, E##go); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
    XOReq256(Cu, E##gu); \
\
    XOReq256(A##be, De); \
    ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); \
    ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); \
    ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(Ca, E##ka); \
    XOReq256(A##mu, Du); \
    ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(Ce, E##ke); \
    XOReq256(A##sa, Da); \
    ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    XOReq256(Ci, E##ki); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    XOReq256(Co, E##ko); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
    XOReq256(Cu, E##ku); \
\
    XOReq256(A##bu, Du); \
    ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); \
    ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); \
    ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(Ca, E##ma); \
    XOReq256(A##mi, Di); \
    ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(Ce, E##me); \
    XOReq256(A##so, Do); \
    ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    XOReq256(Ci, E##mi); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    XOReq256(Co, E##mo); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
    XOReq256(Cu, E##mu); \
\
    XOReq256(A##bi, Di); \
    ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); \
    ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); \
    ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(Ca, E##sa); \
    XOReq256(A##ma, Da); \
    ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(Ce, E##se); \
    XOReq256(A##se, De); \
    ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    XOReq256(Ci, E##si); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    XOReq256(Co, E##so); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
    XOReq256(Cu, E##su); \
\

/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIota(i, A, E) \
    ROL64in256(Ce1, Ce, 1); \
    Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); \
    De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); \
    Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); \
    Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); \
    Du = XOR256(Co, Ca1); \
\
    XOReq256(A##ba, Da); \
    Bba = A##ba; \
    XOReq256(A##ge, De); \
    ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); \
    ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    XOReq256(A##mo, Do); \
    ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    XOReq256(A##su, Du); \
    ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
    XOReq256(A##bo, Do); \
    ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); \
    ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); \
    ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(A##me, De); \
    ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(A##si, Di); \
    ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
    XOReq256(A##be, De); \
    ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); \
    ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); \
    ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(A##mu, Du); \
    ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(A##sa, Da); \
    ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
    XOReq256(A##bu, Du); \
    ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); \
    ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); \
    ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(A##mi, Di); \
    ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(A##so, Do); \
    ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
    XOReq256(A##bi, Di); \
    ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); \
    ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); \
    ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(A##ma, Da); \
    ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(A##se, De); \
    ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
\

static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL,
    0x000000000000800aULL,
    0x800000008000000aULL,
    0x8000000080008081ULL,
    0x8000000000008080ULL,
    0x0000000080000001ULL,
    0x8000000080008008ULL};
#define copyFromState(X, state) \
    X##ba = LOAD256(state[ 0]); \
    X##be = LOAD256(state[ 1]); \
    X##bi = LOAD256(state[ 2]); \
    X##bo = LOAD256(state[ 3]); \
    X##bu = LOAD256(state[ 4]); \
    X##ga = LOAD256(state[ 5]); \
    X##ge = LOAD256(state[ 6]); \
    X##gi = LOAD256(state[ 7]); \
    X##go = LOAD256(state[ 8]); \
    X##gu = LOAD256(state[ 9]); \
    X##ka = LOAD256(state[10]); \
    X##ke = LOAD256(state[11]); \
    X##ki = LOAD256(state[12]); \
    X##ko = LOAD256(state[13]); \
    X##ku = LOAD256(state[14]); \
    X##ma = LOAD256(state[15]); \
    X##me = LOAD256(state[16]); \
    X##mi = LOAD256(state[17]); \
    X##mo = LOAD256(state[18]); \
    X##mu = LOAD256(state[19]); \
    X##sa = LOAD256(state[20]); \
    X##se = LOAD256(state[21]); \
    X##si = LOAD256(state[22]); \
    X##so = LOAD256(state[23]); \
    X##su = LOAD256(state[24]); \

#define copyToState(state, X) \
    STORE256(state[ 0], X##ba); \
    STORE256(state[ 1], X##be); \
    STORE256(state[ 2], X##bi); \
    STORE256(state[ 3], X##bo); \
    STORE256(state[ 4], X##bu); \
    STORE256(state[ 5], X##ga); \
    STORE256(state[ 6], X##ge); \
    STORE256(state[ 7], X##gi); \
    STORE256(state[ 8], X##go); \
    STORE256(state[ 9], X##gu); \
    STORE256(state[10], X##ka); \
    STORE256(state[11], X##ke); \
    STORE256(state[12], X##ki); \
    STORE256(state[13], X##ko); \
    STORE256(state[14], X##ku); \
    STORE256(state[15], X##ma); \
    STORE256(state[16], X##me); \
    STORE256(state[17], X##mi); \
    STORE256(state[18], X##mo); \
    STORE256(state[19], X##mu); \
    STORE256(state[20], X##sa); \
    STORE256(state[21], X##se); \
    STORE256(state[22], X##si); \
    STORE256(state[23], X##so); \
    STORE256(state[24], X##su); \

#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su; \

#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros"
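/* Apply the permutation to all four states at once: 24 rounds for Keccak-f[1600],
   12 rounds for Keccak-p[1600, 12] (as used by KangarooTwelve). The round sequences
   rounds24 and rounds12 are defined in KeccakP-1600-unrolling.macros. */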
void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds24
    copyToState(statesAsLanes, A)
}

void KeccakP1600times4_PermuteAll_12rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds12
    copyToState(statesAsLanes, A)
}
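/* Absorb as many full blocks as possible into all four states and return the number of
   input bytes actually processed. laneOffsetParallel is the lane stride between the
   four input streams, laneOffsetSerial the stride between successive blocks of one
   stream. The 21-lane case (a 1344-bit rate, e.g. SHAKE128) is special-cased so the
   state can stay in registers across consecutive blocks. */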
size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
    if (laneCount == 21) {
#if 0
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            V256 *stateAsLanes = (V256 *)states;
            V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
            #define Xor_In( argIndex ) \
                XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            #define Xor_In4( argIndex ) \
                lanes0 = LOAD256u( curData0[argIndex]),\
                lanes1 = LOAD256u( curData1[argIndex]),\
                lanes2 = LOAD256u( curData2[argIndex]),\
                lanes3 = LOAD256u( curData3[argIndex]),\
                INTLEAVE(),\
                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                XOReq256( stateAsLanes[argIndex+3], lanes3 )
            Xor_In4( 0 );
            Xor_In4( 4 );
            Xor_In4( 8 );
            Xor_In4( 12 );
            Xor_In4( 16 );
            Xor_In( 20 );
            #undef Xor_In
            #undef Xor_In4
            KeccakP1600times4_PermuteAll_24rounds(states);
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        return (const unsigned char *)curData0 - dataStart;
#else
        // unsigned int i;
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        V256 *statesAsLanes = (V256 *)states;
        declareABCDE

        copyFromState(A, statesAsLanes)
        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            #define XOR_In( Xxx, argIndex ) \
                XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            XOR_In( Aba, 0 );
            XOR_In( Abe, 1 );
            XOR_In( Abi, 2 );
            XOR_In( Abo, 3 );
            XOR_In( Abu, 4 );
            XOR_In( Aga, 5 );
            XOR_In( Age, 6 );
            XOR_In( Agi, 7 );
            XOR_In( Ago, 8 );
            XOR_In( Agu, 9 );
            XOR_In( Aka, 10 );
            XOR_In( Ake, 11 );
            XOR_In( Aki, 12 );
            XOR_In( Ako, 13 );
            XOR_In( Aku, 14 );
            XOR_In( Ama, 15 );
            XOR_In( Ame, 16 );
            XOR_In( Ami, 17 );
            XOR_In( Amo, 18 );
            XOR_In( Amu, 19 );
            XOR_In( Asa, 20 );
            #undef XOR_In
            rounds24
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        copyToState(statesAsLanes, A)
        return (const unsigned char *)curData0 - dataStart;
#endif
    }
    else {
        // unsigned int i;
        const unsigned char *dataStart = data;

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
            KeccakP1600times4_PermuteAll_24rounds(states);
            data += laneOffsetSerial*8;
            dataByteLen -= laneOffsetSerial*8;
        }
        return data - dataStart;
    }
}
size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
    if (laneCount == 21) {
#if 0
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            V256 *stateAsLanes = states;
            V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
            #define Xor_In( argIndex ) \
                XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            #define Xor_In4( argIndex ) \
                lanes0 = LOAD256u( curData0[argIndex]),\
                lanes1 = LOAD256u( curData1[argIndex]),\
                lanes2 = LOAD256u( curData2[argIndex]),\
                lanes3 = LOAD256u( curData3[argIndex]),\
                INTLEAVE(),\
                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                XOReq256( stateAsLanes[argIndex+3], lanes3 )
            Xor_In4( 0 );
            Xor_In4( 4 );
            Xor_In4( 8 );
            Xor_In4( 12 );
            Xor_In4( 16 );
            Xor_In( 20 );
            #undef Xor_In
            #undef Xor_In4
            KeccakP1600times4_PermuteAll_12rounds(states);
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        return (const unsigned char *)curData0 - dataStart;
#else
        // unsigned int i;
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        V256 *statesAsLanes = states;
        declareABCDE

        copyFromState(A, statesAsLanes)
        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            #define XOR_In( Xxx, argIndex ) \
                XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            XOR_In( Aba, 0 );
            XOR_In( Abe, 1 );
            XOR_In( Abi, 2 );
            XOR_In( Abo, 3 );
            XOR_In( Abu, 4 );
            XOR_In( Aga, 5 );
            XOR_In( Age, 6 );
            XOR_In( Agi, 7 );
            XOR_In( Ago, 8 );
            XOR_In( Agu, 9 );
            XOR_In( Aka, 10 );
            XOR_In( Ake, 11 );
            XOR_In( Aki, 12 );
            XOR_In( Ako, 13 );
            XOR_In( Aku, 14 );
            XOR_In( Ama, 15 );
            XOR_In( Ame, 16 );
            XOR_In( Ami, 17 );
            XOR_In( Amo, 18 );
            XOR_In( Amu, 19 );
            XOR_In( Asa, 20 );
            #undef XOR_In
            rounds12
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        copyToState(statesAsLanes, A)
        return (const unsigned char *)curData0 - dataStart;
#endif
    }
    else {
        // unsigned int i;
        const unsigned char *dataStart = data;

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
            KeccakP1600times4_PermuteAll_12rounds(states);
            data += laneOffsetSerial*8;
            dataByteLen -= laneOffsetSerial*8;
        }
        return data - dataStart;
    }
}