/*
Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".

For more information, feedback or questions, please refer to our website:
https://keccak.team/

To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/

---

This file implements Keccak-p[1600]×4 in a PlSnP-compatible way.
Please refer to PlSnP-documentation.h for more details.

This implementation comes with KeccakP-1600-times4-SnP.h in the same folder.
Please refer to LowLevel.build for the exact list of other files it must be combined with.
*/
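
/*
Illustrative usage sketch (not part of the original file): it assumes only the
PlSnP constants KeccakP1600times4_statesSizeInBytes and
KeccakP1600times4_statesAlignment declared in KeccakP-1600-times4-SnP.h, plus a
hypothetical 136-byte input block. It shows one absorb/permute/extract step on
instance 0 of the four parallel states.

    ALIGN(KeccakP1600times4_statesAlignment) unsigned char states[KeccakP1600times4_statesSizeInBytes];
    unsigned char block[136];
    unsigned char out[32];

    KeccakP1600times4_InitializeAll(states);
    KeccakP1600times4_AddBytes(states, 0, block, 0, sizeof(block));   // XOR data into instance 0
    KeccakP1600times4_PermuteAll_24rounds(states);                    // permute all four instances at once
    KeccakP1600times4_ExtractBytes(states, 0, out, 0, sizeof(out));   // read 32 bytes back from instance 0
*/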

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "align.h"
#include "KeccakP-1600-times4-SnP.h"
#include "SIMD256-config.h"

#include "brg_endian.h"
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif

typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;

//#define UseGatherScatter
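
/*
The four Keccak-p[1600] instances are stored interleaved: 64-bit lane
`lanePosition` of instance `instanceIndex` sits at 64-bit word
lanePosition*4 + instanceIndex, so each group of four consecutive words forms
one 256-bit vector holding the same lane of all four instances.
*/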

#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)

#if defined(KeccakP1600times4_useAVX2)
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
#define CONST256(a) _mm256_load_si256((const V256 *)&(a))
#define CONST256_64(a) _mm256_set1_epi64x(a)
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
/* CONST256() performs an aligned load, so these shuffle constants must be 32-byte aligned. */
static ALIGN(32) const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static ALIGN(32) const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128i(&(ah), &(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) _mm256_permute2f128_si256((a), (b), c)
#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c))
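
/*
INTLEAVE() transposes lanes0..lanes3 (four consecutive lanes read from one
instance each) into the interleaved layout where each vector holds one lane
position of all four instances; UNINTLEAVE() is the inverse transposition,
used when writing lanes back to per-instance buffers.
*/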

#define UNINTLEAVE()    lanesL01 = UNPACKL( lanes0, lanes1 ), \
                        lanesH01 = UNPACKH( lanes0, lanes1 ), \
                        lanesL23 = UNPACKL( lanes2, lanes3 ), \
                        lanesH23 = UNPACKH( lanes2, lanes3 ), \
                        lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
                        lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
                        lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
                        lanes3 = PERM128( lanesH01, lanesH23, 0x31 )

#define INTLEAVE()      lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
                        lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
                        lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
                        lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
                        lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
                        lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
                        lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
                        lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )

#endif

#define SnP_laneLengthInBytes 8

void KeccakP1600times4_InitializeAll(void *states)
{
    memset(states, 0, KeccakP1600times4_statesSizeInBytes);
}

void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = 0;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane = *((const UINT64*)curData);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        UINT64 lane = 0;
        memcpy(&lane, curData, sizeLeft);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
    }
}
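
/*
XOR laneCount lanes of input into all four instances at once; the four input
streams start laneOffset lanes apart in `data`. The laneCount >= 16 / >= 20
fast paths below process four lane positions per step via INTLEAVE().
*/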

void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    #define Xor_In( argIndex )  XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
                                lanes1 = LOAD256u( curData1[argIndex]),\
                                lanes2 = LOAD256u( curData2[argIndex]),\
                                lanes3 = LOAD256u( curData3[argIndex]),\
                                INTLEAVE(),\
                                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                                XOReq256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 ) {
        Xor_In4( 0 );
        Xor_In4( 4 );
        Xor_In4( 8 );
        Xor_In4( 12 );
        if ( laneCount >= 20 ) {
            Xor_In4( 16 );
            for(i=20; i<laneCount; i++)
                Xor_In( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Xor_In( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Xor_In( i );
    }

    #undef Xor_In
    #undef Xor_In4
}

void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane = *((const UINT64*)curData);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
    }
}

void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    #define OverWr( argIndex )  STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
                                lanes1 = LOAD256u( curData1[argIndex]),\
                                lanes2 = LOAD256u( curData2[argIndex]),\
                                lanes3 = LOAD256u( curData3[argIndex]),\
                                INTLEAVE(),\
                                STORE256( stateAsLanes[argIndex+0], lanes0 ),\
                                STORE256( stateAsLanes[argIndex+1], lanes1 ),\
                                STORE256( stateAsLanes[argIndex+2], lanes2 ),\
                                STORE256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 ) {
        OverWr4( 0 );
        OverWr4( 4 );
        OverWr4( 8 );
        OverWr4( 12 );
        if ( laneCount >= 20 ) {
            OverWr4( 16 );
            for(i=20; i<laneCount; i++)
                OverWr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                OverWr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            OverWr( i );
    }

    #undef OverWr
    #undef OverWr4
}

void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
{
    unsigned int sizeLeft = byteCount;
    unsigned int lanePosition = 0;
    UINT64 *statesAsLanes = (UINT64 *)states;

    while(sizeLeft >= SnP_laneLengthInBytes) {
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
    }

    if (sizeLeft > 0) {
        memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
    }
}

void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    unsigned char *curData = data;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    if (sizeLeft > 0) {
        memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
    }
}

void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    UINT64 *curData0 = (UINT64 *)data;
    UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define Extr( argIndex )    curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
                                curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
                                curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
                                curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]

    #define Extr4( argIndex )   lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
                                lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
                                lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
                                lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
                                UNINTLEAVE(), \
                                STORE256u( curData0[argIndex], lanes0 ), \
                                STORE256u( curData1[argIndex], lanes1 ), \
                                STORE256u( curData2[argIndex], lanes2 ), \
                                STORE256u( curData3[argIndex], lanes3 )

    if ( laneCount >= 16 ) {
        Extr4( 0 );
        Extr4( 4 );
        Extr4( 8 );
        Extr4( 12 );
        if ( laneCount >= 20 ) {
            Extr4( 16 );
            for(i=20; i<laneCount; i++)
                Extr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Extr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Extr( i );
    }

    #undef Extr
    #undef Extr4
}
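
/*
The ExtractAndAdd functions below read state bytes and XOR them into an input
stream, writing the result to the output stream, i.e. the extract-and-XOR
operation used when squeezed state bytes are combined with caller data.
*/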

void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curInput = input;
    unsigned char *curOutput = output;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        sizeLeft -= bytesInLane;
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --bytesInLane != 0);
        lanePosition++;
    }

    while(sizeLeft >= SnP_laneLengthInBytes) {
        *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curInput += SnP_laneLengthInBytes;
        curOutput += SnP_laneLengthInBytes;
    }

    if (sizeLeft != 0) {
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --sizeLeft != 0);
    }
}

void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
    const UINT64 *curInput0 = (UINT64 *)input;
    const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
    const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
    UINT64 *curOutput0 = (UINT64 *)output;
    UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define ExtrXor( argIndex ) \
                                curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
                                curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
                                curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
                                curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]

    #define ExtrXor4( argIndex ) \
                                lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
                                lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
                                lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
                                lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
                                UNINTLEAVE(),\
                                lanesL01 = LOAD256u( curInput0[argIndex]),\
                                lanesH01 = LOAD256u( curInput1[argIndex]),\
                                lanesL23 = LOAD256u( curInput2[argIndex]),\
                                lanesH23 = LOAD256u( curInput3[argIndex]),\
                                XOReq256( lanes0, lanesL01 ),\
                                XOReq256( lanes1, lanesH01 ),\
                                XOReq256( lanes2, lanesL23 ),\
                                XOReq256( lanes3, lanesH23 ),\
                                STORE256u( curOutput0[argIndex], lanes0 ),\
                                STORE256u( curOutput1[argIndex], lanes1 ),\
                                STORE256u( curOutput2[argIndex], lanes2 ),\
                                STORE256u( curOutput3[argIndex], lanes3 )

    if ( laneCount >= 16 ) {
        ExtrXor4( 0 );
        ExtrXor4( 4 );
        ExtrXor4( 8 );
        ExtrXor4( 12 );
        if ( laneCount >= 20 ) {
            ExtrXor4( 16 );
            for(i=20; i<laneCount; i++)
                ExtrXor( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                ExtrXor( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            ExtrXor( i );
    }

    #undef ExtrXor
    #undef ExtrXor4
}
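
/*
The macros below implement the round function on four states in parallel,
following the usual in-place Keccak naming: A holds the current state, C the
column parities, D the theta effects, B the state after rho and pi, and E the
state after chi and iota.
*/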

#define declareABCDE \
    V256 Aba, Abe, Abi, Abo, Abu; \
    V256 Aga, Age, Agi, Ago, Agu; \
    V256 Aka, Ake, Aki, Ako, Aku; \
    V256 Ama, Ame, Ami, Amo, Amu; \
    V256 Asa, Ase, Asi, Aso, Asu; \
    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
    V256 Bga, Bge, Bgi, Bgo, Bgu; \
    V256 Bka, Bke, Bki, Bko, Bku; \
    V256 Bma, Bme, Bmi, Bmo, Bmu; \
    V256 Bsa, Bse, Bsi, Bso, Bsu; \
    V256 Ca, Ce, Ci, Co, Cu; \
    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
    V256 Da, De, Di, Do, Du; \
    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
    V256 Ega, Ege, Egi, Ego, Egu; \
    V256 Eka, Eke, Eki, Eko, Eku; \
    V256 Ema, Eme, Emi, Emo, Emu; \
    V256 Esa, Ese, Esi, Eso, Esu;

#define prepareTheta \
    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu))));

/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    ROL64in256(Ce1, Ce, 1); \
    Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); \
    De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); \
    Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); \
    Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); \
    Du = XOR256(Co, Ca1); \
\
    XOReq256(A##ba, Da); \
    Bba = A##ba; \
    XOReq256(A##ge, De); \
    ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); \
    ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq256(A##mo, Do); \
    ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    Ce = E##be; \
    XOReq256(A##su, Du); \
    ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    Ci = E##bi; \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    Co = E##bo; \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
    Cu = E##bu; \
\
    XOReq256(A##bo, Do); \
    ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); \
    ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); \
    ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(Ca, E##ga); \
    XOReq256(A##me, De); \
    ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(Ce, E##ge); \
    XOReq256(A##si, Di); \
    ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    XOReq256(Ci, E##gi); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    XOReq256(Co, E##go); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
    XOReq256(Cu, E##gu); \
\
    XOReq256(A##be, De); \
    ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); \
    ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); \
    ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(Ca, E##ka); \
    XOReq256(A##mu, Du); \
    ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(Ce, E##ke); \
    XOReq256(A##sa, Da); \
    ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    XOReq256(Ci, E##ki); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    XOReq256(Co, E##ko); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
    XOReq256(Cu, E##ku); \
\
    XOReq256(A##bu, Du); \
    ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); \
    ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); \
    ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(Ca, E##ma); \
    XOReq256(A##mi, Di); \
    ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(Ce, E##me); \
    XOReq256(A##so, Do); \
    ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    XOReq256(Ci, E##mi); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    XOReq256(Co, E##mo); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
    XOReq256(Cu, E##mu); \
\
    XOReq256(A##bi, Di); \
    ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); \
    ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); \
    ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(Ca, E##sa); \
    XOReq256(A##ma, Da); \
    ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(Ce, E##se); \
    XOReq256(A##se, De); \
    ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    XOReq256(Ci, E##si); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    XOReq256(Co, E##so); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
    XOReq256(Cu, E##su);

/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIota(i, A, E) \
    ROL64in256(Ce1, Ce, 1); \
    Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); \
    De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); \
    Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); \
    Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); \
    Du = XOR256(Co, Ca1); \
\
    XOReq256(A##ba, Da); \
    Bba = A##ba; \
    XOReq256(A##ge, De); \
    ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); \
    ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    XOReq256(A##mo, Do); \
    ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    XOReq256(A##su, Du); \
    ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
    XOReq256(A##bo, Do); \
    ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); \
    ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); \
    ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(A##me, De); \
    ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(A##si, Di); \
    ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
    XOReq256(A##be, De); \
    ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); \
    ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); \
    ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(A##mu, Du); \
    ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(A##sa, Da); \
    ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
    XOReq256(A##bu, Du); \
    ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); \
    ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); \
    ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(A##mi, Di); \
    ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(A##so, Do); \
    ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
    XOReq256(A##bi, Di); \
    ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); \
    ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); \
    ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(A##ma, Da); \
    ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(A##se, De); \
    ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));

static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL,
    0x000000000000800aULL,
    0x800000008000000aULL,
    0x8000000080008081ULL,
    0x8000000000008080ULL,
    0x0000000080000001ULL,
    0x8000000080008008ULL};
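
/*
copyFromState(X, state) loads the 25 lane positions of the interleaved state
into the X register variables, copyToState(state, X) stores them back, and
copyStateVariables(X, Y) copies one register set to another.
*/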

#define copyFromState(X, state) \
    X##ba = LOAD256(state[ 0]); \
    X##be = LOAD256(state[ 1]); \
    X##bi = LOAD256(state[ 2]); \
    X##bo = LOAD256(state[ 3]); \
    X##bu = LOAD256(state[ 4]); \
    X##ga = LOAD256(state[ 5]); \
    X##ge = LOAD256(state[ 6]); \
    X##gi = LOAD256(state[ 7]); \
    X##go = LOAD256(state[ 8]); \
    X##gu = LOAD256(state[ 9]); \
    X##ka = LOAD256(state[10]); \
    X##ke = LOAD256(state[11]); \
    X##ki = LOAD256(state[12]); \
    X##ko = LOAD256(state[13]); \
    X##ku = LOAD256(state[14]); \
    X##ma = LOAD256(state[15]); \
    X##me = LOAD256(state[16]); \
    X##mi = LOAD256(state[17]); \
    X##mo = LOAD256(state[18]); \
    X##mu = LOAD256(state[19]); \
    X##sa = LOAD256(state[20]); \
    X##se = LOAD256(state[21]); \
    X##si = LOAD256(state[22]); \
    X##so = LOAD256(state[23]); \
    X##su = LOAD256(state[24]);

#define copyToState(state, X) \
    STORE256(state[ 0], X##ba); \
    STORE256(state[ 1], X##be); \
    STORE256(state[ 2], X##bi); \
    STORE256(state[ 3], X##bo); \
    STORE256(state[ 4], X##bu); \
    STORE256(state[ 5], X##ga); \
    STORE256(state[ 6], X##ge); \
    STORE256(state[ 7], X##gi); \
    STORE256(state[ 8], X##go); \
    STORE256(state[ 9], X##gu); \
    STORE256(state[10], X##ka); \
    STORE256(state[11], X##ke); \
    STORE256(state[12], X##ki); \
    STORE256(state[13], X##ko); \
    STORE256(state[14], X##ku); \
    STORE256(state[15], X##ma); \
    STORE256(state[16], X##me); \
    STORE256(state[17], X##mi); \
    STORE256(state[18], X##mo); \
    STORE256(state[19], X##mu); \
    STORE256(state[20], X##sa); \
    STORE256(state[21], X##se); \
    STORE256(state[22], X##si); \
    STORE256(state[23], X##so); \
    STORE256(state[24], X##su);

#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su;

#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros"

void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds24
    copyToState(statesAsLanes, A)
}

void KeccakP1600times4_PermuteAll_12rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds12
    copyToState(statesAsLanes, A)
}
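
/*
The FastLoop_Absorb functions absorb as many full laneCount-lane blocks as fit
in dataByteLen, permuting after each block, and return the number of bytes
consumed from the first (serial) input stream. The laneCount == 21 path keeps
the state in registers across blocks instead of reloading it each time.
*/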

size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
    if (laneCount == 21) {
#if 0
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            V256 *stateAsLanes = (V256 *)states;
            V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
            #define Xor_In( argIndex ) \
                XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            #define Xor_In4( argIndex ) \
                lanes0 = LOAD256u( curData0[argIndex]),\
                lanes1 = LOAD256u( curData1[argIndex]),\
                lanes2 = LOAD256u( curData2[argIndex]),\
                lanes3 = LOAD256u( curData3[argIndex]),\
                INTLEAVE(),\
                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                XOReq256( stateAsLanes[argIndex+3], lanes3 )
            Xor_In4( 0 );
            Xor_In4( 4 );
            Xor_In4( 8 );
            Xor_In4( 12 );
            Xor_In4( 16 );
            Xor_In( 20 );
            #undef Xor_In
            #undef Xor_In4
            KeccakP1600times4_PermuteAll_24rounds(states);
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        return (const unsigned char *)curData0 - dataStart;
#else
        // unsigned int i;
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        V256 *statesAsLanes = (V256 *)states;
        declareABCDE

        copyFromState(A, statesAsLanes)
        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            #define XOR_In( Xxx, argIndex ) \
                XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            XOR_In( Aba, 0 );
            XOR_In( Abe, 1 );
            XOR_In( Abi, 2 );
            XOR_In( Abo, 3 );
            XOR_In( Abu, 4 );
            XOR_In( Aga, 5 );
            XOR_In( Age, 6 );
            XOR_In( Agi, 7 );
            XOR_In( Ago, 8 );
            XOR_In( Agu, 9 );
            XOR_In( Aka, 10 );
            XOR_In( Ake, 11 );
            XOR_In( Aki, 12 );
            XOR_In( Ako, 13 );
            XOR_In( Aku, 14 );
            XOR_In( Ama, 15 );
            XOR_In( Ame, 16 );
            XOR_In( Ami, 17 );
            XOR_In( Amo, 18 );
            XOR_In( Amu, 19 );
            XOR_In( Asa, 20 );
            #undef XOR_In
            rounds24
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        copyToState(statesAsLanes, A)
        return (const unsigned char *)curData0 - dataStart;
#endif
    }
    else {
        // unsigned int i;
        const unsigned char *dataStart = data;

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
            KeccakP1600times4_PermuteAll_24rounds(states);
            data += laneOffsetSerial*8;
            dataByteLen -= laneOffsetSerial*8;
        }
        return data - dataStart;
    }
}

size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
    if (laneCount == 21) {
#if 0
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            V256 *stateAsLanes = (V256 *)states;
            V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
            #define Xor_In( argIndex ) \
                XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            #define Xor_In4( argIndex ) \
                lanes0 = LOAD256u( curData0[argIndex]),\
                lanes1 = LOAD256u( curData1[argIndex]),\
                lanes2 = LOAD256u( curData2[argIndex]),\
                lanes3 = LOAD256u( curData3[argIndex]),\
                INTLEAVE(),\
                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                XOReq256( stateAsLanes[argIndex+3], lanes3 )
            Xor_In4( 0 );
            Xor_In4( 4 );
            Xor_In4( 8 );
            Xor_In4( 12 );
            Xor_In4( 16 );
            Xor_In( 20 );
            #undef Xor_In
            #undef Xor_In4
            KeccakP1600times4_PermuteAll_12rounds(states);
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        return (const unsigned char *)curData0 - dataStart;
#else
        // unsigned int i;
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        V256 *statesAsLanes = (V256 *)states;
        declareABCDE

        copyFromState(A, statesAsLanes)
        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            #define XOR_In( Xxx, argIndex ) \
                XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            XOR_In( Aba, 0 );
            XOR_In( Abe, 1 );
            XOR_In( Abi, 2 );
            XOR_In( Abo, 3 );
            XOR_In( Abu, 4 );
            XOR_In( Aga, 5 );
            XOR_In( Age, 6 );
            XOR_In( Agi, 7 );
            XOR_In( Ago, 8 );
            XOR_In( Agu, 9 );
            XOR_In( Aka, 10 );
            XOR_In( Ake, 11 );
            XOR_In( Aki, 12 );
            XOR_In( Ako, 13 );
            XOR_In( Aku, 14 );
            XOR_In( Ama, 15 );
            XOR_In( Ame, 16 );
            XOR_In( Ami, 17 );
            XOR_In( Amo, 18 );
            XOR_In( Amu, 19 );
            XOR_In( Asa, 20 );
            #undef XOR_In
            rounds12
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        copyToState(statesAsLanes, A)
        return (const unsigned char *)curData0 - dataStart;
#endif
    }
    else {
        // unsigned int i;
        const unsigned char *dataStart = data;

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
            KeccakP1600times4_PermuteAll_12rounds(states);
            data += laneOffsetSerial*8;
            dataByteLen -= laneOffsetSerial*8;
        }
        return data - dataStart;
    }
}