/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* This implementation of poly1305 is by Andrew Moon
 * (https://github.com/floodyberry/poly1305-donna) and released as public
 * domain. It implements SIMD vectorization based on the algorithm described in
 * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
 * block size */

#include <openssl/poly1305.h>

#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define ALIGN(x) __attribute__((aligned(x)))
/* inline is not a keyword in C89. */
#define INLINE
#define U8TO64_LE(m) (*(uint64_t *)(m))
#define U8TO32_LE(m) (*(uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;
typedef unsigned __int128 uint128_t;

static const uint32_t ALIGN(16) poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_1shl128[4] = {(1 << 24), 0,
                                                                (1 << 24), 0};
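
/* Note: each constant above is laid out as two identical 64-bit lanes so one
 * SSE2 operation applies to both halves of the two-way state. The message
 * mask keeps the low 26 bits of each lane, 5 is the multiplier used when
 * folding a carry out of the top limb (since 2^130 = 5 mod 2^130 - 5), and
 * 1 << 24 sets bit 128 of a 16-byte block (the "1" padding bit) once the
 * block has been split into 26-bit limbs. */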

static uint128_t INLINE add128(uint128_t a, uint128_t b) { return a + b; }

static uint128_t INLINE add128_64(uint128_t a, uint64_t b) { return a + b; }

static uint128_t INLINE mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static uint64_t INLINE lo128(uint128_t a) { return (uint64_t)a; }

static uint64_t INLINE shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static uint64_t INLINE shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}
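
/* The helpers above are thin wrappers over the compiler's unsigned __int128
 * type; the scalar paths below use them for 64x64->128 multiplies and for
 * extracting 44-bit limbs via shifts. */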

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5]; /* 80 bytes */
    uint64_t HH[10];
  };
  /* uint64_t r0,r1,r2;       [24 bytes] */
  /* uint64_t pad0,pad1;      [16 bytes] */
  uint64_t started;          /* 8 bytes */
  uint64_t leftover;         /* 8 bytes */
  uint8_t buffer[64];        /* 64 bytes */
} poly1305_state_internal;   /* 448 bytes total + 63 bytes for
                                alignment = 511 bytes raw */

static poly1305_state_internal INLINE *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}
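
/* Note: poly1305_state is an opaque, caller-provided byte buffer with no
 * alignment guarantee; the helper above rounds the pointer up to the next
 * 64-byte boundary so that aligned SSE2 accesses to the internal state are
 * valid (hence the extra 63 bytes noted in the struct comment). */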

/* copy 0-63 bytes */
static void INLINE
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
  size_t offset = src - dst;
  if (bytes & 32) {
    _mm_storeu_si128((xmmi *)(dst + 0),
                     _mm_loadu_si128((xmmi *)(dst + offset + 0)));
    _mm_storeu_si128((xmmi *)(dst + 16),
                     _mm_loadu_si128((xmmi *)(dst + offset + 16)));
    dst += 32;
  }
  if (bytes & 16) {
    _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset)));
    dst += 16;
  }
  if (bytes & 8) {
    *(uint64_t *)dst = *(uint64_t *)(dst + offset);
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = *(uint32_t *)(dst + offset);
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
  }
}

/* zero 0-15 bytes */
static void INLINE poly1305_block_zero(uint8_t *dst, size_t bytes) {
  if (bytes & 8) {
    *(uint64_t *)dst = 0;
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = 0;
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = 0;
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = 0;
  }
}

static size_t INLINE poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}
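
/* CRYPTO_poly1305_init clamps r as required by Poly1305 (clearing the top
 * four bits of key bytes 3, 7, 11 and 15 and the low two bits of bytes 4, 8
 * and 12), but expressed on 44/44/42-bit limbs, which is why the masks below
 * differ from the familiar byte-wise clamp. r and the 16-byte pad are stashed
 * in otherwise unused 32-bit halves of st->P[1] until enough input arrives to
 * compute the SIMD powers. */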

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  /* clamp key */
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  /* store r in un-used space of st->P[1] */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  /* store pad */
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  /* H = 0 */
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}
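
/* poly1305_first_block raises r to the powers r^2 and r^4 on 44-bit limbs
 * using the scalar 128-bit helpers, converts those powers into the duplicated
 * 26-bit-limb layout used by the SSE2 path (r^4 in st->P[0], r^2 in
 * st->P[1]), restores the stashed r/pad values, and loads the first 32 bytes
 * of the message into the two-lane accumulator H. */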

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  /* pull out stored info */
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  /* compute powers r^2,r^4 */
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));

    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);

    p--;
  }

  /* put saved info back */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  /* H = [Mx,My] */
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                          _mm_loadl_epi64((xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                          _mm_loadl_epi64((xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}
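
/* poly1305_blocks consumes the message 64 bytes at a time. Each iteration
 * computes H = (H * [r^4, r^4] + [Mx, My] * [r^2, r^2] + [Mx', My']), with the
 * two 16-byte halves of each 32-byte chunk processed in parallel SSE2 lanes,
 * followed by a partial carry chain that keeps every 26-bit limb small enough
 * for the next round of 32x32->64 multiplies. */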

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    /* H *= [r^4,r^4] */
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My]*[r^2,r^2] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 32)),
                            _mm_loadl_epi64((xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 40)),
                            _mm_loadl_epi64((xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}
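
/* poly1305_combine absorbs up to one final 32-byte chunk, multiplies the two
 * lanes by [r^2, r] so both end up at the same power of r, folds the lanes
 * together (H = H[0] + H[1]), and stores the result in st->HH as three
 * 44/44/42-bit limbs for the scalar finish below. It returns the number of
 * buffered bytes it consumed. */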

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  /* p = [r^2,r^2] */
  p = &st->P[1];

  if (bytes >= 32) {
    /* H *= [r^2,r^2] */
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  /* finalize, H *= [r^2,r] */
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  /* H *= [r^2,r] */
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  /* H = H[0]+H[1] */
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & 0xfffffffffffull;
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & 0xfffffffffffull;
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & 0x3ffffffffffull;

  return consumed;
}
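
/* CRYPTO_poly1305_update buffers input until 32 bytes are available to prime
 * the two-lane state via poly1305_first_block, then streams full 64-byte
 * blocks through poly1305_blocks; anything shorter is held in st->buffer for
 * the next call or for the finish step. */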

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  /* need at least 32 initial bytes to start the accelerated branch */
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      poly1305_block_copy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  /* handle leftover */
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    poly1305_block_copy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  /* process 64 byte blocks */
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}
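
/* CRYPTO_poly1305_finish runs the remaining buffered bytes through the plain
 * 64-bit scalar path (44/44/42-bit limbs), pads the final partial block with
 * a 1 byte followed by zeros, performs the final reduction modulo 2^130 - 5,
 * adds the 128-bit pad from the key, and emits the 16-byte tag. */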

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  /* st->HH will either be 0 or have the combined result */
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

/* final bytes */
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  poly1305_block_zero(m + leftover, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  /* pad */
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */