/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size
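//
// In outline (a sketch derived from the block structure below, not a formal
// spec): the accumulator H holds two interleaved Poly1305 lanes, each kept in
// five 26-bit limbs in the low halves of the xmm registers. Every 64-byte
// iteration folds two pairs of 16-byte blocks as
//   H = H*[r^4, r^4] + [Mx, My]*[r^2, r^2] + [Mx', My']
// and the final combine multiplies the lanes by [r^2, r] and adds them, so the
// horizontal sum matches what the serial recurrence
//   H = (H + m) * r  mod 2^130 - 5
// would have produced, before the key pad is added in the finish step.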
#include <openssl/poly1305.h>

#include "../internal.h"

#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define U8TO64_LE(m) (*(const uint64_t *)(m))
#define U8TO32_LE(m) (*(const uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}
typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  // 80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0, r1, r2;    [24 bytes]
  // uint64_t pad0, pad1;    [16 bytes]
  uint64_t started;        // 8 bytes
  uint64_t leftover;       // 8 bytes
  uint8_t buffer[64];      // 64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}
void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  // store r in un-used space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}
static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));

    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);

    p--;
  }

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}
static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));

  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}
void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // need at least 32 initial bytes to start the accelerated branch
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}
void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

// final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // !OPENSSL_WINDOWS && OPENSSL_X86_64
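
// A minimal usage sketch of the one-shot MAC flow built on the three entry
// points defined above; the key/msg/msg_len/mac names here are illustrative
// caller-side variables, not part of this file:
//
//   #include <openssl/poly1305.h>
//
//   void poly1305_mac_example(const uint8_t key[32], const uint8_t *msg,
//                             size_t msg_len, uint8_t mac[16]) {
//     poly1305_state st;
//     CRYPTO_poly1305_init(&st, key);             // clamp r, store pad, H = 0
//     CRYPTO_poly1305_update(&st, msg, msg_len);  // may be called repeatedly
//     CRYPTO_poly1305_finish(&st, mac);           // writes the 16-byte tag
//   }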