/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size
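//
// Representation notes (as used throughout this file): the scalar key setup
// and finalization work on three 44/44/42-bit limbs held in 64-bit words,
// while the SIMD bulk path re-expresses r, its powers and the accumulator H
// as five 26-bit limbs, packing two independent lanes into each 128-bit
// register so that two 16-byte blocks are processed in parallel.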

#include <openssl/poly1305.h>

#include "../internal.h"

#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define U8TO64_LE(m) (*(const uint64_t *)(m))
#define U8TO32_LE(m) (*(const uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};
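
// The constants above are loaded into xmm registers with _mm_load_si128:
// poly1305_x64_sse2_message_mask keeps the low 26 bits of each 64-bit lane,
// poly1305_x64_sse2_5 is the multiplier used when folding a carry out of the
// top limb (2^130 == 5 mod p), and poly1305_x64_sse2_1shl128 is 1 << 24 in
// limb position 4 of each lane, i.e. the 2^128 padding bit of a full 16-byte
// block (4 * 26 + 24 = 128).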

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  // 80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2;       [24 bytes]
  // uint64_t pad0,pad1;      [16 bytes]
  uint64_t started;    // 8 bytes
  uint64_t leftover;   // 8 bytes
  uint8_t buffer[64];  // 64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}
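
// poly1305_state, declared in <openssl/poly1305.h>, is an opaque byte buffer
// sized larger than poly1305_state_internal; rounding its address up to a
// 64-byte boundary, as above, yields storage that is sufficiently aligned for
// the SSE2 loads and stores on the xmmi members.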

// copy 0-63 bytes
static inline void poly1305_block_copy(uint8_t *dst, const uint8_t *src,
                                       size_t bytes) {
  size_t offset = src - dst;
  if (bytes & 32) {
    _mm_storeu_si128((xmmi *)(dst + 0),
                     _mm_loadu_si128((const xmmi *)(dst + offset + 0)));
    _mm_storeu_si128((xmmi *)(dst + 16),
                     _mm_loadu_si128((const xmmi *)(dst + offset + 16)));
    dst += 32;
  }
  if (bytes & 16) {
    _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((const xmmi *)(dst + offset)));
    dst += 16;
  }
  if (bytes & 8) {
    *(uint64_t *)dst = *(const uint64_t *)(dst + offset);
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = *(const uint32_t *)(dst + offset);
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
  }
}

// zero 0-15 bytes
static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) {
  if (bytes & 8) {
    *(uint64_t *)dst = 0;
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = 0;
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = 0;
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = 0;
  }
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}
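
// CRYPTO_poly1305_init() clamps r as required by Poly1305
// (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, expressed here on 44/44/42-bit
// limbs via the masks 0xffc0fffffff, 0xfffffc0ffff and 0x00ffffffc0f) and
// stashes r and the 16-byte pad in the otherwise unused odd 32-bit slots of
// st->P[1] until poly1305_first_block() computes the vector powers.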
void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  // store r in un-used space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));

    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);

    p--;
  }
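
  // The loop above first squares r and stores r^2 (split into five 26-bit
  // limbs and broadcast to both lanes) in P[1], then squares again and stores
  // r^4 in P[0]. The S2x values cache 5*R2x so the reduction modulo
  // 2^130 - 5 can be folded into the multiplies.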

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];
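
  // Main two-block loop: each iteration absorbs 64 bytes as
  // H = H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [M'x,M'y], where the two lanes hold
  // interleaved 16-byte blocks, followed by one lazy carry pass that keeps
  // every limb small enough for the next round of 32x32->64-bit multiplies.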
  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}
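
// poly1305_combine() finishes the SIMD part of the computation: it absorbs one
// last 32-byte chunk if available, then multiplies the two lanes by [r^2, r]
// (the d[2] stores below drop r's limbs into the second lane), folds the lanes
// together with the _mm_srli_si128 additions, and repacks the five 26-bit
// limbs into the three 44/44/42-bit limbs st->HH[0..2] consumed by
// CRYPTO_poly1305_finish().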
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;
  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}
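
// CRYPTO_poly1305_update() buffers input until the 32 bytes needed to enter
// the vectorized path are available (poly1305_first_block), then streams whole
// 64-byte chunks through poly1305_blocks() and keeps any remainder in
// st->buffer for a later call or for CRYPTO_poly1305_finish().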
void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // need at least 32 initial bytes to start the accelerated branch
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      poly1305_block_copy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    poly1305_block_copy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}
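
// CRYPTO_poly1305_finish() folds the two SIMD lanes into three 44/44/42-bit
// scalar limbs (via poly1305_combine, when the vector path was used), absorbs
// any buffered tail blocks with the 64-bit donna code below, performs the
// final reduction modulo 2^130 - 5, adds the 128-bit pad, and writes the
// 16-byte tag.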
void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);

  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

// final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  poly1305_block_zero(m + leftover, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);

  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // !OPENSSL_WINDOWS && OPENSSL_X86_64
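
// Example usage (a sketch, not part of the library): the one-shot flow with
// the public API from <openssl/poly1305.h>. Note that a Poly1305 key must be
// used for at most one message.
//
//   poly1305_state st;
//   uint8_t tag[16];
//   CRYPTO_poly1305_init(&st, key);             // key is a const uint8_t[32]
//   CRYPTO_poly1305_update(&st, msg, msg_len);  // may be called repeatedly
//   CRYPTO_poly1305_finish(&st, tag);           // writes the 16-byte MAC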