/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* ====================================================================
 *
 * When updating this file, also update chacha_vec_arm.S
 *
 * ==================================================================== */
/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and
 * marked as public domain. It has been altered to allow for non-aligned inputs
 * and to allow the block counter to be passed in specifically. */
#include <openssl/chacha.h>

#if defined(ASM_GEN) ||          \
    !defined(OPENSSL_WINDOWS) && \
        (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) && defined(__SSE2__)

#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */

/* Architecture-neutral way to specify 16-byte vector of ints */
typedef unsigned vec __attribute__((vector_size(16)));

/* This implementation is designed for Neon, SSE and AltiVec machines. The
 * following specify how to do certain vector operations efficiently on
 * each architecture, using intrinsics.
 * This implementation supports parallel processing of multiple blocks,
 * including potentially using general-purpose registers. */
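/* GPR_TOO, defined per architecture below, says whether one extra block is
 * computed in general-purpose registers alongside the vector blocks; VBPI is
 * the number of blocks held in vector registers per loop iteration. */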
#if __ARM_NEON__
#include <string.h>
#include <arm_neon.h>
#define GPR_TOO 1
#define VBPI 2
#define ONE (vec) vsetq_lane_u32(1, vdupq_n_u32(0), 0)
#define LOAD_ALIGNED(m) (vec)(*((vec *)(m)))
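/* The input and output pointers are not necessarily 16-byte aligned, so the
 * unaligned LOAD/STORE below bounce through alignment_buffer, a 16-byte-aligned
 * stack buffer declared in the function further down. */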
#define LOAD(m) ({                   \
    memcpy(alignment_buffer, m, 16); \
    LOAD_ALIGNED(alignment_buffer);  \
  })
#define STORE(m, r) ({                    \
    (*((vec *)(alignment_buffer))) = (r); \
    memcpy(m, alignment_buffer, 16);      \
  })
#define ROTV1(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 1)
#define ROTV2(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 2)
#define ROTV3(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 3)
#define ROTW16(x) (vec) vrev32q_u16((uint16x8_t)x)
#if __clang__
#define ROTW7(x) (x << ((vec) {7, 7, 7, 7})) ^ (x >> ((vec) {25, 25, 25, 25}))
#define ROTW8(x) (x << ((vec) {8, 8, 8, 8})) ^ (x >> ((vec) {24, 24, 24, 24}))
#define ROTW12(x) \
  (x << ((vec) {12, 12, 12, 12})) ^ (x >> ((vec) {20, 20, 20, 20}))
#else
#define ROTW7(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 7), (uint32x4_t)x, 25)
#define ROTW8(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 8), (uint32x4_t)x, 24)
#define ROTW12(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 12), (uint32x4_t)x, 20)
#endif
#elif __SSE2__
#include <emmintrin.h>
#define GPR_TOO 0
#if __clang__
#define VBPI 4
#else
#define VBPI 3
#endif
#define ONE (vec) _mm_set_epi32(0, 0, 0, 1)
#define LOAD(m) (vec) _mm_loadu_si128((__m128i *)(m))
#define LOAD_ALIGNED(m) (vec) _mm_load_si128((__m128i *)(m))
#define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r))
#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1))
#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2))
#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3))
#define ROTW7(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25))
#define ROTW12(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20))
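/* With SSSE3, the rotations by 8 and 16 bits can each be done with a single
 * byte shuffle instead of a shift/shift/XOR sequence. */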
#if __SSSE3__
#include <tmmintrin.h>
#define ROTW8(x)                                                            \
  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, \
                                                  11, 6, 5, 4, 7, 2, 1, 0, 3))
#define ROTW16(x)                                                           \
  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \
                                                  10, 5, 4, 7, 6, 1, 0, 3, 2))
#else
#define ROTW8(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24))
#define ROTW16(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16))
#endif
#else
#error -- Implementation supports only machines with NEON or SSE2
#endif

#ifndef REVV_BE
#define REVV_BE(x) (x)
#endif

#ifndef REVW_BE
#define REVW_BE(x) (x)
#endif
#define BPI (VBPI + GPR_TOO) /* Blocks computed per loop iteration */

#define DQROUND_VECTORS(a,b,c,d)            \
  a += b; d ^= a; d = ROTW16(d);            \
  c += d; b ^= c; b = ROTW12(b);            \
  a += b; d ^= a; d = ROTW8(d);             \
  c += d; b ^= c; b = ROTW7(b);             \
  b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); \
  a += b; d ^= a; d = ROTW16(d);            \
  c += d; b ^= c; b = ROTW12(b);            \
  a += b; d ^= a; d = ROTW8(d);             \
  c += d; b ^= c; b = ROTW7(b);             \
  b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
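/* DQROUND_VECTORS is one ChaCha double round on a block held as four vectors:
 * a column round, a lane rotation that brings the diagonals into the columns,
 * a second round, and the inverse lane rotation. */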
#define QROUND_WORDS(a,b,c,d)         \
  a = a+b; d ^= a; d = d<<16 | d>>16; \
  c = c+d; b ^= c; b = b<<12 | b>>20; \
  a = a+b; d ^= a; d = d<< 8 | d>>24; \
  c = c+d; b ^= c; b = b<< 7 | b>>25;
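/* QROUND_WORDS is the same quarter round on scalar words; it is used only for
 * the extra block kept in general-purpose registers when GPR_TOO is set. */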
  125. #define WRITE_XOR(in, op, d, v0, v1, v2, v3) \
  126. STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \
  127. STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \
  128. STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \
  129. STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
  130. #if __ARM_NEON__
  131. /* For ARM, we can't depend on NEON support, so this function is compiled with
  132. * a different name, along with the generic code, and can be enabled at
  133. * run-time. */
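/* A caller is expected to choose between the two entry points at run time.
 * A minimal dispatch sketch (assuming a NEON capability check such as
 * CRYPTO_is_NEON_capable(), which lives elsewhere in the library and is not
 * part of this file):
 *
 *   if (CRYPTO_is_NEON_capable())
 *     CRYPTO_chacha_20_neon(out, in, inlen, key, nonce, counter);
 *   else
 *     CRYPTO_chacha_20(out, in, inlen, key, nonce, counter);
 */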
void CRYPTO_chacha_20_neon(
#else
void CRYPTO_chacha_20(
#endif
    uint8_t *out,
    const uint8_t *in,
    size_t inlen,
    const uint8_t key[32],
    const uint8_t nonce[8],
    size_t counter) {
  unsigned iters, i, *op = (unsigned *)out, *ip = (unsigned *)in, *kp;
#if defined(__ARM_NEON__)
  uint32_t np[2];
  uint8_t alignment_buffer[16] __attribute__((aligned(16)));
#endif
  vec s0, s1, s2, s3;
  __attribute__((aligned(16))) unsigned chacha_const[] =
      {0x61707865, 0x3320646E, 0x79622D32, 0x6B206574};
  kp = (unsigned *)key;
#if defined(__ARM_NEON__)
  memcpy(np, nonce, 8);
#endif
  s0 = LOAD_ALIGNED(chacha_const);
  s1 = LOAD(&((vec *)kp)[0]);
  s2 = LOAD(&((vec *)kp)[1]);
  s3 = (vec){
    counter & 0xffffffff,
#if __ARM_NEON__ || defined(OPENSSL_X86)
    0, /* can't right-shift 32 bits on a 32-bit system. */
#else
    counter >> 32,
#endif
    ((uint32_t *)nonce)[0],
    ((uint32_t *)nonce)[1]
  };
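  /* The 4x4 ChaCha state: s0 holds the "expand 32-byte k" constants, s1 and
   * s2 the 256-bit key, and s3 the 64-bit block counter followed by the
   * 8-byte nonce. */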
  for (iters = 0; iters < inlen / (BPI * 64); iters++) {
#if GPR_TOO
    register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8,
        x9, x10, x11, x12, x13, x14, x15;
#endif
#if VBPI > 2
    vec v8, v9, v10, v11;
#endif
#if VBPI > 3
    vec v12, v13, v14, v15;
#endif
    vec v0, v1, v2, v3, v4, v5, v6, v7;
    v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3;
    v7 = v3 + ONE;
#if VBPI > 2
    v8 = v4; v9 = v5; v10 = v6;
    v11 = v7 + ONE;
#endif
#if VBPI > 3
    v12 = v8; v13 = v9; v14 = v10;
    v15 = v11 + ONE;
#endif
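    /* All vector blocks share the constants and key; they differ only in the
     * counter word of the fourth row, hence the repeated +ONE. The optional
     * GPR block below computes the last block of this group, with counter
     * value counter + BPI*iters + (BPI-1). */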
#if GPR_TOO
    x0 = chacha_const[0]; x1 = chacha_const[1];
    x2 = chacha_const[2]; x3 = chacha_const[3];
    x4 = kp[0]; x5 = kp[1]; x6 = kp[2]; x7 = kp[3];
    x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7];
    x12 = counter + BPI * iters + (BPI - 1); x13 = 0;
    x14 = np[0]; x15 = np[1];
#endif
    for (i = CHACHA_RNDS / 2; i; i--) {
      DQROUND_VECTORS(v0, v1, v2, v3)
      DQROUND_VECTORS(v4, v5, v6, v7)
#if VBPI > 2
      DQROUND_VECTORS(v8, v9, v10, v11)
#endif
#if VBPI > 3
      DQROUND_VECTORS(v12, v13, v14, v15)
#endif
#if GPR_TOO
      QROUND_WORDS(x0, x4, x8, x12)
      QROUND_WORDS(x1, x5, x9, x13)
      QROUND_WORDS(x2, x6, x10, x14)
      QROUND_WORDS(x3, x7, x11, x15)
      QROUND_WORDS(x0, x5, x10, x15)
      QROUND_WORDS(x1, x6, x11, x12)
      QROUND_WORDS(x2, x7, x8, x13)
      QROUND_WORDS(x3, x4, x9, x14)
#endif
    }
    WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3)
    s3 += ONE;
    WRITE_XOR(ip, op, 16, v4 + s0, v5 + s1, v6 + s2, v7 + s3)
    s3 += ONE;
#if VBPI > 2
    WRITE_XOR(ip, op, 32, v8 + s0, v9 + s1, v10 + s2, v11 + s3)
    s3 += ONE;
#endif
#if VBPI > 3
    WRITE_XOR(ip, op, 48, v12 + s0, v13 + s1, v14 + s2, v15 + s3)
    s3 += ONE;
#endif
    ip += VBPI * 16;
    op += VBPI * 16;
#if GPR_TOO
    op[0] = REVW_BE(REVW_BE(ip[0]) ^ (x0 + chacha_const[0]));
    op[1] = REVW_BE(REVW_BE(ip[1]) ^ (x1 + chacha_const[1]));
    op[2] = REVW_BE(REVW_BE(ip[2]) ^ (x2 + chacha_const[2]));
    op[3] = REVW_BE(REVW_BE(ip[3]) ^ (x3 + chacha_const[3]));
    op[4] = REVW_BE(REVW_BE(ip[4]) ^ (x4 + kp[0]));
    op[5] = REVW_BE(REVW_BE(ip[5]) ^ (x5 + kp[1]));
    op[6] = REVW_BE(REVW_BE(ip[6]) ^ (x6 + kp[2]));
    op[7] = REVW_BE(REVW_BE(ip[7]) ^ (x7 + kp[3]));
    op[8] = REVW_BE(REVW_BE(ip[8]) ^ (x8 + kp[4]));
    op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5]));
    op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));
    op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));
    op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter + BPI * iters + (BPI - 1)));
    op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13));
    op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0]));
    op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1]));
    s3 += ONE;
    ip += 16;
    op += 16;
#endif
  }
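  /* Handle any remaining whole 64-byte blocks one at a time. */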
  for (iters = inlen % (BPI * 64) / 64; iters != 0; iters--) {
    vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;
    for (i = CHACHA_RNDS / 2; i; i--) {
      DQROUND_VECTORS(v0, v1, v2, v3);
    }
    WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3)
    s3 += ONE;
    ip += 16;
    op += 16;
  }
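  /* Handle a final partial block: whole 16-byte pieces of keystream are XORed
   * directly into the output, while the last (partial) 16 bytes are staged in
   * buf and XORed byte by byte. */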
  inlen = inlen % 64;
  if (inlen) {
    __attribute__((aligned(16))) vec buf[4];
    vec v0, v1, v2, v3;
    v0 = s0; v1 = s1; v2 = s2; v3 = s3;
    for (i = CHACHA_RNDS / 2; i; i--) {
      DQROUND_VECTORS(v0, v1, v2, v3);
    }
    if (inlen >= 16) {
      STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));
      if (inlen >= 32) {
        STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));
        if (inlen >= 48) {
          STORE(op + 8, LOAD(ip + 8) ^ REVV_BE(v2 + s2));
          buf[3] = REVV_BE(v3 + s3);
        } else {
          buf[2] = REVV_BE(v2 + s2);
        }
      } else {
        buf[1] = REVV_BE(v1 + s1);
      }
    } else {
      buf[0] = REVV_BE(v0 + s0);
    }
    for (i = inlen & ~15; i < inlen; i++) {
      ((char *)op)[i] = ((char *)ip)[i] ^ ((char *)buf)[i];
    }
  }
}

#endif /* ASM_GEN || !OPENSSL_WINDOWS && (OPENSSL_X86_64 || OPENSSL_X86) && SSE2 */
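
/* Example usage (an illustrative sketch, not part of this file): encrypt
 * 1 KiB with a 256-bit key, an 8-byte nonce and an initial block counter of
 * zero. The same call decrypts, since ChaCha20 simply XORs its keystream with
 * the input.
 *
 *   uint8_t key[32], nonce[8];
 *   uint8_t plaintext[1024], ciphertext[1024];
 *   // ... fill key, nonce and plaintext ...
 *   CRYPTO_chacha_20(ciphertext, plaintext, sizeof(plaintext),
 *                    key, nonce, 0);
 */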