
/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* ====================================================================
 *
 * When updating this file, also update chacha_vec_arm.S
 *
 * ==================================================================== */
/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and
 * marked as public domain. It has been altered to allow for non-aligned inputs
 * and to allow the block counter to be passed in specifically. */
#include <openssl/chacha.h>

#if defined(ASM_GEN) ||          \
    !defined(OPENSSL_WINDOWS) && \
        (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) && defined(__SSE2__)

#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */

/* Architecture-neutral way to specify 16-byte vector of ints */
typedef unsigned vec __attribute__((vector_size(16)));

/* This implementation is designed for Neon, SSE and AltiVec machines. The
 * following specify how to do certain vector operations efficiently on
 * each architecture, using intrinsics.
 * This implementation supports parallel processing of multiple blocks,
 * including potentially using general-purpose registers. */
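/* Note: |vec| uses the GCC/Clang vector extension, so +, ^, << and >> below
 * operate lane-wise on four 32-bit words; the ROT* macros build the rotations
 * ChaCha needs out of whichever instructions each target provides. */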
#if __ARM_NEON__
#include <string.h>
#include <arm_neon.h>
#define GPR_TOO 1
#define VBPI 2
#define ONE (vec) vsetq_lane_u32(1, vdupq_n_u32(0), 0)
#define LOAD_ALIGNED(m) (vec)(*((vec *)(m)))
#define LOAD(m) ({                   \
    memcpy(alignment_buffer, m, 16); \
    LOAD_ALIGNED(alignment_buffer);  \
  })
#define STORE(m, r) ({                    \
    (*((vec *)(alignment_buffer))) = (r); \
    memcpy(m, alignment_buffer, 16);      \
  })
#define ROTV1(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 1)
#define ROTV2(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 2)
#define ROTV3(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 3)
#define ROTW16(x) (vec) vrev32q_u16((uint16x8_t)x)
#if __clang__
#define ROTW7(x) (x << ((vec) {7, 7, 7, 7})) ^ (x >> ((vec) {25, 25, 25, 25}))
#define ROTW8(x) (x << ((vec) {8, 8, 8, 8})) ^ (x >> ((vec) {24, 24, 24, 24}))
#define ROTW12(x) \
  (x << ((vec) {12, 12, 12, 12})) ^ (x >> ((vec) {20, 20, 20, 20}))
#else
#define ROTW7(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 7), (uint32x4_t)x, 25)
#define ROTW8(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 8), (uint32x4_t)x, 24)
#define ROTW12(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 12), (uint32x4_t)x, 20)
#endif
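/* On GCC, each ROTW* above is a two-instruction rotate: VSHL shifts the word
 * left and VSRI shifts it right, inserting the low bits, which together form a
 * 32-bit rotate without relying on the vector-extension shift operators. */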
#elif __SSE2__
#include <emmintrin.h>
#define GPR_TOO 0
#if __clang__
#define VBPI 4
#else
#define VBPI 3
#endif
#define ONE (vec) _mm_set_epi32(0, 0, 0, 1)
#define LOAD(m) (vec) _mm_loadu_si128((__m128i *)(m))
#define LOAD_ALIGNED(m) (vec) _mm_load_si128((__m128i *)(m))
#define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r))
#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1))
#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2))
#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3))
#define ROTW7(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25))
#define ROTW12(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20))
#if __SSSE3__
#include <tmmintrin.h>
#define ROTW8(x)                                                            \
  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, \
                                                  11, 6, 5, 4, 7, 2, 1, 0, 3))
#define ROTW16(x)                                                           \
  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \
                                                  10, 5, 4, 7, 6, 1, 0, 3, 2))
#else
#define ROTW8(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24))
#define ROTW16(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16))
#endif
#else
#error -- Implementation supports only machines with neon or SSE2
#endif
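/* SSE2 has no packed-rotate instruction, so the ROTW* macros pair a left and a
 * right shift and XOR the halves together; with SSSE3, the byte-aligned
 * rotations by 8 and 16 become a single PSHUFB byte shuffle instead. */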
#ifndef REVV_BE
#define REVV_BE(x) (x)
#endif

#ifndef REVW_BE
#define REVW_BE(x) (x)
#endif

#define BPI (VBPI + GPR_TOO) /* Blocks computed per loop iteration */
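/* With the settings above, BPI is 3 on NEON (two vector blocks plus one block
 * kept in general-purpose registers), 4 on SSE2 with Clang and 3 with GCC, so
 * the main loop below consumes BPI * 64 bytes of input per iteration. */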
#define DQROUND_VECTORS(a,b,c,d)              \
    a += b; d ^= a; d = ROTW16(d);            \
    c += d; b ^= c; b = ROTW12(b);            \
    a += b; d ^= a; d = ROTW8(d);             \
    c += d; b ^= c; b = ROTW7(b);             \
    b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); \
    a += b; d ^= a; d = ROTW16(d);            \
    c += d; b ^= c; b = ROTW12(b);            \
    a += b; d ^= a; d = ROTW8(d);             \
    c += d; b ^= c; b = ROTW7(b);             \
    b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
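/* DQROUND_VECTORS is one ChaCha "double round" on a block held in four
 * vectors: the first four lines run the column quarter-rounds on all four
 * columns at once, the ROTV* line rotates the b/c/d rows so the same code then
 * acts on the diagonals, and the final ROTV* line rotates them back. */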
#define QROUND_WORDS(a,b,c,d)         \
  a = a+b; d ^= a; d = d<<16 | d>>16; \
  c = c+d; b ^= c; b = b<<12 | b>>20; \
  a = a+b; d ^= a; d = d<< 8 | d>>24; \
  c = c+d; b ^= c; b = b<< 7 | b>>25;
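/* QROUND_WORDS is the same quarter-round on plain 32-bit words; it is only
 * used when GPR_TOO is set, to compute one extra block in general-purpose
 * registers alongside the vector blocks. */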
#define WRITE_XOR(in, op, d, v0, v1, v2, v3)         \
  STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \
  STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \
  STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \
  STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
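/* WRITE_XOR XORs one finished 64-byte block of keystream (four vectors) into
 * the input at word offset |d| and stores the result to the output. */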
#if __ARM_NEON__
/* For ARM, we can't depend on NEON support, so this function is compiled with
 * a different name, along with the generic code, and can be enabled at
 * run-time. */
void CRYPTO_chacha_20_neon(
#else
void CRYPTO_chacha_20(
#endif
	uint8_t *out,
	const uint8_t *in,
	size_t inlen,
	const uint8_t key[32],
	const uint8_t nonce[12],
	uint32_t counter)
	{
	unsigned iters, i, *op=(unsigned *)out, *ip=(unsigned *)in, *kp;
#if defined(__ARM_NEON__)
	uint32_t np[3];
	uint8_t alignment_buffer[16] __attribute__((aligned(16)));
#endif
	vec s0, s1, s2, s3;
	__attribute__ ((aligned (16))) unsigned chacha_const[] =
		{0x61707865,0x3320646E,0x79622D32,0x6B206574};
	kp = (unsigned *)key;
#if defined(__ARM_NEON__)
	memcpy(np, nonce, 12);
#endif
	s0 = LOAD_ALIGNED(chacha_const);
	s1 = LOAD(&((vec*)kp)[0]);
	s2 = LOAD(&((vec*)kp)[1]);
	s3 = (vec){
		counter,
		((uint32_t*)nonce)[0],
		((uint32_t*)nonce)[1],
		((uint32_t*)nonce)[2]
	};
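	/* s0..s3 hold the initial block state in ChaCha's 4x4-word layout:
	 * s0 the "expand 32-byte k" constants, s1/s2 the 256-bit key, and s3
	 * the 32-bit block counter followed by the 96-bit nonce. */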
	for (iters = 0; iters < inlen/(BPI*64); iters++)
		{
#if GPR_TOO
		register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8,
				  x9, x10, x11, x12, x13, x14, x15;
#endif
#if VBPI > 2
		vec v8,v9,v10,v11;
#endif
#if VBPI > 3
		vec v12,v13,v14,v15;
#endif
		vec v0,v1,v2,v3,v4,v5,v6,v7;
		v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3;
		v7 = v3 + ONE;
#if VBPI > 2
		v8 = v4; v9 = v5; v10 = v6;
		v11 = v7 + ONE;
#endif
#if VBPI > 3
		v12 = v8; v13 = v9; v14 = v10;
		v15 = v11 + ONE;
#endif
#if GPR_TOO
		x0 = chacha_const[0]; x1 = chacha_const[1];
		x2 = chacha_const[2]; x3 = chacha_const[3];
		x4 = kp[0]; x5 = kp[1]; x6 = kp[2]; x7 = kp[3];
		x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7];
		x12 = counter+BPI*iters+(BPI-1); x13 = np[0];
		x14 = np[1]; x15 = np[2];
#endif
		for (i = CHACHA_RNDS/2; i; i--)
			{
			DQROUND_VECTORS(v0,v1,v2,v3)
			DQROUND_VECTORS(v4,v5,v6,v7)
#if VBPI > 2
			DQROUND_VECTORS(v8,v9,v10,v11)
#endif
#if VBPI > 3
			DQROUND_VECTORS(v12,v13,v14,v15)
#endif
#if GPR_TOO
			QROUND_WORDS( x0, x4, x8,x12)
			QROUND_WORDS( x1, x5, x9,x13)
			QROUND_WORDS( x2, x6,x10,x14)
			QROUND_WORDS( x3, x7,x11,x15)
			QROUND_WORDS( x0, x5,x10,x15)
			QROUND_WORDS( x1, x6,x11,x12)
			QROUND_WORDS( x2, x7, x8,x13)
			QROUND_WORDS( x3, x4, x9,x14)
#endif
			}
		WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
		s3 += ONE;
		WRITE_XOR(ip, op, 16, v4+s0, v5+s1, v6+s2, v7+s3)
		s3 += ONE;
#if VBPI > 2
		WRITE_XOR(ip, op, 32, v8+s0, v9+s1, v10+s2, v11+s3)
		s3 += ONE;
#endif
#if VBPI > 3
		WRITE_XOR(ip, op, 48, v12+s0, v13+s1, v14+s2, v15+s3)
		s3 += ONE;
#endif
		ip += VBPI*16;
		op += VBPI*16;
#if GPR_TOO
		op[0] = REVW_BE(REVW_BE(ip[0]) ^ (x0 + chacha_const[0]));
		op[1] = REVW_BE(REVW_BE(ip[1]) ^ (x1 + chacha_const[1]));
		op[2] = REVW_BE(REVW_BE(ip[2]) ^ (x2 + chacha_const[2]));
		op[3] = REVW_BE(REVW_BE(ip[3]) ^ (x3 + chacha_const[3]));
		op[4] = REVW_BE(REVW_BE(ip[4]) ^ (x4 + kp[0]));
		op[5] = REVW_BE(REVW_BE(ip[5]) ^ (x5 + kp[1]));
		op[6] = REVW_BE(REVW_BE(ip[6]) ^ (x6 + kp[2]));
		op[7] = REVW_BE(REVW_BE(ip[7]) ^ (x7 + kp[3]));
		op[8] = REVW_BE(REVW_BE(ip[8]) ^ (x8 + kp[4]));
		op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5]));
		op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));
		op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));
		op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1)));
		op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13 + np[0]));
		op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[1]));
		op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[2]));
		s3 += ONE;
		ip += 16;
		op += 16;
#endif
		}
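	/* Any remaining whole 64-byte blocks are processed one at a time with
	 * a single set of vectors, then a final partial block (if any) is
	 * generated into a stack buffer and XORed in byte by byte. */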
	for (iters = inlen%(BPI*64)/64; iters != 0; iters--)
		{
		vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;
		for (i = CHACHA_RNDS/2; i; i--)
			{
			DQROUND_VECTORS(v0,v1,v2,v3);
			}
		WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
		s3 += ONE;
		ip += 16;
		op += 16;
		}
	inlen = inlen % 64;
	if (inlen)
		{
		__attribute__ ((aligned (16))) vec buf[4];
		vec v0,v1,v2,v3;
		v0 = s0; v1 = s1; v2 = s2; v3 = s3;
		for (i = CHACHA_RNDS/2; i; i--)
			{
			DQROUND_VECTORS(v0,v1,v2,v3);
			}
		if (inlen >= 16)
			{
			STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));
			if (inlen >= 32)
				{
				STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));
				if (inlen >= 48)
					{
					STORE(op + 8, LOAD(ip + 8) ^
						      REVV_BE(v2 + s2));
					buf[3] = REVV_BE(v3 + s3);
					}
				else
					buf[2] = REVV_BE(v2 + s2);
				}
			else
				buf[1] = REVV_BE(v1 + s1);
			}
		else
			buf[0] = REVV_BE(v0 + s0);
		for (i = inlen & ~15; i < inlen; i++)
			((char *)op)[i] = ((char *)ip)[i] ^ ((char *)buf)[i];
		}
	}
#endif /* ASM_GEN || !OPENSSL_WINDOWS && (OPENSSL_X86_64 || OPENSSL_X86) && SSE2 */
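
/* Usage sketch (illustrative, not part of this file): with a 32-byte key and
 * a 12-byte nonce, a caller encrypts or decrypts |inlen| bytes with
 *
 *     CRYPTO_chacha_20(out, in, inlen, key, nonce, 0);
 *
 * where the final argument is the initial 32-bit block counter. ChaCha20 is a
 * stream cipher, so the same call performs both encryption and decryption. */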