Browse Source

Optimized Seal/Open routines for ChaCha20-Poly1305 for x86-64

This is basically the same implementation I wrote for Go

The Go implementation:
https://github.com/golang/crypto/blob/master/chacha20poly1305/chacha20poly1305_amd64.s
The Cloudflare patch for OpenSSL:
https://github.com/cloudflare/sslconfig/blob/master/patches/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch

The Seal/Open is only available for the new version, the old one uses
the bundled Poly1305, and the existing ChaCha20 implementations

The benefits of this code, compared to the optimized code currently
disabled in BoringSSL:

* Passes test vectors
* Faster performance: The AVX2 code (on Haswell), is 55% faster for 16B,
  15% for 1350 and 6% for 8192 byte buffers
* Even faster on pre-AVX2 CPUs

Feel free to put whatever license, etc. is appropriate, under the
existing CLA.

Benchmarks are for 16/1350/8192 chunk sizes and given in MB/s:

Before (Ivy Bridge): 34.2   589.5  739.4
After:               68.4   692.1  799.4
Before (Skylake):    50    1233   1649
After:              119.4  1736   2196
After (Andy's):      63.6  1608   2261

Change-Id: I9186f721812655011fc17698b67ddbe8a1c7203b
Reviewed-on: https://boringssl-review.googlesource.com/13142
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
kris/onging/CECPQ3_patch15
vkrasnov 7 years ago
committed by Adam Langley
parent
commit
8d56558031
3 changed files with 2441 additions and 6 deletions
  1. +12
    -0
      crypto/cipher/CMakeLists.txt
  2. +2371
    -0
      crypto/cipher/asm/chacha20_poly1305_x86_64.pl
  3. +58
    -6
      crypto/cipher/e_chacha20poly1305.c

+ 12
- 0
crypto/cipher/CMakeLists.txt View File

@@ -1,5 +1,13 @@
include_directories(../../include)

if (${ARCH} STREQUAL "x86_64")
set(
CIPHER_ARCH_SOURCES

chacha20_poly1305_x86_64.${ASM_EXT}
)
endif()

add_library(
cipher

@@ -19,6 +27,8 @@ add_library(
tls_cbc.c
e_tls.c
e_ssl3.c

${CIPHER_ARCH_SOURCES}
)

add_executable(
@@ -35,6 +45,8 @@ add_executable(
$<TARGET_OBJECTS:test_support>
)

perlasm(chacha20_poly1305_x86_64.${ASM_EXT} asm/chacha20_poly1305_x86_64.pl)

target_link_libraries(cipher_test crypto)
target_link_libraries(aead_test crypto)
add_dependencies(all_tests cipher_test aead_test)

+ 2371
- 0
crypto/cipher/asm/chacha20_poly1305_x86_64.pl
File diff suppressed because it is too large
View File


+ 58
- 6
crypto/cipher/e_chacha20poly1305.c View File

@@ -33,6 +33,42 @@ struct aead_chacha20_poly1305_ctx {
unsigned char tag_len;
};

#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
!defined(OPENSSL_WINDOWS)
static const int kHaveAsm = 1;
// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It
// decrypts |plaintext_len| bytes from |ciphertext| and writes them to
// |out_plaintext|. On entry, |aead_data| must contain the final 48 bytes of
// the initial ChaCha20 block, i.e. the key, followed by four zeros, followed
// by the nonce. On exit, it will contain the calculated tag value, which the
// caller must check.
void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext,
size_t plaintext_len, const uint8_t *ad,
size_t ad_len, uint8_t *aead_data);

// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It
// encrypts |plaintext_len| bytes from |plaintext| and writes them to
// |out_ciphertext|. On entry, |aead_data| must contain the final 48 bytes of
// the initial ChaCha20 block, i.e. the key, followed by four zeros, followed
// by the nonce. On exit, it will contain the calculated tag value, which the
// caller must append to the ciphertext.
void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext,
size_t plaintext_len, const uint8_t *ad,
size_t ad_len, uint8_t *aead_data);
#else
static const int kHaveAsm = 0;

static void chacha20_poly1305_open(uint8_t *out_plaintext,
const uint8_t *ciphertext,
size_t plaintext_len, const uint8_t *ad,
size_t ad_len, uint8_t *aead_data) {}

static void chacha20_poly1305_seal(uint8_t *out_ciphertext,
const uint8_t *plaintext,
size_t plaintext_len, const uint8_t *ad,
size_t ad_len, uint8_t *aead_data) {}
#endif

static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t tag_len) {
struct aead_chacha20_poly1305_ctx *c20_ctx;
@@ -143,10 +179,17 @@ static int aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, uint8_t *out,
return 0;
}

CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1);
alignas(16) uint8_t tag[48];

alignas(16) uint8_t tag[POLY1305_TAG_LEN];
calc_tag(tag, c20_ctx, nonce, ad, ad_len, out, in_len);
if (kHaveAsm) {
OPENSSL_memcpy(tag, c20_ctx->key, 32);
OPENSSL_memset(tag + 32, 0, 4);
OPENSSL_memcpy(tag + 32 + 4, nonce, 12);
chacha20_poly1305_seal(out, in, in_len, ad, ad_len, tag);
} else {
CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1);
calc_tag(tag, c20_ctx, nonce, ad, ad_len, out, in_len);
}

OPENSSL_memcpy(out + in_len, tag, c20_ctx->tag_len);
*out_len = in_len + c20_ctx->tag_len;
@@ -184,14 +227,23 @@ static int aead_chacha20_poly1305_open(const EVP_AEAD_CTX *ctx, uint8_t *out,
}

plaintext_len = in_len - c20_ctx->tag_len;
alignas(16) uint8_t tag[POLY1305_TAG_LEN];
calc_tag(tag, c20_ctx, nonce, ad, ad_len, in, plaintext_len);
alignas(16) uint8_t tag[48];

if (kHaveAsm) {
OPENSSL_memcpy(tag, c20_ctx->key, 32);
OPENSSL_memset(tag + 32, 0, 4);
OPENSSL_memcpy(tag + 32 + 4, nonce, 12);
chacha20_poly1305_open(out, in, plaintext_len, ad, ad_len, tag);
} else {
calc_tag(tag, c20_ctx, nonce, ad, ad_len, in, plaintext_len);
CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce, 1);
}

if (CRYPTO_memcmp(tag, in + plaintext_len, c20_ctx->tag_len) != 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;
}

CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce, 1);
*out_len = plaintext_len;
return 1;
}


Loading…
Cancel
Save