From 3c4a5cbb71453c5d61314a3f76a5ca6f123dbf94 Mon Sep 17 00:00:00 2001
From: David Benjamin
Date: Tue, 29 Mar 2016 17:43:31 -0400
Subject: [PATCH] Revert "Enable upstream's Poly1305 code."

This reverts commit 6f0c4db90e47437ed87a2d385c7797e692a2cf65 except for the
imported assembly files, which are left as-is but unused. Until upstream fixes
https://rt.openssl.org/Ticket/Display.html?id=4483, we shouldn't ship this
code. Once that bug has been fixed, we'll restore it.

Change-Id: I74aea18ce31a4b79657d04f8589c18d6b17f1578
Reviewed-on: https://boringssl-review.googlesource.com/7602
Reviewed-by: Emily Stark (Dunn)
Reviewed-by: David Benjamin
---
 crypto/poly1305/CMakeLists.txt     |   33 +-
 crypto/poly1305/internal.h         |   40 +
 crypto/poly1305/poly1305.c         |  380 +++---
 crypto/poly1305/poly1305_arm.c     |  304 +++++
 crypto/poly1305/poly1305_arm_asm.S | 2015 ++++++++++++++++++++++++++++
 crypto/poly1305/poly1305_vec.c     |  890 ++++++++++++
 include/openssl/poly1305.h         |    5 +-
 util/generate_build_files.py       |    1 +
 8 files changed, 3443 insertions(+), 225 deletions(-)
 create mode 100644 crypto/poly1305/internal.h
 create mode 100644 crypto/poly1305/poly1305_arm.c
 create mode 100644 crypto/poly1305/poly1305_arm_asm.S
 create mode 100644 crypto/poly1305/poly1305_vec.c

diff --git a/crypto/poly1305/CMakeLists.txt b/crypto/poly1305/CMakeLists.txt
index 3dd40a9a..1b6a8049 100644
--- a/crypto/poly1305/CMakeLists.txt
+++ b/crypto/poly1305/CMakeLists.txt
@@ -4,31 +4,7 @@ if (${ARCH} STREQUAL "arm")
   set(
     POLY1305_ARCH_SOURCES
 
-    poly1305-armv4.${ASM_EXT}
-  )
-endif()
-
-if (${ARCH} STREQUAL "aarch64")
-  set(
-    POLY1305_ARCH_SOURCES
-
-    poly1305-armv8.${ASM_EXT}
-  )
-endif()
-
-if (${ARCH} STREQUAL "x86")
-  set(
-    POLY1305_ARCH_SOURCES
-
-    poly1305-x86.${ASM_EXT}
-  )
-endif()
-
-if (${ARCH} STREQUAL "x86_64")
-  set(
-    POLY1305_ARCH_SOURCES
-
-    poly1305-x86_64.${ASM_EXT}
+    poly1305_arm_asm.S
   )
 endif()
 
@@ -38,6 +14,8 @@ add_library(
   OBJECT
 
   poly1305.c
+  poly1305_arm.c
+  poly1305_vec.c
 
   ${POLY1305_ARCH_SOURCES}
 )
@@ -51,8 +29,3 @@ add_executable(
 target_link_libraries(poly1305_test crypto)
 
 add_dependencies(all_tests poly1305_test)
-
-perlasm(poly1305-armv4.${ASM_EXT} asm/poly1305-armv4.pl)
-perlasm(poly1305-armv8.${ASM_EXT} asm/poly1305-armv8.pl)
-perlasm(poly1305-x86.${ASM_EXT} asm/poly1305-x86.pl)
-perlasm(poly1305-x86_64.${ASM_EXT} asm/poly1305-x86_64.pl)
diff --git a/crypto/poly1305/internal.h b/crypto/poly1305/internal.h
new file mode 100644
index 00000000..df6769ea
--- /dev/null
+++ b/crypto/poly1305/internal.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/ + +#ifndef OPENSSL_HEADER_POLY1305_INTERNAL_H +#define OPENSSL_HEADER_POLY1305_INTERNAL_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) +void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]); + +void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in, + size_t in_len); + +void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]); +#endif + + +#if defined(__cplusplus) +} /* extern C */ +#endif + +#endif /* OPENSSL_HEADER_POLY1305_INTERNAL_H */ diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c index de87c5bc..dc2d6a68 100644 --- a/crypto/poly1305/poly1305.c +++ b/crypto/poly1305/poly1305.c @@ -18,36 +18,16 @@ #include -#include #include -#include +#include -#include "../internal.h" +#include "internal.h" -#define POLY1305_BLOCK_STATE_SIZE 192 - -typedef void (*poly1305_blocks_t)(void *ctx, const uint8_t *in, size_t len, - uint32_t padbit); -typedef void (*poly1305_emit_t)(void *ctx, uint8_t mac[16], - const uint32_t nonce[4]); - -struct poly1305_state_st { - alignas(8) uint8_t opaque[POLY1305_BLOCK_STATE_SIZE]; - uint32_t nonce[4]; - uint8_t buf[16]; - unsigned buf_used; - struct { - poly1305_blocks_t blocks; - poly1305_emit_t emit; - } func; -}; - -OPENSSL_COMPILE_ASSERT(sizeof(poly1305_state) >= - sizeof(struct poly1305_state_st), - poly1305_state_too_small); +#if defined(OPENSSL_WINDOWS) || !defined(OPENSSL_X86_64) +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM) /* We can assume little-endian. */ static uint32_t U8TO32_LE(const uint8_t *m) { uint32_t r; @@ -55,57 +35,140 @@ static uint32_t U8TO32_LE(const uint8_t *m) { return r; } -#if !defined(OPENSSL_NO_ASM) -#if defined(OPENSSL_X86) -/* See comment above |_poly1305_init_sse2| in poly1305-x86.pl. */ -OPENSSL_COMPILE_ASSERT(POLY1305_BLOCK_STATE_SIZE >= 4 * (5 + 1 + 4 + 2 + 4 * 9), - poly1305_block_state_too_small); -#define POLY1305_ASM -#elif defined(OPENSSL_X86_64) -/* See comment above |__poly1305_block| in poly1305-x86_64.pl. */ -OPENSSL_COMPILE_ASSERT(POLY1305_BLOCK_STATE_SIZE >= - 4 * (5 + 1 + 2 * 2 + 2 + 4 * 9), - poly1305_block_state_too_small); -#define POLY1305_ASM -#elif defined(OPENSSL_ARM) -/* TODO(davidben): Figure out the layout of the struct. For now, - * |POLY1305_BLOCK_STATE_SIZE| is taken from OpenSSL. */ -#define POLY1305_ASM -#elif defined(OPENSSL_AARCH64) -/* TODO(davidben): Figure out the layout of the struct. For now, - * |POLY1305_BLOCK_STATE_SIZE| is taken from OpenSSL. 
*/ -#define POLY1305_ASM -#endif -#endif - -#if defined(POLY1305_ASM) +static void U32TO8_LE(uint8_t *m, uint32_t v) { memcpy(m, &v, sizeof(v)); } +#else +static uint32_t U8TO32_LE(const uint8_t *m) { + return (uint32_t)m[0] | (uint32_t)m[1] << 8 | (uint32_t)m[2] << 16 | + (uint32_t)m[3] << 24; +} -int poly1305_init(void *ctx, const uint8_t key[16], void *out_func); -void poly1305_blocks(void *ctx, const uint8_t *in, size_t len, - uint32_t padbit); -void poly1305_emit(void *ctx, uint8_t mac[16], const uint32_t nonce[4]); +static void U32TO8_LE(uint8_t *m, uint32_t v) { + m[0] = v; + m[1] = v >> 8; + m[2] = v >> 16; + m[3] = v >> 24; +} +#endif -#else +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } -struct poly1305_block_state_st { +struct poly1305_state_st { uint32_t r0, r1, r2, r3, r4; uint32_t s1, s2, s3, s4; uint32_t h0, h1, h2, h3, h4; + uint8_t buf[16]; + unsigned int buf_used; + uint8_t key[16]; }; -OPENSSL_COMPILE_ASSERT(POLY1305_BLOCK_STATE_SIZE >= - sizeof(struct poly1305_block_state_st), - poly1305_block_state_too_small); +/* poly1305_blocks updates |state| given some amount of input data. This + * function may only be called with a |len| that is not a multiple of 16 at the + * end of the data. Otherwise the input must be buffered into 16 byte blocks. */ +static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; -/* We can assume little-endian. */ -static void U32TO8_LE(uint8_t *m, uint32_t v) { memcpy(m, &v, sizeof(v)); } + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } -static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } +poly1305_donna_16bytes: + t0 = U8TO32_LE(in); + t1 = U8TO32_LE(in + 4); + t2 = U8TO32_LE(in + 8); + t3 = U8TO32_LE(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + +poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = 
(uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) { + goto poly1305_donna_16bytes; + } + +/* final bytes */ +poly1305_donna_atmost15bytes: + if (!len) { + return; + } + + for (j = 0; j < len; j++) { + mp[j] = in[j]; + } + mp[j++] = 1; + for (; j < 16; j++) { + mp[j] = 0; + } + len = 0; + + t0 = U8TO32_LE(mp + 0); + t1 = U8TO32_LE(mp + 4); + t2 = U8TO32_LE(mp + 8); + t3 = U8TO32_LE(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} -static int poly1305_init(void *ctx, const uint8_t key[16], void *out_func) { - struct poly1305_block_state_st *state = (struct poly1305_block_state_st *)ctx; +void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { + struct poly1305_state_st *state = (struct poly1305_state_st *)statep; uint32_t t0, t1, t2, t3; +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) + if (CRYPTO_is_NEON_capable()) { + CRYPTO_poly1305_init_neon(statep, key); + return; + } +#endif + t0 = U8TO32_LE(key + 0); t1 = U8TO32_LE(key + 4); t2 = U8TO32_LE(key + 8); @@ -137,80 +200,72 @@ static int poly1305_init(void *ctx, const uint8_t key[16], void *out_func) { state->h3 = 0; state->h4 = 0; - return 0; + state->buf_used = 0; + memcpy(state->key, key + 16, sizeof(state->key)); } -static void poly1305_blocks(void *ctx, const uint8_t *in, size_t len, - uint32_t padbit) { - struct poly1305_block_state_st *state = (struct poly1305_block_state_st *)ctx; - uint32_t t0, t1, t2, t3; - uint64_t t[5]; - uint32_t b; - uint64_t c; +void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, + size_t in_len) { + unsigned int i; + struct poly1305_state_st *state = (struct poly1305_state_st *)statep; + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) + if (CRYPTO_is_NEON_capable()) { + CRYPTO_poly1305_update_neon(statep, in, in_len); + return; + } +#endif + + if (state->buf_used) { + unsigned int todo = 16 - state->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (i = 0; i < todo; i++) { + state->buf[state->buf_used + i] = in[i]; + } + state->buf_used += todo; + in_len -= todo; + in += todo; + + if (state->buf_used == 16) { + poly1305_update(state, state->buf, 16); + state->buf_used = 0; + } + } + + if (in_len >= 16) { + size_t todo = in_len & ~0xf; + poly1305_update(state, in, todo); + in += todo; + in_len &= 0xf; + } - assert(len % 16 == 0); - assert(padbit != 0 || len == 16); - - while (len >= 16) { - t0 = U8TO32_LE(in); - t1 = U8TO32_LE(in + 4); - t2 = U8TO32_LE(in + 8); - t3 = U8TO32_LE(in + 12); - - in += 16; - len -= 16; - - state->h0 += t0 & 0x3ffffff; - state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; - state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; - state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; - state->h4 += (t3 >> 8) | (padbit << 24); - - t[0] = - mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + - mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + - mul32x32_64(state->h4, state->s1); - t[1] = - mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + - mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + - mul32x32_64(state->h4, state->s2); - t[2] = - mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + - mul32x32_64(state->h2, state->r0) + 
mul32x32_64(state->h3, state->s4) + - mul32x32_64(state->h4, state->s3); - t[3] = - mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + - mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + - mul32x32_64(state->h4, state->s4); - t[4] = - mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + - mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + - mul32x32_64(state->h4, state->r0); - - state->h0 = (uint32_t)t[0] & 0x3ffffff; - c = (t[0] >> 26); - t[1] += c; - state->h1 = (uint32_t)t[1] & 0x3ffffff; - b = (uint32_t)(t[1] >> 26); - t[2] += b; - state->h2 = (uint32_t)t[2] & 0x3ffffff; - b = (uint32_t)(t[2] >> 26); - t[3] += b; - state->h3 = (uint32_t)t[3] & 0x3ffffff; - b = (uint32_t)(t[3] >> 26); - t[4] += b; - state->h4 = (uint32_t)t[4] & 0x3ffffff; - b = (uint32_t)(t[4] >> 26); - state->h0 += b * 5; + if (in_len) { + for (i = 0; i < in_len; i++) { + state->buf[i] = in[i]; + } + state->buf_used = in_len; } } -static void poly1305_emit(void *ctx, uint8_t mac[16], const uint32_t nonce[4]) { - struct poly1305_block_state_st *state = (struct poly1305_block_state_st *)ctx; +void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { + struct poly1305_state_st *state = (struct poly1305_state_st *)statep; uint64_t f0, f1, f2, f3; uint32_t g0, g1, g2, g3, g4; uint32_t b, nb; +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) + if (CRYPTO_is_NEON_capable()) { + CRYPTO_poly1305_finish_neon(statep, mac); + return; + } +#endif + + if (state->buf_used) { + poly1305_update(state, state->buf, state->buf_used); + } + b = state->h0 >> 26; state->h0 = state->h0 & 0x3ffffff; state->h1 += b; @@ -249,10 +304,13 @@ static void poly1305_emit(void *ctx, uint8_t mac[16], const uint32_t nonce[4]) { state->h3 = (state->h3 & nb) | (g3 & b); state->h4 = (state->h4 & nb) | (g4 & b); - f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)nonce[0]; - f1 = ((state->h1 >> 6) | (state->h2 << 20)) + (uint64_t)nonce[1]; - f2 = ((state->h2 >> 12) | (state->h3 << 14)) + (uint64_t)nonce[2]; - f3 = ((state->h3 >> 18) | (state->h4 << 8)) + (uint64_t)nonce[3]; + f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); + f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)U8TO32_LE(&state->key[4]); + f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)U8TO32_LE(&state->key[8]); + f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)U8TO32_LE(&state->key[12]); U32TO8_LE(&mac[0], f0); f1 += (f0 >> 32); @@ -263,64 +321,4 @@ static void poly1305_emit(void *ctx, uint8_t mac[16], const uint32_t nonce[4]) { U32TO8_LE(&mac[12], f3); } -#endif /* !POLY1305_ASM */ - -void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { - struct poly1305_state_st *state = (struct poly1305_state_st *)statep; - - if (!poly1305_init(state->opaque, key, &state->func)) { - state->func.blocks = poly1305_blocks; - state->func.emit = poly1305_emit; - } - - state->buf_used = 0; - state->nonce[0] = U8TO32_LE(key + 16); - state->nonce[1] = U8TO32_LE(key + 20); - state->nonce[2] = U8TO32_LE(key + 24); - state->nonce[3] = U8TO32_LE(key + 28); -} - -void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, - size_t in_len) { - struct poly1305_state_st *state = (struct poly1305_state_st *)statep; - - if (state->buf_used != 0) { - unsigned todo = 16 - state->buf_used; - if (todo > in_len) { - todo = in_len; - } - memcpy(state->buf + state->buf_used, in, todo); - state->buf_used += todo; - in_len -= todo; - in += todo; 
- - if (state->buf_used == 16) { - state->func.blocks(state->opaque, state->buf, 16, 1 /* pad */); - state->buf_used = 0; - } - } - - if (in_len >= 16) { - size_t todo = in_len & ~0xf; - state->func.blocks(state->opaque, in, todo, 1 /* pad */); - in += todo; - in_len &= 0xf; - } - - if (in_len != 0) { - memcpy(state->buf, in, in_len); - state->buf_used = in_len; - } -} - -void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { - struct poly1305_state_st *state = (struct poly1305_state_st *)statep; - - if (state->buf_used != 0) { - state->buf[state->buf_used] = 1; - memset(state->buf + state->buf_used + 1, 0, 16 - state->buf_used - 1); - state->func.blocks(state->opaque, state->buf, 16, 0 /* already padded */); - } - - state->func.emit(state->opaque, mac, state->nonce); -} +#endif /* OPENSSL_WINDOWS || !OPENSSL_X86_64 */ diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c new file mode 100644 index 00000000..de31d6b2 --- /dev/null +++ b/crypto/poly1305/poly1305_arm.c @@ -0,0 +1,304 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* This implementation was taken from the public domain, neon2 version in + * SUPERCOP by D. J. Bernstein and Peter Schwabe. 
*/ + +#include + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) + +#include + +#include "../internal.h" +#include "internal.h" + + +typedef struct { + uint32_t v[12]; /* for alignment; only using 10 */ +} fe1305x2; + +#define addmulmod openssl_poly1305_neon2_addmulmod +#define blocks openssl_poly1305_neon2_blocks + +extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, + const fe1305x2 *c); + +extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in, + unsigned int inlen); + +static void freeze(fe1305x2 *r) { + int i; + + uint32_t x0 = r->v[0]; + uint32_t x1 = r->v[2]; + uint32_t x2 = r->v[4]; + uint32_t x3 = r->v[6]; + uint32_t x4 = r->v[8]; + uint32_t y0; + uint32_t y1; + uint32_t y2; + uint32_t y3; + uint32_t y4; + uint32_t swap; + + for (i = 0; i < 3; ++i) { + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + x0 += 5 * (x4 >> 26); + x4 &= 0x3ffffff; + } + + y0 = x0 + 5; + y1 = x1 + (y0 >> 26); + y0 &= 0x3ffffff; + y2 = x2 + (y1 >> 26); + y1 &= 0x3ffffff; + y3 = x3 + (y2 >> 26); + y2 &= 0x3ffffff; + y4 = x4 + (y3 >> 26); + y3 &= 0x3ffffff; + swap = -(y4 >> 26); + y4 &= 0x3ffffff; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + y0 &= swap; + y1 &= swap; + y2 &= swap; + y3 &= swap; + y4 &= swap; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + r->v[0] = y0; + r->v[2] = y1; + r->v[4] = y2; + r->v[6] = y3; + r->v[8] = y4; +} + +static void fe1305x2_tobytearray(uint8_t *r, fe1305x2 *x) { + uint32_t x0 = x->v[0]; + uint32_t x1 = x->v[2]; + uint32_t x2 = x->v[4]; + uint32_t x3 = x->v[6]; + uint32_t x4 = x->v[8]; + + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + + *(uint32_t *)r = x0 + (x1 << 26); + *(uint32_t *)(r + 4) = (x1 >> 6) + (x2 << 20); + *(uint32_t *)(r + 8) = (x2 >> 12) + (x3 << 14); + *(uint32_t *)(r + 12) = (x3 >> 18) + (x4 << 8); +} + +/* load32 exists to avoid breaking strict aliasing rules in + * fe1305x2_frombytearray. 
*/ +static uint32_t load32(uint8_t *t) { + uint32_t tmp; + memcpy(&tmp, t, sizeof(tmp)); + return tmp; +} + +static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, + unsigned long long xlen) { + unsigned i; + uint8_t t[17]; + + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + xlen -= i; + x += i; + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[0] = 0x3ffffff & load32(t); + r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[8] = load32(t + 13); + + if (xlen) { + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[1] = 0x3ffffff & load32(t); + r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[9] = load32(t + 13); + } else { + r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; + } +} + +static const alignas(16) fe1305x2 zero; + +struct poly1305_state_st { + uint8_t data[sizeof(fe1305x2[5]) + 128]; + uint8_t buf[32]; + unsigned int buf_used; + uint8_t key[16]; +}; + +void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + unsigned int j; + + r->v[1] = r->v[0] = 0x3ffffff & *(uint32_t *)key; + r->v[3] = r->v[2] = 0x3ffff03 & ((*(uint32_t *)(key + 3)) >> 2); + r->v[5] = r->v[4] = 0x3ffc0ff & ((*(uint32_t *)(key + 6)) >> 4); + r->v[7] = r->v[6] = 0x3f03fff & ((*(uint32_t *)(key + 9)) >> 6); + r->v[9] = r->v[8] = 0x00fffff & ((*(uint32_t *)(key + 12)) >> 8); + + for (j = 0; j < 10; j++) { + h->v[j] = 0; /* XXX: should fast-forward a bit */ + } + + addmulmod(precomp, r, r, &zero); /* precompute r^2 */ + addmulmod(precomp + 1, precomp, precomp, &zero); /* precompute r^4 */ + + memcpy(st->key, key + 16, 16); + st->buf_used = 0; +} + +void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in, + size_t in_len) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + unsigned int i; + + if (st->buf_used) { + unsigned int todo = 32 - st->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (i = 0; i < todo; i++) { + st->buf[st->buf_used + i] = in[i]; + } + st->buf_used += todo; + in_len -= todo; + in += todo; + + if (st->buf_used == sizeof(st->buf) && in_len) { + addmulmod(h, h, precomp, &zero); + fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); + for (i = 0; i < 10; i++) { + h->v[i] += c->v[i]; + } + st->buf_used = 0; + } + } + + while (in_len > 32) { + unsigned int tlen = 1048576; + if (in_len < tlen) { + tlen = in_len; + } + tlen -= blocks(h, precomp, in, tlen); + in_len -= tlen; + in += tlen; + } + + if (in_len) { + for (i = 0; i < in_len; i++) { + st->buf[i] = in[i]; + } + st->buf_used = in_len; + } +} + +void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + + addmulmod(h, h, precomp, 
&zero); + + if (st->buf_used > 16) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + precomp->v[1] = r->v[1]; + precomp->v[3] = r->v[3]; + precomp->v[5] = r->v[5]; + precomp->v[7] = r->v[7]; + precomp->v[9] = r->v[9]; + addmulmod(h, h, precomp, c); + } else if (st->buf_used > 0) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + r->v[1] = 1; + r->v[3] = 0; + r->v[5] = 0; + r->v[7] = 0; + r->v[9] = 0; + addmulmod(h, h, r, c); + } + + h->v[0] += h->v[1]; + h->v[2] += h->v[3]; + h->v[4] += h->v[5]; + h->v[6] += h->v[7]; + h->v[8] += h->v[9]; + freeze(h); + + fe1305x2_frombytearray(c, st->key, 16); + c->v[8] ^= (1 << 24); + + h->v[0] += c->v[0]; + h->v[2] += c->v[2]; + h->v[4] += c->v[4]; + h->v[6] += c->v[6]; + h->v[8] += c->v[8]; + fe1305x2_tobytearray(mac, h); +} + +#endif /* OPENSSL_ARM && !OPENSSL_NO_ASM */ diff --git a/crypto/poly1305/poly1305_arm_asm.S b/crypto/poly1305/poly1305_arm_asm.S new file mode 100644 index 00000000..e16f83bd --- /dev/null +++ b/crypto/poly1305/poly1305_arm_asm.S @@ -0,0 +1,2015 @@ +#if defined(__arm__) && !defined(OPENSSL_NO_ASM) + +# This implementation was taken from the public domain, neon2 version in +# SUPERCOP by D. J. Bernstein and Peter Schwabe. + +# qhasm: int32 input_0 + +# qhasm: int32 input_1 + +# qhasm: int32 input_2 + +# qhasm: int32 input_3 + +# qhasm: stack32 input_4 + +# qhasm: stack32 input_5 + +# qhasm: stack32 input_6 + +# qhasm: stack32 input_7 + +# qhasm: int32 caller_r4 + +# qhasm: int32 caller_r5 + +# qhasm: int32 caller_r6 + +# qhasm: int32 caller_r7 + +# qhasm: int32 caller_r8 + +# qhasm: int32 caller_r9 + +# qhasm: int32 caller_r10 + +# qhasm: int32 caller_r11 + +# qhasm: int32 caller_r12 + +# qhasm: int32 caller_r14 + +# qhasm: reg128 caller_q4 + +# qhasm: reg128 caller_q5 + +# qhasm: reg128 caller_q6 + +# qhasm: reg128 caller_q7 + +# qhasm: startcode +.fpu neon +.text + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 x01 + +# qhasm: reg128 x23 + +# qhasm: reg128 x4 + +# qhasm: reg128 y0 + +# qhasm: reg128 y12 + +# qhasm: reg128 y34 + +# qhasm: reg128 5y12 + +# qhasm: reg128 5y34 + +# qhasm: stack128 y0_stack + +# qhasm: stack128 y12_stack + +# qhasm: stack128 y34_stack + +# qhasm: stack128 5y12_stack + +# qhasm: stack128 5y34_stack + +# qhasm: reg128 z0 + +# qhasm: reg128 z12 + +# qhasm: reg128 z34 + +# qhasm: reg128 5z12 + +# qhasm: reg128 5z34 + +# qhasm: stack128 z0_stack + +# qhasm: stack128 z12_stack + +# qhasm: stack128 z34_stack + +# qhasm: stack128 5z12_stack + +# qhasm: stack128 5z34_stack + +# qhasm: stack128 two24 + +# qhasm: int32 ptr + +# qhasm: reg128 c01 + +# qhasm: reg128 c23 + +# qhasm: reg128 d01 + +# qhasm: reg128 d23 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 t2 + +# qhasm: reg128 t3 + +# qhasm: reg128 t4 + +# qhasm: reg128 mask + +# qhasm: reg128 u0 + +# qhasm: reg128 u1 + +# qhasm: reg128 u2 + +# qhasm: reg128 u3 + +# qhasm: reg128 u4 + +# qhasm: reg128 v01 + +# qhasm: reg128 mid + +# qhasm: reg128 v23 + +# qhasm: reg128 v4 + +# qhasm: int32 len + +# qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks +.align 4 +.global openssl_poly1305_neon2_blocks +.hidden openssl_poly1305_neon2_blocks +.type openssl_poly1305_neon2_blocks STT_FUNC +openssl_poly1305_neon2_blocks: +vpush {q4,q5,q6,q7} +mov r12,sp +sub sp,sp,#192 +bic sp,sp,#31 + +# qhasm: len = input_3 +# asm 1: mov 
>len=int32#4,len=r3,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[input_1=int32#2,input_1=r1,z12=reg128#5%bot->z12=reg128#5%top},[z12=d8->z12=d9},[z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[mask=reg128#7,#0xffffffff +# asm 2: vmov.i64 >mask=q6,#0xffffffff +vmov.i64 q6,#0xffffffff + +# qhasm: 2x u4 = 0xff +# asm 1: vmov.i64 >u4=reg128#8,#0xff +# asm 2: vmov.i64 >u4=q7,#0xff +vmov.i64 q7,#0xff + +# qhasm: x01 aligned= mem128[input_0];input_0+=16 +# asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[x01=d16->x01=d17},[x23=reg128#10%bot->x23=reg128#10%top},[x23=d18->x23=d19},[input_0=int32#1,input_0=r0,>=6 +# asm 1: vshr.u64 >mask=reg128#7,mask=q6,>= 7 +# asm 1: vshr.u64 >u4=reg128#8,u4=q7,5y12=reg128#12,5y12=q11,5y34=reg128#13,5y34=q12,5y12=reg128#12,<5y12=reg128#12,5y12=q11,<5y12=q11,5y34=reg128#13,<5y34=reg128#13,5y34=q12,<5y34=q12,u4=reg128#8,u4=q7,5z12=reg128#14,5z12=q13,5z34=reg128#15,5z34=q14,5z12=reg128#14,<5z12=reg128#14,5z12=q13,<5z12=q13,5z34=reg128#15,<5z34=reg128#15,5z34=q14,<5z34=q14,ptr=int32#2,ptr=r1,r4=reg128#16,r4=q15,r0=reg128#8,r0=q7,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] +add r1,sp,#64 + +# qhasm: mem128[ptr] aligned= 5y12 +# asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[ptr=int32#2,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] +add r1,sp,#80 + +# qhasm: mem128[ptr] aligned= 5y34 +# asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[ptr=int32#2,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] +add r1,sp,#144 + +# qhasm: mem128[ptr] aligned= 5z12 +# asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[ptr=int32#2,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] +add r1,sp,#160 + +# qhasm: mem128[ptr] aligned= 5z34 +# asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[? 
len - 64 +# asm 1: cmp +bls ._below64bytes + +# qhasm: input_2 += 32 +# asm 1: add >input_2=int32#2,input_2=r1,c01=reg128#1%bot->c01=reg128#1%top},[c01=d0->c01=d1},[c23=reg128#2%bot->c23=reg128#2%top},[c23=d2->c23=d3},[ptr=int32#3,ptr=r2,z12=reg128#3%bot->z12=reg128#3%top},[z12=d4->z12=d5},[ptr=int32#3,ptr=r2,z0=reg128#4%bot->z0=reg128#4%top},[z0=d6->z0=d7},[r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,ptr=int32#3,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] +add r2,sp,#160 + +# qhasm: 5z34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[5z34=d10->5z34=d11},[r0=reg128#8,r0=q7,r2=reg128#14,r2=q13,d01=reg128#12%bot->d01=reg128#12%top},[d01=d22->d01=d23},[r1=reg128#15,r1=q14,ptr=int32#3,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] +add r2,sp,#144 + +# qhasm: 5z12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[5z12=d0->5z12=d1},[d23=reg128#2%bot->d23=reg128#2%top},[d23=d2->d23=d3},[input_2=int32#2,input_2=r1,> 40 +# asm 1: vshr.u64 >v4=reg128#4,v4=q3,> 14; v23[3] = d23[2,3] unsigned>> 14 +# asm 1: vshrn.u64 > 26; v01[3] = d01[2,3] unsigned>> 26 +# asm 1: vshrn.u64 > 20; v23[1] = mid[2,3] unsigned>> 20 +# asm 1: vshrn.u64 ptr=int32#3,ptr=r2,y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[ptr=int32#3,ptr=r2,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[ptr=int32#3,ptr=r2,y0=reg128#1%bot->y0=reg128#1%top},[y0=d0->y0=d1},[ptr=int32#3,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] +add r2,sp,#80 + +# qhasm: 5y34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[5y34=d24->5y34=d25},[ptr=int32#3,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] +add r2,sp,#64 + +# qhasm: 5y12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[5y12=d22->5y12=d23},[ptr=int32#3,ptr=r2,> 26 +# asm 1: vshr.u64 >t1=reg128#4,t1=q3,len=int32#4,len=r3,r0=reg128#6,r0=q5,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#5,r3=q4,x4=reg128#8,x4=q7,r4=reg128#16%bot->r4=reg128#16%top},[r4=d30->r4=d31},[> 26 +# asm 1: vshr.u64 >t2=reg128#9,t2=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t0=reg128#10,t0=q9,r2=reg128#9,r2=q8,x4=reg128#11,x4=q10,x01=reg128#6,x01=q5,r0=reg128#8%bot->r0=reg128#8%top},[r0=d14->r0=d15},[ptr=int32#3,ptr=r2,t0=reg128#10,t0=q9,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,x01=reg128#15,x01=q14,z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[x23=reg128#10,x23=q9,r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,> 26 +# asm 1: vshr.u64 >t1=reg128#14,t1=q13,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#14,t4=q13,r3=reg128#5,r3=q4,x4=reg128#11,x4=q10,? len - 64 +# asm 1: cmp +bhi ._mainloop2 + +# qhasm: input_2 -= 32 +# asm 1: sub >input_2=int32#3,input_2=r2,? 
len - 32 +# asm 1: cmp +bls ._end + +# qhasm: mainloop: +._mainloop: + +# qhasm: new r0 + +# qhasm: ptr = &two24 +# asm 1: lea >ptr=int32#2,ptr=r1,r4=reg128#5%bot->r4=reg128#5%top},[r4=d8->r4=d9},[u4=reg128#6%bot->u4=reg128#6%top},[u4=d10->u4=d11},[c01=reg128#8%bot->c01=reg128#8%top},[c01=d14->c01=d15},[c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[r0=reg128#4,r0=q3,r3=reg128#6,r3=q5,r1=reg128#14,r1=q13,r2=reg128#8,r2=q7,> 26 +# asm 1: vshr.u64 >t1=reg128#9,t1=q8,r0=reg128#4,r0=q3,r1=reg128#9,r1=q8,> 26 +# asm 1: vshr.u64 >t4=reg128#10,t4=q9,r3=reg128#6,r3=q5,r4=reg128#5,r4=q4,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#11,r1=q10,> 26 +# asm 1: vshr.u64 >t0=reg128#9,t0=q8,r2=reg128#8,r2=q7,r4=reg128#5,r4=q4,r0=reg128#4,r0=q3,t0=reg128#9,t0=q8,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#6,r3=q5,> 26 +# asm 1: vshr.u64 >t1=reg128#8,t1=q7,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#6,r3=q5,x4=reg128#11,x4=q10,len=int32#4,len=r3,? len - 32 +# asm 1: cmp +bhi ._mainloop + +# qhasm: end: +._end: + +# qhasm: mem128[input_0] = x01;input_0+=16 +# asm 1: vst1.8 {len=int32#1,len=r0,mask=reg128#1,#0xffffffff +# asm 2: vmov.i64 >mask=q0,#0xffffffff +vmov.i64 q0,#0xffffffff + +# qhasm: y01 aligned= mem128[input_2];input_2+=16 +# asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[y01=d2->y01=d3},[_5y01=reg128#3,_5y01=q2,y23=reg128#4%bot->y23=reg128#4%top},[y23=d6->y23=d7},[_5y23=reg128#9,_5y23=q8,_5y4=reg128#11,_5y4=q10,x01=reg128#12%bot->x01=reg128#12%top},[x01=d22->x01=d23},[_5y01=reg128#3,<_5y01=reg128#3,_5y01=q2,<_5y01=q2,x23=reg128#13%bot->x23=reg128#13%top},[x23=d24->x23=d25},[_5y23=reg128#9,<_5y23=reg128#9,_5y23=q8,<_5y23=q8,_5y4=reg128#11,<_5y4=reg128#11,_5y4=q10,<_5y4=q10,c01=reg128#14%bot->c01=reg128#14%top},[c01=d26->c01=d27},[x01=reg128#12,x01=q11,c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[x23=reg128#13,x23=q12,>=6 +# asm 1: vshr.u64 >mask=reg128#1,mask=q0,x4=reg128#14,x4=q13,r0=reg128#15,r0=q14,r1=reg128#3,r1=q2,r2=reg128#16,r2=q15,r3=reg128#9,r3=q8,r4=reg128#10,r4=q9,> 26 +# asm 1: vshr.u64 >t1=reg128#2,t1=q1,r0=reg128#4,r0=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#3,t4=q2,r3=reg128#9,r3=q8,r4=reg128#3,r4=q2,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t0=reg128#11,t0=q10,r2=reg128#10,r2=q9,r4=reg128#3,r4=q2,r0=reg128#4,r0=q3,t0=reg128#11,t0=q10,> 26 +# asm 1: vshr.u64 >t3=reg128#12,t3=q11,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#9,r3=q8,> 26 +# asm 1: vshr.u64 >t1=reg128#11,t1=q10,x01=reg128#4,x01=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#11,t4=q10,r3=reg128#1,r3=q0,x4=reg128#3,x4=q2, + +#include "../internal.h" + + +#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64) + +#include + +#define U8TO64_LE(m) (*(const uint64_t *)(m)) +#define U8TO32_LE(m) (*(const uint32_t *)(m)) +#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v + +typedef __m128i xmmi; + +static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = { + (1 << 26) - 1, 0, (1 << 26) - 1, 0}; +static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0}; +static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = { + (1 << 24), 0, (1 << 24), 0}; + +static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; } + +static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; } + +static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) { + return (uint128_t)a * b; 
+} + +static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; } + +static inline uint64_t shr128(uint128_t v, const int shift) { + return (uint64_t)(v >> shift); +} + +static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) { + return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift); +} + +typedef struct poly1305_power_t { + union { + xmmi v; + uint64_t u[2]; + uint32_t d[4]; + } R20, R21, R22, R23, R24, S21, S22, S23, S24; +} poly1305_power; + +typedef struct poly1305_state_internal_t { + poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 + bytes of free storage */ + union { + xmmi H[5]; /* 80 bytes */ + uint64_t HH[10]; + }; + /* uint64_t r0,r1,r2; [24 bytes] */ + /* uint64_t pad0,pad1; [16 bytes] */ + uint64_t started; /* 8 bytes */ + uint64_t leftover; /* 8 bytes */ + uint8_t buffer[64]; /* 64 bytes */ +} poly1305_state_internal; /* 448 bytes total + 63 bytes for + alignment = 511 bytes raw */ + +static inline poly1305_state_internal *poly1305_aligned_state( + poly1305_state *state) { + return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63); +} + +/* copy 0-63 bytes */ +static inline void +poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) { + size_t offset = src - dst; + if (bytes & 32) { + _mm_storeu_si128((xmmi *)(dst + 0), + _mm_loadu_si128((const xmmi *)(dst + offset + 0))); + _mm_storeu_si128((xmmi *)(dst + 16), + _mm_loadu_si128((const xmmi *)(dst + offset + 16))); + dst += 32; + } + if (bytes & 16) { + _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((const xmmi *)(dst + offset))); + dst += 16; + } + if (bytes & 8) { + *(uint64_t *)dst = *(const uint64_t *)(dst + offset); + dst += 8; + } + if (bytes & 4) { + *(uint32_t *)dst = *(const uint32_t *)(dst + offset); + dst += 4; + } + if (bytes & 2) { + *(uint16_t *)dst = *(uint16_t *)(dst + offset); + dst += 2; + } + if (bytes & 1) { + *(uint8_t *)dst = *(uint8_t *)(dst + offset); + } +} + +/* zero 0-15 bytes */ +static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) { + if (bytes & 8) { + *(uint64_t *)dst = 0; + dst += 8; + } + if (bytes & 4) { + *(uint32_t *)dst = 0; + dst += 4; + } + if (bytes & 2) { + *(uint16_t *)dst = 0; + dst += 2; + } + if (bytes & 1) { + *(uint8_t *)dst = 0; + } +} + +static inline size_t poly1305_min(size_t a, size_t b) { + return (a < b) ? 
a : b; +} + +void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + poly1305_power *p; + uint64_t r0, r1, r2; + uint64_t t0, t1; + + /* clamp key */ + t0 = U8TO64_LE(key + 0); + t1 = U8TO64_LE(key + 8); + r0 = t0 & 0xffc0fffffff; + t0 >>= 44; + t0 |= t1 << 20; + r1 = t0 & 0xfffffc0ffff; + t1 >>= 24; + r2 = t1 & 0x00ffffffc0f; + + /* store r in un-used space of st->P[1] */ + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = (uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + + /* store pad */ + p->R23.d[1] = U8TO32_LE(key + 16); + p->R23.d[3] = U8TO32_LE(key + 20); + p->R24.d[1] = U8TO32_LE(key + 24); + p->R24.d[3] = U8TO32_LE(key + 28); + + /* H = 0 */ + st->H[0] = _mm_setzero_si128(); + st->H[1] = _mm_setzero_si128(); + st->H[2] = _mm_setzero_si128(); + st->H[3] = _mm_setzero_si128(); + st->H[4] = _mm_setzero_si128(); + + st->started = 0; + st->leftover = 0; +} + +static void poly1305_first_block(poly1305_state_internal *st, + const uint8_t *m) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + xmmi T5, T6; + poly1305_power *p; + uint128_t d[3]; + uint64_t r0, r1, r2; + uint64_t r20, r21, r22, s22; + uint64_t pad0, pad1; + uint64_t c; + uint64_t i; + + /* pull out stored info */ + p = &st->P[1]; + + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; + pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + + /* compute powers r^2,r^4 */ + r20 = r0; + r21 = r1; + r22 = r2; + for (i = 0; i < 2; i++) { + s22 = r22 * (5 << 2); + + d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22)); + d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21)); + d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20)); + + r20 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + r21 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + r22 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + r20 += c * 5; + c = (r20 >> 44); + r20 = r20 & 0xfffffffffff; + r21 += c; + + p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R21.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R22.v = + _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R23.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))), + _MM_SHUFFLE(1, 0, 1, 0)); + p->S21.v = _mm_mul_epu32(p->R21.v, FIVE); + p->S22.v = _mm_mul_epu32(p->R22.v, FIVE); + p->S23.v = _mm_mul_epu32(p->R23.v, FIVE); + p->S24.v = _mm_mul_epu32(p->R24.v, FIVE); + p--; + } + + /* put saved info back */ + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = 
(uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + p->R23.d[1] = (uint32_t)(pad0); + p->R23.d[3] = (uint32_t)(pad0 >> 32); + p->R24.d[1] = (uint32_t)(pad1); + p->R24.d[3] = (uint32_t)(pad1 >> 32); + + /* H = [Mx,My] */ + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + st->H[0] = _mm_and_si128(MMASK, T5); + st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + st->H[2] = _mm_and_si128(MMASK, T5); + st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); +} + +static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi M0, M1, M2, M3, M4; + xmmi C1, C2; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + while (bytes >= 64) { + /* H *= [r^4,r^4] */ + p = &st->P[0]; + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + /* H += [Mx,My]*[r^2,r^2] */ + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + p = &st->P[1]; + T5 = _mm_mul_epu32(M0, p->R20.v); + T6 = _mm_mul_epu32(M0, p->R21.v); + T0 = _mm_add_epi64(T0, T5); + T1 = 
_mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M1, p->S24.v); + T6 = _mm_mul_epu32(M1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M2, p->S23.v); + T6 = _mm_mul_epu32(M2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M3, p->S22.v); + T6 = _mm_mul_epu32(M3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M4, p->S21.v); + T6 = _mm_mul_epu32(M4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M0, p->R22.v); + T6 = _mm_mul_epu32(M0, p->R23.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M1, p->R21.v); + T6 = _mm_mul_epu32(M1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M2, p->R20.v); + T6 = _mm_mul_epu32(M2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M3, p->S24.v); + T6 = _mm_mul_epu32(M3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M4, p->S23.v); + T6 = _mm_mul_epu32(M4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M0, p->R24.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + /* H += [Mx,My] */ + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)), + _mm_loadl_epi64((const xmmi *)(m + 48))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)), + _mm_loadl_epi64((const xmmi *)(m + 56))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + /* reduce */ + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */ + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + m += 64; + bytes -= 64; + } + + st->H[0] = H0; + st->H[1] = H1; + st->H[2] = H2; + st->H[3] = H3; + st->H[4] = H4; +} + +static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi M0, 
M1, M2, M3, M4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi C1, C2; + + uint64_t r0, r1, r2; + uint64_t t0, t1, t2, t3, t4; + uint64_t c; + size_t consumed = 0; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + /* p = [r^2,r^2] */ + p = &st->P[1]; + + if (bytes >= 32) { + /* H *= [r^2,r^2] */ + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + /* H += [Mx,My] */ + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + /* reduce */ + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + /* H = (H*[r^2,r^2] + [Mx,My]) */ + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + consumed = 32; + } + + /* finalize, H *= [r^2,r] */ + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + 
+ p->R20.d[2] = (uint32_t)(r0)&0x3ffffff; + p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; + p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff; + p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; + p->R24.d[2] = (uint32_t)((r2 >> 16)); + p->S21.d[2] = p->R21.d[2] * 5; + p->S22.d[2] = p->R22.d[2] * 5; + p->S23.d[2] = p->R23.d[2] * 5; + p->S24.d[2] = p->R24.d[2] * 5; + + /* H *= [r^2,r] */ + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + /* H = H[0]+H[1] */ + H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); + H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); + H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); + H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); + H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); + + t0 = _mm_cvtsi128_si32(H0); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = _mm_cvtsi128_si32(H1) + c; + c = (t1 >> 26); + t1 &= 0x3ffffff; + t2 = _mm_cvtsi128_si32(H2) + c; + c = (t2 >> 26); + t2 &= 0x3ffffff; + t3 = _mm_cvtsi128_si32(H3) + c; + c = (t3 >> 26); + t3 &= 0x3ffffff; + t4 = _mm_cvtsi128_si32(H4) + c; + c = (t4 >> 26); + t4 &= 0x3ffffff; + t0 = t0 + (c * 5); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = t1 + c; + + st->HH[0] = ((t0) | (t1 << 26)) & 0xfffffffffffull; + st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & 0xfffffffffffull; + st->HH[2] = ((t3 >> 10) | (t4 << 16)) & 0x3ffffffffffull; + + return consumed; +} + +void CRYPTO_poly1305_update(poly1305_state 
*state, const uint8_t *m, + size_t bytes) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t want; + + /* need at least 32 initial bytes to start the accelerated branch */ + if (!st->started) { + if ((st->leftover == 0) && (bytes > 32)) { + poly1305_first_block(st, m); + m += 32; + bytes -= 32; + } else { + want = poly1305_min(32 - st->leftover, bytes); + poly1305_block_copy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if ((st->leftover < 32) || (bytes == 0)) { + return; + } + poly1305_first_block(st, st->buffer); + st->leftover = 0; + } + st->started = 1; + } + + /* handle leftover */ + if (st->leftover) { + want = poly1305_min(64 - st->leftover, bytes); + poly1305_block_copy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if (st->leftover < 64) { + return; + } + poly1305_blocks(st, st->buffer, 64); + st->leftover = 0; + } + + /* process 64 byte blocks */ + if (bytes >= 64) { + want = (bytes & ~63); + poly1305_blocks(st, m, want); + m += want; + bytes -= want; + } + + if (bytes) { + poly1305_block_copy(st->buffer + st->leftover, m, bytes); + st->leftover += bytes; + } +} + +void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t leftover = st->leftover; + uint8_t *m = st->buffer; + uint128_t d[3]; + uint64_t h0, h1, h2; + uint64_t t0, t1; + uint64_t g0, g1, g2, c, nc; + uint64_t r0, r1, r2, s1, s2; + poly1305_power *p; + + if (st->started) { + size_t consumed = poly1305_combine(st, m, leftover); + leftover -= consumed; + m += consumed; + } + + /* st->HH will either be 0 or have the combined result */ + h0 = st->HH[0]; + h1 = st->HH[1]; + h2 = st->HH[2]; + + p = &st->P[1]; + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + s1 = r1 * (5 << 2); + s2 = r2 * (5 << 2); + + if (leftover < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_atleast16bytes: + t0 = U8TO64_LE(m + 0); + t1 = U8TO64_LE(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24) | ((uint64_t)1 << 40); + +poly1305_donna_mul: + d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), + mul64x64_128(h2, s1)); + d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), + mul64x64_128(h2, s2)); + d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), + mul64x64_128(h2, r0)); + h0 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + h1 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + h2 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + h0 += c * 5; + + m += 16; + leftover -= 16; + if (leftover >= 16) { + goto poly1305_donna_atleast16bytes; + } + +/* final bytes */ +poly1305_donna_atmost15bytes: + if (!leftover) { + goto poly1305_donna_finish; + } + + m[leftover++] = 1; + poly1305_block_zero(m + leftover, 16 - leftover); + leftover = 16; + + t0 = U8TO64_LE(m + 0); + t1 = U8TO64_LE(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24); + + goto poly1305_donna_mul; + +poly1305_donna_finish: + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += c; + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + + g0 = h0 + 5; + 
c = (g0 >> 44); + g0 &= 0xfffffffffff; + g1 = h1 + c; + c = (g1 >> 44); + g1 &= 0xfffffffffff; + g2 = h2 + c - ((uint64_t)1 << 42); + + c = (g2 >> 63) - 1; + nc = ~c; + h0 = (h0 & nc) | (g0 & c); + h1 = (h1 & nc) | (g1 & c); + h2 = (h2 & nc) | (g2 & c); + + /* pad */ + t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; + t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + h0 += (t0 & 0xfffffffffff); + c = (h0 >> 44); + h0 &= 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += (t0 & 0xfffffffffff) + c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + t1 = (t1 >> 24); + h2 += (t1)+c; + + U64TO8_LE(mac + 0, ((h0) | (h1 << 44))); + U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24))); +} + +#endif /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */ diff --git a/include/openssl/poly1305.h b/include/openssl/poly1305.h index d42d6ce4..b4e23e29 100644 --- a/include/openssl/poly1305.h +++ b/include/openssl/poly1305.h @@ -22,10 +22,7 @@ extern "C" { #endif -typedef union { - double align; - uint8_t bytes[512]; -} poly1305_state; +typedef uint8_t poly1305_state[512]; /* CRYPTO_poly1305_init sets up |state| so that it can be used to calculate an * authentication tag with the one-time key |key|. Note that |key| is a diff --git a/util/generate_build_files.py b/util/generate_build_files.py index e6313ec4..bc5eab9f 100644 --- a/util/generate_build_files.py +++ b/util/generate_build_files.py @@ -40,6 +40,7 @@ OS_ARCH_COMBOS = [ NON_PERL_FILES = { ('linux', 'arm'): [ 'src/crypto/curve25519/asm/x25519-asm-arm.S', + 'src/crypto/poly1305/poly1305_arm_asm.S', ], ('linux', 'x86_64'): [ 'src/crypto/curve25519/asm/x25519-asm-x86_64.S',
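
For reference, the public interface touched by this revert (include/openssl/poly1305.h above) keeps the same shape on both sides of the change: callers drive a poly1305_state through init/update/finish, regardless of which backend (the portable C code, the NEON path, or the SSE2 path) ends up compiled in. Below is a minimal caller sketch, assuming nothing beyond the declarations shown in this patch; the helper name poly1305_tag is illustrative and not part of the library.

#include <stddef.h>
#include <stdint.h>

#include <openssl/poly1305.h>

/* poly1305_tag computes the 16-byte Poly1305 tag of |msg| under the 32-byte
 * one-time |key| using the streaming API from include/openssl/poly1305.h.
 * CRYPTO_poly1305_update may be called any number of times before finish. */
static void poly1305_tag(uint8_t mac[16], const uint8_t key[32],
                         const uint8_t *msg, size_t msg_len) {
  poly1305_state state;

  CRYPTO_poly1305_init(&state, key);
  CRYPTO_poly1305_update(&state, msg, msg_len);
  CRYPTO_poly1305_finish(&state, mac);
}

As the header comment above notes, |key| is a one-time key: a given 32-byte key must only ever be used to authenticate a single message.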