diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 463febbf..b459263c 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -8,6 +8,7 @@ if(${ARCH} STREQUAL "x86_64")
     aesni-x86_64.${ASM_EXT}
     aes-x86_64.${ASM_EXT}
     bsaes-x86_64.${ASM_EXT}
+    ghash-ssse3-x86_64.${ASM_EXT}
     ghash-x86_64.${ASM_EXT}
     md5-x86_64.${ASM_EXT}
     p256-x86_64-asm.${ASM_EXT}
@@ -96,6 +97,7 @@ perlasm(co-586.${ASM_EXT} bn/asm/co-586.pl)
 perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl)
 perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl)
 perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl)
+perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl)
 perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl)
 perlasm(ghash-x86.${ASM_EXT} modes/asm/ghash-x86.pl)
 perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c
index f7c145b9..81c74cbf 100644
--- a/crypto/fipsmodule/cipher/e_aes.c
+++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -46,6 +46,7 @@
  * OF THE POSSIBILITY OF SUCH DAMAGE.
  * ==================================================================== */
 
+#include <assert.h>
 #include <string.h>
 
 #include <openssl/aead.h>
@@ -84,13 +85,13 @@ typedef struct {
 } EVP_AES_KEY;
 
 typedef struct {
+  GCM128_CONTEXT gcm;
   union {
     double align;
     AES_KEY ks;
   } ks;         // AES key schedule to use
   int key_set;  // Set if key initialised
   int iv_set;   // Set if an iv is set
-  GCM128_CONTEXT gcm;
   uint8_t *iv;  // Temporary IV store
   int ivlen;    // IV length
   int taglen;
@@ -257,9 +258,37 @@ ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
   return NULL;
 }
 
+#if defined(OPENSSL_32_BIT)
+#define EVP_AES_GCM_CTX_PADDING (4+8)
+#else
+#define EVP_AES_GCM_CTX_PADDING 8
+#endif
+
+static EVP_AES_GCM_CTX *aes_gcm_from_cipher_ctx(EVP_CIPHER_CTX *ctx) {
+#if defined(__GNUC__) || defined(__clang__)
+  OPENSSL_STATIC_ASSERT(
+      alignof(EVP_AES_GCM_CTX) <= 16,
+      "EVP_AES_GCM_CTX needs more alignment than this function provides");
+#endif
+
+  // |malloc| guarantees up to 4-byte alignment on 32-bit and 8-byte alignment
+  // on 64-bit systems, so we need to adjust to reach 16-byte alignment.
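+  // For example, on 64-bit systems |cipher_data| is either 0 or 8 bytes past
+  // a 16-byte boundary; adding |ptr & 8| below lands it on one, and the extra
+  // |EVP_AES_GCM_CTX_PADDING| bytes in |ctx_size| leave room for that bump.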
+  assert(ctx->cipher->ctx_size ==
+         sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING);
+
+  char *ptr = ctx->cipher_data;
+#if defined(OPENSSL_32_BIT)
+  assert((uintptr_t)ptr % 4 == 0);
+  ptr += (uintptr_t)ptr & 4;
+#endif
+  assert((uintptr_t)ptr % 8 == 0);
+  ptr += (uintptr_t)ptr & 8;
+  return (EVP_AES_GCM_CTX *)ptr;
+}
+
 static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
                             const uint8_t *iv, int enc) {
-  EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
   if (!iv && !key) {
     return 1;
   }
@@ -290,7 +319,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
 }
 
 static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) {
-  EVP_AES_GCM_CTX *gctx = c->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
   OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
   if (gctx->iv != c->iv) {
     OPENSSL_free(gctx->iv);
@@ -314,7 +343,7 @@ static void ctr64_inc(uint8_t *counter) {
 }
 
 static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {
-  EVP_AES_GCM_CTX *gctx = c->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
   switch (type) {
     case EVP_CTRL_INIT:
       gctx->key_set = 0;
@@ -406,7 +435,7 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {
 
     case EVP_CTRL_COPY: {
       EVP_CIPHER_CTX *out = ptr;
-      EVP_AES_GCM_CTX *gctx_out = out->cipher_data;
+      EVP_AES_GCM_CTX *gctx_out = aes_gcm_from_cipher_ctx(out);
       if (gctx->iv == c->iv) {
         gctx_out->iv = out->iv;
       } else {
@@ -426,7 +455,7 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {
 
 static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
                           size_t len) {
-  EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
 
   // If not set up, return error
   if (!gctx->key_set) {
@@ -540,7 +569,7 @@ DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_gcm_generic) {
   out->block_size = 1;
   out->key_len = 16;
   out->iv_len = 12;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
   out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
                EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
                EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
@@ -608,7 +637,7 @@ DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_gcm_generic) {
   out->block_size = 1;
   out->key_len = 24;
   out->iv_len = 12;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
   out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
                EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
                EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
@@ -676,7 +705,7 @@ DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_gcm_generic) {
   out->block_size = 1;
   out->key_len = 32;
   out->iv_len = 12;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
   out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
                EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
                EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
diff --git a/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
new file mode 100644
index 00000000..830381b1
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
@@ -0,0 +1,413 @@
+#!/usr/bin/env perl
+# Copyright (c) 2019, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
+# table-based GHASH implementation. It requires SSSE3 instructions.
+#
+# For background, the table-based strategy is a 4-bit windowed multiplication.
+# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
+# over 4-bit windows of the input and indexes them up into the table. Visually,
+# it multiplies as in the schoolbook multiplication diagram below, but with
+# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
+# it incorporates the terms labeled '1' by indexing the most significant term
+# of X into the table. Then it shifts and repeats for '2' and so on.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        666666
+#       555555
+#      444444
+#     333333
+#    222222
+#   111111
+#
+# This implementation changes the order. We treat the table as a 16×16 matrix
+# and transpose it. The first row is then the first byte of each multiple of H,
+# and so on. We then reorder terms as below. Observe that the terms labeled '1'
+# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
+# pshufb instruction, using alternating terms of X in parallel as indices. This
+# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
+# repeat for each row.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        224466
+#       113355
+#      224466
+#     113355
+#    224466
+#   113355
+#
+# Next we account for GCM's confusing bit order. The "first" bit is the least
+# significant coefficient, but GCM treats the most significant bit within a
+# byte as first. Bytes are little-endian, and bits are big-endian. We reverse
+# the bytes in XMM registers for a consistent bit and byte ordering, but this
+# means the least significant bit is the most significant coefficient and vice
+# versa.
+#
+# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
+# bit ordering within the XMM register, rather than the reversed coefficient
+# ordering. Low bits are less significant bits and more significant
+# coefficients. Right-shifts move from the MSB to the LSB and correspond to
+# increasing the power of each coefficient.
+#
+# Note this bit reversal enters into the table's column indices. H*1 is stored
+# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
+# table rows contain more significant coefficients, so we iterate forwards.
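+#
+# As an illustrative sketch (ad-hoc pseudocode, not code from this file), one
+# row step of the loops below behaves as follows, where |acc| and |carry| are
+# 128-bit values and Row[i][n] denotes the pshufb lookup of byte i of each
+# multiple of H selected by the 4-bit windows n:
+#
+#   carry  = (carry >> 8) | (acc << 120)   # shift (acc, carry) right by 8 bits
+#   acc    =  acc >> 8
+#   acc   ^=  Row[i][high windows]         # added without shifting
+#   t      =  Row[i][low windows]          # must be right-shifted by 4 bits
+#   carry ^= (t & 0xf) << 124              # bits that fall off the right shift
+#   acc   ^=  t >> 4
+#
+# Every few rows the carry register is folded back into acc with the reduction
+# polynomial, as process_rows does below.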
+
+use strict;
+
+my $flavour = shift;
+my $output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64 = 0;
+$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
+                                        ("%rdi", "%rsi", "%rdx", "%rcx");
+
+
+my $code = <<____;
+.text
+
+# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
+# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
+# formatted as described above.
+# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+.type gcm_gmult_ssse3, \@abi-omnipotent
+.globl gcm_gmult_ssse3
+.align 16
+gcm_gmult_ssse3:
+.cfi_startproc
+.Lgmult_seh_begin:
+____
+$code .= <<____ if ($win64);
+        subq \$40, %rsp
+.Lgmult_seh_allocstack:
+        movdqa %xmm6, (%rsp)
+.Lgmult_seh_save_xmm6:
+        movdqa %xmm10, 16(%rsp)
+.Lgmult_seh_save_xmm10:
+.Lgmult_seh_prolog_end:
+____
+$code .= <<____;
+        movdqu ($Xi), %xmm0
+        movdqa .Lreverse_bytes(%rip), %xmm10
+        movdqa .Llow4_mask(%rip), %xmm2
+
+        # Reverse input bytes to deserialize.
+        pshufb %xmm10, %xmm0
+
+        # Split each byte into low (%xmm0) and high (%xmm1) halves.
+        movdqa %xmm2, %xmm1
+        pandn %xmm0, %xmm1
+        psrld \$4, %xmm1
+        pand %xmm2, %xmm0
+
+        # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
+        # that, due to bit reversal, %xmm3 contains bits that fall off when
+        # right-shifting, not left-shifting.
+        pxor %xmm2, %xmm2
+        pxor %xmm3, %xmm3
+____
+
+my $call_counter = 0;
+# process_rows returns assembly code to process $rows rows of the table. On
+# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
+# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
+# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
+# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
+sub process_rows {
+  my ($rows) = @_;
+  $call_counter++;
+
+  # Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
+  # and psrlq shifts the two 64-bit halves separately. Each row produces 8
+  # bits of carry, and the reduction needs an additional 7-bit shift. This
+  # must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
+  # at a time.
+  die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
+
+  return <<____;
+        movq \$$rows, %rax
+.Loop_row_$call_counter:
+        movdqa ($Htable), %xmm4
+        leaq 16($Htable), $Htable
+
+        # Right-shift %xmm2 and %xmm3 by 8 bits.
+        movdqa %xmm2, %xmm6
+        palignr \$1, %xmm3, %xmm6
+        movdqa %xmm6, %xmm3
+        psrldq \$1, %xmm2
+
+        # Load the next table row and index the low and high bits of the input.
+        # Note the low (respectively, high) half corresponds to more
+        # (respectively, less) significant coefficients.
+        movdqa %xmm4, %xmm5
+        pshufb %xmm0, %xmm4
+        pshufb %xmm1, %xmm5
+
+        # Add the high half (%xmm5) without shifting.
+        pxor %xmm5, %xmm2
+
+        # Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
+        # add into the carry register (%xmm3).
+        movdqa %xmm4, %xmm5
+        psllq \$60, %xmm5
+        movdqa %xmm5, %xmm6
+        pslldq \$8, %xmm6
+        pxor %xmm6, %xmm3
+
+        # Next, add into %xmm2.
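+        # %xmm5 still holds, in bits 60-63 of each 64-bit half, the four bits
+        # of %xmm4 that a per-half shift would drop. The psrldq moves the high
+        # half's copy down into bits 60-63 of the low half, which is where a
+        # full 128-bit right-shift by 4 would place those bits, and the psrlq
+        # then performs the remaining shift within each half.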
+        psrldq \$8, %xmm5
+        pxor %xmm5, %xmm2
+        psrlq \$4, %xmm4
+        pxor %xmm4, %xmm2
+
+        subq \$1, %rax
+        jnz .Loop_row_$call_counter
+
+        # Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
+        # x^7, so we shift and XOR four times.
+        pxor %xmm3, %xmm2       # x^0 = 0
+        psrlq \$1, %xmm3
+        pxor %xmm3, %xmm2       # x^1 = x
+        psrlq \$1, %xmm3
+        pxor %xmm3, %xmm2       # x^(1+1) = x^2
+        psrlq \$5, %xmm3
+        pxor %xmm3, %xmm2       # x^(1+1+5) = x^7
+        pxor %xmm3, %xmm3
+____
+}
+
+# We must reduce at least once every 7 rows, so divide into three chunks.
+$code .= process_rows(5);
+$code .= process_rows(5);
+$code .= process_rows(6);
+
+$code .= <<____;
+        # Store the result. Reverse bytes to serialize.
+        pshufb %xmm10, %xmm2
+        movdqu %xmm2, ($Xi)
+
+        # Zero any registers which contain secrets.
+        pxor %xmm0, %xmm0
+        pxor %xmm1, %xmm1
+        pxor %xmm2, %xmm2
+        pxor %xmm3, %xmm3
+        pxor %xmm4, %xmm4
+        pxor %xmm5, %xmm5
+        pxor %xmm6, %xmm6
+____
+$code .= <<____ if ($win64);
+        movdqa (%rsp), %xmm6
+        movdqa 16(%rsp), %xmm10
+        addq \$40, %rsp
+____
+$code .= <<____;
+        ret
+.Lgmult_seh_end:
+.cfi_endproc
+.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
+____
+
+$code .= <<____;
+# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
+# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
+# serialized byte representation. |Htable| is formatted as described above.
+# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+#                      size_t len);
+.type gcm_ghash_ssse3, \@abi-omnipotent
+.globl gcm_ghash_ssse3
+.align 16
+gcm_ghash_ssse3:
+.Lghash_seh_begin:
+.cfi_startproc
+____
+$code .= <<____ if ($win64);
+        subq \$56, %rsp
+.Lghash_seh_allocstack:
+        movdqa %xmm6, (%rsp)
+.Lghash_seh_save_xmm6:
+        movdqa %xmm10, 16(%rsp)
+.Lghash_seh_save_xmm10:
+        movdqa %xmm11, 32(%rsp)
+.Lghash_seh_save_xmm11:
+.Lghash_seh_prolog_end:
+____
+$code .= <<____;
+        movdqu ($Xi), %xmm0
+        movdqa .Lreverse_bytes(%rip), %xmm10
+        movdqa .Llow4_mask(%rip), %xmm11
+
+        # This function only processes whole blocks.
+        andq \$-16, $len
+
+        # Reverse input bytes to deserialize. We maintain the running
+        # total in %xmm0.
+        pshufb %xmm10, %xmm0
+
+        # Iterate over each block. On entry to each iteration, %xmm3 is zero.
+        pxor %xmm3, %xmm3
+.Loop_ghash:
+        # Incorporate the next block of input.
+        movdqu ($in), %xmm1
+        pshufb %xmm10, %xmm1    # Reverse bytes.
+        pxor %xmm1, %xmm0
+
+        # Split each byte into low (%xmm0) and high (%xmm1) halves.
+        movdqa %xmm11, %xmm1
+        pandn %xmm0, %xmm1
+        psrld \$4, %xmm1
+        pand %xmm11, %xmm0
+
+        # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
+        # that, due to bit reversal, %xmm3 contains bits that fall off when
+        # right-shifting, not left-shifting.
+        pxor %xmm2, %xmm2
+        # %xmm3 is already zero at this point.
+____
+
+# We must reduce at least once every 7 rows, so divide into three chunks.
+$code .= process_rows(5);
+$code .= process_rows(5);
+$code .= process_rows(6);
+
+$code .= <<____;
+        movdqa %xmm2, %xmm0
+
+        # Rewind $Htable for the next iteration.
+        leaq -256($Htable), $Htable
+
+        # Advance input and continue.
+        leaq 16($in), $in
+        subq \$16, $len
+        jnz .Loop_ghash
+
+        # Reverse bytes and store the result.
+        pshufb %xmm10, %xmm0
+        movdqu %xmm0, ($Xi)
+
+        # Zero any registers which contain secrets.
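+        # (%xmm10 and %xmm11 only hold the public byte-reversal and low-nibble
+        # masks, so they do not need to be cleared.)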
+        pxor %xmm0, %xmm0
+        pxor %xmm1, %xmm1
+        pxor %xmm2, %xmm2
+        pxor %xmm3, %xmm3
+        pxor %xmm4, %xmm4
+        pxor %xmm5, %xmm5
+        pxor %xmm6, %xmm6
+____
+$code .= <<____ if ($win64);
+        movdqa (%rsp), %xmm6
+        movdqa 16(%rsp), %xmm10
+        movdqa 32(%rsp), %xmm11
+        addq \$56, %rsp
+____
+$code .= <<____;
+        ret
+.Lghash_seh_end:
+.cfi_endproc
+.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
+
+.align 16
+# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
+# bytes in an XMM register.
+.Lreverse_bytes:
+.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
+.Llow4_mask:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+____
+
+if ($win64) {
+  # Add unwind metadata for SEH.
+  #
+  # TODO(davidben): This is all manual right now. Once we've added SEH tests,
+  # add support for emitting these in x86_64-xlate.pl, probably based on MASM
+  # and Yasm's unwind directives, and unify with CFI. Then upstream it to
+  # replace the error-prone and non-standard custom handlers.
+
+  # See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017
+  my $UWOP_ALLOC_SMALL = 2;
+  my $UWOP_SAVE_XMM128 = 8;
+
+  $code .= <<____;
+.section .pdata
+.align 4
+        .rva .Lgmult_seh_begin
+        .rva .Lgmult_seh_end
+        .rva .Lgmult_seh_info
+
+        .rva .Lghash_seh_begin
+        .rva .Lghash_seh_end
+        .rva .Lghash_seh_info
+
+.section .xdata
+.align 8
+.Lgmult_seh_info:
+        .byte 1 # version 1, no flags
+        .byte .Lgmult_seh_prolog_end-.Lgmult_seh_begin
+        .byte 5 # num_slots = 1 + 2 + 2
+        .byte 0 # no frame register
+
+        .byte .Lgmult_seh_allocstack-.Lgmult_seh_begin
+        .byte @{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]}
+
+        .byte .Lgmult_seh_save_xmm6-.Lgmult_seh_begin
+        .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
+        .value 0
+
+        .byte .Lgmult_seh_save_xmm10-.Lgmult_seh_begin
+        .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
+        .value 1
+
+.align 8
+.Lghash_seh_info:
+        .byte 1 # version 1, no flags
+        .byte .Lghash_seh_prolog_end-.Lghash_seh_begin
+        .byte 7 # num_slots = 1 + 2 + 2 + 2
+        .byte 0 # no frame register
+
+        .byte .Lghash_seh_allocstack-.Lghash_seh_begin
+        .byte @{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]}
+
+        .byte .Lghash_seh_save_xmm6-.Lghash_seh_begin
+        .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
+        .value 0
+
+        .byte .Lghash_seh_save_xmm10-.Lghash_seh_begin
+        .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
+        .value 1
+
+        .byte .Lghash_seh_save_xmm11-.Lghash_seh_begin
+        .byte @{[$UWOP_SAVE_XMM128 | (11 << 4)]}
+        .value 2
+____
+}
+
+print $code;
+close STDOUT;
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index 2a450cd5..681f7a91 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -243,6 +243,33 @@ void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
 #define GHASH_CHUNK (3 * 1024)
 #endif  // GHASH_ASM
 
+#if defined(GHASH_ASM_X86_64)
+void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
+  // Run the existing 4-bit version.
+  gcm_init_4bit(Htable, Xi);
+
+  // First, swap hi and lo. The "4bit" version places hi first. It treats the
+  // two fields separately, so the order does not matter, but ghash-ssse3 reads
+  // the entire state into one 128-bit register.
+  for (int i = 0; i < 16; i++) {
+    uint64_t tmp = Htable[i].hi;
+    Htable[i].hi = Htable[i].lo;
+    Htable[i].lo = tmp;
+  }
+
+  // Treat |Htable| as a 16x16 byte table and transpose it. Thus, Htable[i]
+  // contains the i'th byte of j*H for all j.
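+  // Each (i, j) pair is swapped with (j, i) exactly once and the diagonal is
+  // left alone, which is why the inner loop below only runs for j < i.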
+  uint8_t *Hbytes = (uint8_t *)Htable;
+  for (int i = 0; i < 16; i++) {
+    for (int j = 0; j < i; j++) {
+      uint8_t tmp = Hbytes[16*i + j];
+      Hbytes[16*i + j] = Hbytes[16*j + i];
+      Hbytes[16*j + i] = tmp;
+    }
+  }
+}
+#endif  // GHASH_ASM_X86_64
+
 #ifdef GCM_FUNCREF_4BIT
 #undef GCM_MUL
 #define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable)
@@ -285,6 +312,12 @@ void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
     *out_hash = gcm_ghash_clmul;
     return;
   }
+  if (gcm_ssse3_capable()) {
+    gcm_init_ssse3(out_table, H.u);
+    *out_mult = gcm_gmult_ssse3;
+    *out_hash = gcm_ghash_ssse3;
+    return;
+  }
 #elif defined(GHASH_ASM_X86)
   if (crypto_gcm_clmul_enabled()) {
     gcm_init_clmul(out_table, H.u);
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index fb17bbbf..54827ca2 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -61,6 +61,12 @@
 #include "../../test/file_test.h"
 #include "../../test/test_util.h"
 
+#if defined(OPENSSL_WINDOWS)
+OPENSSL_MSVC_PRAGMA(warning(push, 3))
+#include <windows.h>
+OPENSSL_MSVC_PRAGMA(warning(pop))
+#endif
+
 
 TEST(GCMTest, TestVectors) {
   FileTestGTest("crypto/fipsmodule/modes/gcm_tests.txt", [](FileTest *t) {
@@ -133,13 +139,21 @@ TEST(GCMTest, ABI) {
       UINT64_C(0xf328c2b971b2fe78),
   };
 
-  u128 Htable[16];
+  alignas(16) u128 Htable[16];
   CHECK_ABI(gcm_init_4bit, Htable, kH);
   CHECK_ABI(gcm_gmult_4bit, X, Htable);
   for (size_t blocks : kBlockCounts) {
     CHECK_ABI(gcm_ghash_4bit, X, Htable, buf, 16 * blocks);
   }
 
+  if (gcm_ssse3_capable()) {
+    CHECK_ABI(gcm_init_ssse3, Htable, kH);
+    CHECK_ABI(gcm_gmult_ssse3, X, Htable);
+    for (size_t blocks : kBlockCounts) {
+      CHECK_ABI(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks);
+    }
+  }
+
   if (crypto_gcm_clmul_enabled()) {
     CHECK_ABI(gcm_init_clmul, Htable, kH);
     CHECK_ABI(gcm_gmult_clmul, X, Htable);
@@ -156,4 +170,38 @@ TEST(GCMTest, ABI) {
     }
   }
 }
+
+#if defined(OPENSSL_WINDOWS)
+// Sanity-check the SEH unwind codes in ghash-ssse3-x86_64.pl.
+// TODO(davidben): Implement unwind testing for SEH and remove this.
+static void GCMSSSE3ExceptionTest() {
+  if (!gcm_ssse3_capable()) {
+    return;
+  }
+
+  bool handled = false;
+  __try {
+    gcm_gmult_ssse3(nullptr, nullptr);
+  } __except (GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION
+                  ? EXCEPTION_EXECUTE_HANDLER
+                  : EXCEPTION_CONTINUE_SEARCH) {
+    handled = true;
+  }
+  EXPECT_TRUE(handled);
+
+  handled = false;
+  __try {
+    gcm_ghash_ssse3(nullptr, nullptr, nullptr, 16);
+  } __except (GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION
+                  ? EXCEPTION_EXECUTE_HANDLER
+                  : EXCEPTION_CONTINUE_SEARCH) {
+    handled = true;
+  }
+  EXPECT_TRUE(handled);
+}
+
+TEST(GCMTest, SEH) {
+  CHECK_ABI_NO_UNWIND(GCMSSSE3ExceptionTest);
+}
+#endif  // OPENSSL_WINDOWS
 #endif  // GHASH_ASM_X86_64 && SUPPORTS_ABI_TEST
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 3163c502..79a09514 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -159,7 +159,10 @@ typedef void (*ghash_func)(uint64_t Xi[2], const u128 Htable[16],
 
 typedef struct gcm128_key_st {
   // Note the MOVBE-based, x86-64, GHASH assembly requires |H| and |Htable| to
-  // be the first two elements of this struct.
+  // be the first two elements of this struct. Additionally, some assembly
+  // routines require a 16-byte-aligned |Htable| when hashing data, but not
+  // initialization. |GCM128_KEY| is not itself aligned to simplify embedding in
+  // |EVP_AEAD_CTX|, but |Htable|'s offset must be a multiple of 16.
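+  // (|H| below is a 16-byte |u128|, so |Htable| lands at offset 16.)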
   u128 H;
   u128 Htable[16];
   gmult_func gmult;
@@ -184,8 +187,10 @@ typedef struct {
   } Yi, EKi, EK0, len, Xi;
 
   // Note that the order of |Xi| and |gcm_key| is fixed by the MOVBE-based,
-  // x86-64, GHASH assembly.
-  GCM128_KEY gcm_key;
+  // x86-64, GHASH assembly. Additionally, some assembly routines require
+  // |gcm_key| to be 16-byte aligned. |GCM128_KEY| is not itself aligned to
+  // simplify embedding in |EVP_AEAD_CTX|.
+  alignas(16) GCM128_KEY gcm_key;
 
   unsigned mres, ares;
 } GCM128_CONTEXT;
@@ -295,6 +300,18 @@ void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
 void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
                    size_t len);
+
+OPENSSL_INLINE char gcm_ssse3_capable(void) {
+  return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
+}
+
+// |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
+// 16-byte-aligned, but |gcm_init_ssse3| does not.
+void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+                     size_t len);
+
 #define AESNI_GCM
 size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi);
@@ -472,10 +489,11 @@ typedef union {
 
 struct polyval_ctx {
   // Note that the order of |S|, |H| and |Htable| is fixed by the MOVBE-based,
-  // x86-64, GHASH assembly.
+  // x86-64, GHASH assembly. Additionally, some assembly routines require
+  // |Htable| to be 16-byte aligned.
   polyval_block S;
   u128 H;
-  u128 Htable[16];
+  alignas(16) u128 Htable[16];
   gmult_func gmult;
   ghash_func ghash;
 };