From 5ce12e6436c14085d9dbd86a89165b0b61ac4bd8 Mon Sep 17 00:00:00 2001
From: David Benjamin
Date: Sun, 24 Feb 2019 00:49:14 -0600
Subject: [PATCH] Add a 32-bit SSSE3 GHASH implementation.

The 64-bit version can be fairly straightforwardly translated. Ironically,
this makes 32-bit x86 the first architecture to meet the goal of
constant-time AES-GCM given SIMD assembly. (Though x86_64 could join by
simply giving up on bsaes...)

Bug: 263
Change-Id: Icb2cec936457fac7132bbb5dbb094433bc14b86e
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35024
Commit-Queue: David Benjamin
Reviewed-by: Adam Langley
---
 crypto/fipsmodule/CMakeLists.txt              |   2 +
 .../fipsmodule/modes/asm/ghash-ssse3-x86.pl   | 288 ++++++++++++++++++
 crypto/fipsmodule/modes/gcm.c                 |  10 +-
 crypto/fipsmodule/modes/gcm_test.cc           |   4 +-
 crypto/fipsmodule/modes/internal.h            |  14 +-
 5 files changed, 306 insertions(+), 12 deletions(-)
 create mode 100644 crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl

diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index b459263c..04b2ffb2 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -32,6 +32,7 @@ if(${ARCH} STREQUAL "x86")
       aesni-x86.${ASM_EXT}
       bn-586.${ASM_EXT}
       co-586.${ASM_EXT}
+      ghash-ssse3-x86.${ASM_EXT}
       ghash-x86.${ASM_EXT}
       md5-586.${ASM_EXT}
       sha1-586.${ASM_EXT}
@@ -98,6 +99,7 @@ perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl)
 perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl)
 perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl)
 perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl)
+perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl)
 perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl)
 perlasm(ghash-x86.${ASM_EXT} modes/asm/ghash-x86.pl)
 perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
diff --git a/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl
new file mode 100644
index 00000000..0d9ce156
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl
@@ -0,0 +1,288 @@
+#!/usr/bin/env perl
+# Copyright (c) 2019, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
+# table-based GHASH implementation. It requires SSSE3 instructions.
+#
+# For background, the table-based strategy is a 4-bit windowed multiplication.
+# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
+# over 4-bit windows of the input and indexes them up into the table. Visually,
+# it multiplies as in the schoolbook multiplication diagram below, but with
+# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
+# it incorporates the terms labeled '1' by indexing the most significant term
+# of X into the table.
+# Then it shifts and repeats for '2' and so on.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        666666
+#       555555
+#      444444
+#     333333
+#    222222
+#   111111
+#
+# This implementation changes the order. We treat the table as a 16x16 matrix
+# and transpose it. The first row is then the first byte of each multiple of H,
+# and so on. We then reorder terms as below. Observe that the terms labeled '1'
+# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
+# pshufb instruction, using alternating terms of X in parallel as indices. This
+# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
+# repeat for each row.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        224466
+#       113355
+#      224466
+#     113355
+#    224466
+#   113355
+#
+# Next we account for GCM's confusing bit order. The "first" bit is the least
+# significant coefficient, but GCM treats the most significant bit within a
+# byte as first. Bytes are little-endian, and bits are big-endian. We reverse
+# the bytes in XMM registers for a consistent bit and byte ordering, but this
+# means the least significant bit is the most significant coefficient and vice
+# versa.
+#
+# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
+# bit ordering within the XMM register, rather than the reversed coefficient
+# ordering. Low bits are less significant bits and more significant
+# coefficients. Right-shifts move bits from the MSB to the LSB and correspond
+# to increasing the power of each coefficient.
+#
+# Note this bit reversal enters into the table's column indices. H*1 is stored
+# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
+# table rows contain more significant coefficients, so we iterate forwards.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open STDOUT, ">$output";
+
+&asm_init($ARGV[0]);
+
+my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
+&static_label("reverse_bytes");
+&static_label("low4_mask");
+
+my $call_counter = 0;
+# process_rows emits assembly code to process $rows rows of the table. On
+# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
+# low and high halves of the input. The result so far is passed in xmm2. xmm3
+# must be zero. On output, $Htable is advanced to the next row and xmm2 is
+# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
+sub process_rows {
+	my ($rows) = @_;
+	$call_counter++;
+
+	# Shifting whole XMM registers by bits is complex. psrldq shifts by
+	# bytes, and psrlq shifts the two 64-bit halves separately. Each row
+	# produces 8 bits of carry, and the reduction needs an additional 7-bit
+	# shift. This must fit in 64 bits so reduction can use psrlq. This
+	# allows up to 7 rows at a time.
+	die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
+
+	&mov("eax", $rows);
+&set_label("loop_row_$call_counter");
+	&movdqa("xmm4", &QWP(0, $Htable));
+	&lea($Htable, &DWP(16, $Htable));
+
+	# Right-shift xmm2 and xmm3 by 8 bits (one byte).
+	&movdqa("xmm6", "xmm2");
+	&palignr("xmm6", "xmm3", 1);
+	&movdqa("xmm3", "xmm6");
+	&psrldq("xmm2", 1);
+
+	# Load the next table row and index the low and high bits of the input.
+	# Note the low (respectively, high) half corresponds to more
+	# (respectively, less) significant coefficients.
+	&movdqa("xmm5", "xmm4");
+	&pshufb("xmm4", "xmm0");
+	&pshufb("xmm5", "xmm1");
+
+	# Add the high half (xmm5) without shifting.
+	&pxor("xmm2", "xmm5");
+
+	# Add the low half (xmm4). This must be right-shifted by 4 bits. First,
+	# add into the carry register (xmm3).
+	&movdqa("xmm5", "xmm4");
+	&psllq("xmm5", 60);
+	&movdqa("xmm6", "xmm5");
+	&pslldq("xmm6", 8);
+	&pxor("xmm3", "xmm6");
+
+	# Next, add into xmm2.
+	&psrldq("xmm5", 8);
+	&pxor("xmm2", "xmm5");
+	&psrlq("xmm4", 4);
+	&pxor("xmm2", "xmm4");
+
+	&sub("eax", 1);
+	&jnz(&label("loop_row_$call_counter"));
+
+	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
+	# x^7, so we shift and XOR four times.
+	&pxor("xmm2", "xmm3");	# x^0 = 0
+	&psrlq("xmm3", 1);
+	&pxor("xmm2", "xmm3");	# x^1 = x
+	&psrlq("xmm3", 1);
+	&pxor("xmm2", "xmm3");	# x^(1+1) = x^2
+	&psrlq("xmm3", 5);
+	&pxor("xmm2", "xmm3");	# x^(1+1+5) = x^7
+	&pxor("xmm3", "xmm3");
+}
+
+# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
+# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
+# formatted as described above.
+# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+&function_begin("gcm_gmult_ssse3");
+	&mov($Xi, &wparam(0));
+	&mov($Htable, &wparam(1));
+
+	&movdqu("xmm0", &QWP(0, $Xi));
+	&call(&label("pic_point"));
+&set_label("pic_point");
+	&blindpop("eax");
+	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
+	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));
+
+	# Reverse input bytes to deserialize.
+	&pshufb("xmm0", "xmm7");
+
+	# Split each byte into low (xmm0) and high (xmm1) halves.
+	&movdqa("xmm1", "xmm2");
+	&pandn("xmm1", "xmm0");
+	&psrld("xmm1", 4);
+	&pand("xmm0", "xmm2");
+
+	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
+	# that, due to bit reversal, xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+
+	# We must reduce at least once every 7 rows, so divide into three
+	# chunks.
+	&process_rows(5);
+	&process_rows(5);
+	&process_rows(6);
+
+	# Store the result. Reverse bytes to serialize.
+	&pshufb("xmm2", "xmm7");
+	&movdqu(&QWP(0, $Xi), "xmm2");
+
+	# Zero any registers which contain secrets.
+	&pxor("xmm0", "xmm0");
+	&pxor("xmm1", "xmm1");
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+	&pxor("xmm4", "xmm4");
+	&pxor("xmm5", "xmm5");
+	&pxor("xmm6", "xmm6");
+&function_end("gcm_gmult_ssse3");
+
+# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
+# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
+# serialized byte representation. |Htable| is formatted as described above.
+# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+#                      size_t len);
&function_begin("gcm_ghash_ssse3");
+	&mov($Xi, &wparam(0));
+	&mov($Htable, &wparam(1));
+	&mov($in, &wparam(2));
+	&mov($len, &wparam(3));
+
+	&movdqu("xmm0", &QWP(0, $Xi));
+	&call(&label("pic_point"));
+&set_label("pic_point");
+	&blindpop("ebx");
+	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx"));
+
+	# This function only processes whole blocks.
+	&and($len, -16);
+
+	# Reverse input bytes to deserialize. We maintain the running
+	# total in xmm0.
+	&pshufb("xmm0", "xmm7");
+
+	# Iterate over each block. On entry to each iteration, xmm3 is zero.
+	&pxor("xmm3", "xmm3");
+&set_label("loop_ghash");
+	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx"));
+
+	# Incorporate the next block of input.
+	&movdqu("xmm1", &QWP(0, $in));
+	&pshufb("xmm1", "xmm7");	# Reverse bytes.
+	&pxor("xmm0", "xmm1");
+
+	# Split each byte into low (xmm0) and high (xmm1) halves.
+	&movdqa("xmm1", "xmm2");
+	&pandn("xmm1", "xmm0");
+	&psrld("xmm1", 4);
+	&pand("xmm0", "xmm2");
+
+	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
+	# that, due to bit reversal, xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	&pxor("xmm2", "xmm2");
+	# xmm3 is already zero at this point.
+
+	# We must reduce at least once every 7 rows, so divide into three
+	# chunks.
+	&process_rows(5);
+	&process_rows(5);
+	&process_rows(6);
+
+	&movdqa("xmm0", "xmm2");
+
+	# Rewind $Htable for the next iteration.
+	&lea($Htable, &DWP(-256, $Htable));
+
+	# Advance input and continue.
+	&lea($in, &DWP(16, $in));
+	&sub($len, 16);
+	&jnz(&label("loop_ghash"));
+
+	# Reverse bytes and store the result.
+	&pshufb("xmm0", "xmm7");
+	&movdqu(&QWP(0, $Xi), "xmm0");
+
+	# Zero any registers which contain secrets.
+	&pxor("xmm0", "xmm0");
+	&pxor("xmm1", "xmm1");
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+	&pxor("xmm4", "xmm4");
+	&pxor("xmm5", "xmm5");
+	&pxor("xmm6", "xmm6");
+&function_end("gcm_ghash_ssse3");
+
+# reverse_bytes is a permutation which, if applied with pshufb, reverses the
+# bytes in an XMM register.
+&set_label("reverse_bytes", 16);
+&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+# low4_mask is an XMM mask which selects the low four bits of each byte.
+&set_label("low4_mask", 16);
+&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
+
+&asm_finish();
+
+close STDOUT;
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index f92f6750..ca077acc 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -241,7 +241,7 @@ void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
 // still in L1 cache after encryption pass...
 #define GHASH_CHUNK (3 * 1024)
 
-#if defined(GHASH_ASM_X86_64)
+#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
 void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
   // Run the existing 4-bit version.
   gcm_init_4bit(Htable, Xi);
@@ -266,7 +266,7 @@ void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
     }
   }
 }
-#endif  // GHASH_ASM_X86_64
+#endif  // GHASH_ASM_X86_64 || GHASH_ASM_X86
 
 #ifdef GCM_FUNCREF_4BIT
 #undef GCM_MUL
@@ -321,6 +321,12 @@ void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
     *out_hash = gcm_ghash_clmul;
     return;
   }
+  if (gcm_ssse3_capable()) {
+    gcm_init_ssse3(out_table, H.u);
+    *out_mult = gcm_gmult_ssse3;
+    *out_hash = gcm_ghash_ssse3;
+    return;
+  }
 #elif defined(GHASH_ASM_ARM)
   if (gcm_pmull_capable()) {
     gcm_init_v8(out_table, H.u);
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index 47ecd29d..b2e805c1 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -148,7 +148,7 @@ TEST(GCMTest, ABI) {
   }
 #endif  // GHASH_ASM_X86
 
-#if defined(GHASH_ASM_X86_64)
+#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
   if (gcm_ssse3_capable()) {
     CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
@@ -156,9 +156,7 @@ TEST(GCMTest, ABI) {
       CHECK_ABI_SEH(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks);
     }
   }
-#endif  // GHASH_ASM_X86_64
 
-#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
     CHECK_ABI_SEH(gcm_init_clmul, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_clmul, X, Htable);
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 5f9d035b..9a081ebd 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -282,13 +282,6 @@ void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len);
 
-#if defined(OPENSSL_X86_64)
-#define GHASH_ASM_X86_64
-void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
-                   size_t len);
-
 OPENSSL_INLINE char gcm_ssse3_capable(void) {
   return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
 }
@@ -300,6 +293,13 @@ void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
                      size_t len);
 
+#if defined(OPENSSL_X86_64)
+#define GHASH_ASM_X86_64
+void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+                   size_t len);
+
 #define AESNI_GCM
 size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi);
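
Note: the following standalone C sketch is not part of the patch; the names
pshufb_model and transpose_htable are invented for illustration only. It models
the two data-layout ideas the assembly relies on: pshufb acting as sixteen
parallel 4-bit table lookups, and the 16x16 transposition of the precomputed
table of multiples of H described in the header comment of ghash-ssse3-x86.pl.

#include <stdint.h>

/* Scalar model of pshufb: each destination byte selects a table byte using the
 * low four bits of the corresponding index byte; an index byte with its high
 * bit set selects zero. One row of the transposed Htable can therefore be
 * looked up for all sixteen nibble positions at once. */
void pshufb_model(uint8_t dst[16], const uint8_t table[16],
                  const uint8_t idx[16]) {
  for (int i = 0; i < 16; i++) {
    dst[i] = (idx[i] & 0x80) ? 0 : table[idx[i] & 0x0f];
  }
}

/* The 4-bit table (16 multiples of H, 16 bytes each) viewed as a 16x16 byte
 * matrix and transposed: row i of the output holds byte i of every multiple,
 * which is exactly the shape a single pshufb lookup can index. */
void transpose_htable(uint8_t out[16][16], const uint8_t in[16][16]) {
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 16; j++) {
      out[i][j] = in[j][i];
    }
  }
}

The row-by-row loop in process_rows combines such lookups with an 8-bit shift
of the accumulator per row and a final carry fold for the reduction polynomial
1 + x + x^2 + x^7, which is why at most seven rows may be processed between
reductions.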