From 5ce12e6436c14085d9dbd86a89165b0b61ac4bd8 Mon Sep 17 00:00:00 2001
From: David Benjamin
Date: Sun, 24 Feb 2019 00:49:14 -0600
Subject: [PATCH] Add a 32-bit SSSE3 GHASH implementation.

The 64-bit version can be fairly straightforwardly translated. Ironically,
this makes 32-bit x86 the first architecture to meet the goal of
constant-time AES-GCM given SIMD assembly. (Though x86_64 could join by
simply giving up on bsaes...)

Bug: 263
Change-Id: Icb2cec936457fac7132bbb5dbb094433bc14b86e
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35024
Commit-Queue: David Benjamin
Reviewed-by: Adam Langley
---
 crypto/fipsmodule/CMakeLists.txt              |   2 +
 .../fipsmodule/modes/asm/ghash-ssse3-x86.pl   | 288 ++++++++++++++++++
 crypto/fipsmodule/modes/gcm.c                 |  10 +-
 crypto/fipsmodule/modes/gcm_test.cc           |   4 +-
 crypto/fipsmodule/modes/internal.h            |  14 +-
 5 files changed, 306 insertions(+), 12 deletions(-)
 create mode 100644 crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl

diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index b459263c..04b2ffb2 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -32,6 +32,7 @@ if(${ARCH} STREQUAL "x86")
       aesni-x86.${ASM_EXT}
       bn-586.${ASM_EXT}
       co-586.${ASM_EXT}
+      ghash-ssse3-x86.${ASM_EXT}
       ghash-x86.${ASM_EXT}
       md5-586.${ASM_EXT}
       sha1-586.${ASM_EXT}
@@ -98,6 +99,7 @@ perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl)
 perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl)
 perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl)
 perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl)
+perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl)
 perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl)
 perlasm(ghash-x86.${ASM_EXT} modes/asm/ghash-x86.pl)
 perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
diff --git a/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl
new file mode 100644
index 00000000..0d9ce156
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl
@@ -0,0 +1,288 @@
+#!/usr/bin/env perl
+# Copyright (c) 2019, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
+# table-based GHASH implementation. It requires SSSE3 instructions.
+#
+# For background, the table-based strategy is a 4-bit windowed multiplication.
+# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
+# over 4-bit windows of the input and indexes them up into the table. Visually,
+# it multiplies as in the schoolbook multiplication diagram below, but with
+# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
+# it incorporates the terms labeled '1' by indexing the most significant term
+# of X into the table.
+# Then it shifts and repeats for '2' and so on.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        666666
+#       555555
+#      444444
+#     333333
+#    222222
+#   111111
+#
+# This implementation changes the order. We treat the table as a 16x16 matrix
+# and transpose it. The first row is then the first byte of each multiple of H,
+# and so on. We then reorder terms as below. Observe that the terms labeled '1'
+# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
+# pshufb instruction, using alternating terms of X in parallel as indices. This
+# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
+# repeat for each row.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        224466
+#       113355
+#      224466
+#     113355
+#    224466
+#   113355
+#
+# Next we account for GCM's confusing bit order. The "first" bit is the least
+# significant coefficient, but GCM treats the most significant bit within a
+# byte as first. Bytes are little-endian, and bits are big-endian. We reverse
+# the bytes in XMM registers for a consistent bit and byte ordering, but this
+# means the least significant bit is the most significant coefficient and vice
+# versa.
+#
+# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
+# bit ordering within the XMM register, rather than the reversed coefficient
+# ordering. Low bits are less significant bits and more significant
+# coefficients. Right-shifts move bits from the MSB to the LSB and correspond
+# to increasing the power of each coefficient.
+#
+# Note this bit reversal enters into the table's column indices. H*1 is stored
+# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
+# table rows contain more significant coefficients, so we iterate forwards.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open STDOUT, ">$output";
+
+&asm_init($ARGV[0]);
+
+my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
+&static_label("reverse_bytes");
+&static_label("low4_mask");
+
+my $call_counter = 0;
+# process_rows emits assembly code to process $rows rows of the table. On
+# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
+# low and high halves of the input. The result so far is passed in xmm2. xmm3
+# must be zero. On output, $Htable is advanced to the next row and xmm2 is
+# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
+sub process_rows {
+	my ($rows) = @_;
+	$call_counter++;
+
+	# Shifting whole XMM registers by bits is complex. psrldq shifts by
+	# bytes, and psrlq shifts the two 64-bit halves separately. Each row
+	# produces 8 bits of carry, and the reduction needs an additional 7-bit
+	# shift. This must fit in 64 bits so reduction can use psrlq. This
+	# allows up to 7 rows at a time.
+	die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
+
+	&mov("eax", $rows);
+&set_label("loop_row_$call_counter");
+	&movdqa("xmm4", &QWP(0, $Htable));
+	&lea($Htable, &DWP(16, $Htable));
+
+	# Right-shift xmm2 and xmm3 by 8 bits (one byte).
+	&movdqa("xmm6", "xmm2");
+	&palignr("xmm6", "xmm3", 1);
+	&movdqa("xmm3", "xmm6");
+	&psrldq("xmm2", 1);
+
+	# Load the next table row and index the low and high bits of the input.
+	# Note the low (respectively, high) half corresponds to more
+	# (respectively, less) significant coefficients.
+	&movdqa("xmm5", "xmm4");
+	&pshufb("xmm4", "xmm0");
+	&pshufb("xmm5", "xmm1");
+
+	# Add the high half (xmm5) without shifting.
+	&pxor("xmm2", "xmm5");
+
+	# Add the low half (xmm4). This must be right-shifted by 4 bits. First,
+	# add into the carry register (xmm3).
+	&movdqa("xmm5", "xmm4");
+	&psllq("xmm5", 60);
+	&movdqa("xmm6", "xmm5");
+	&pslldq("xmm6", 8);
+	&pxor("xmm3", "xmm6");
+
+	# Next, add into xmm2.
+	&psrldq("xmm5", 8);
+	&pxor("xmm2", "xmm5");
+	&psrlq("xmm4", 4);
+	&pxor("xmm2", "xmm4");
+
+	&sub("eax", 1);
+	&jnz(&label("loop_row_$call_counter"));
+
+	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
+	# x^7, so we shift and XOR four times.
+	&pxor("xmm2", "xmm3");	# x^0 = 0
+	&psrlq("xmm3", 1);
+	&pxor("xmm2", "xmm3");	# x^1 = x
+	&psrlq("xmm3", 1);
+	&pxor("xmm2", "xmm3");	# x^(1+1) = x^2
+	&psrlq("xmm3", 5);
+	&pxor("xmm2", "xmm3");	# x^(1+1+5) = x^7
+	&pxor("xmm3", "xmm3");
+}
+
+# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
+# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
+# formatted as described above.
+# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+&function_begin("gcm_gmult_ssse3");
+	&mov($Xi, &wparam(0));
+	&mov($Htable, &wparam(1));
+
+	&movdqu("xmm0", &QWP(0, $Xi));
+	&call(&label("pic_point"));
+&set_label("pic_point");
+	&blindpop("eax");
+	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
+	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));
+
+	# Reverse input bytes to deserialize.
+	&pshufb("xmm0", "xmm7");
+
+	# Split each byte into low (xmm0) and high (xmm1) halves.
+	&movdqa("xmm1", "xmm2");
+	&pandn("xmm1", "xmm0");
+	&psrld("xmm1", 4);
+	&pand("xmm0", "xmm2");
+
+	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
+	# that, due to bit reversal, xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+
+	# We must reduce at least once every 7 rows, so divide into three
+	# chunks.
+	&process_rows(5);
+	&process_rows(5);
+	&process_rows(6);
+
+	# Store the result. Reverse bytes to serialize.
+	&pshufb("xmm2", "xmm7");
+	&movdqu(&QWP(0, $Xi), "xmm2");
+
+	# Zero any registers which contain secrets.
+	&pxor("xmm0", "xmm0");
+	&pxor("xmm1", "xmm1");
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+	&pxor("xmm4", "xmm4");
+	&pxor("xmm5", "xmm5");
+	&pxor("xmm6", "xmm6");
+&function_end("gcm_gmult_ssse3");
+
+# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
+# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
+# serialized byte representation. |Htable| is formatted as described above.
+# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+#                      size_t len);
&function_begin("gcm_ghash_ssse3");
+	&mov($Xi, &wparam(0));
+	&mov($Htable, &wparam(1));
+	&mov($in, &wparam(2));
+	&mov($len, &wparam(3));
+
+	&movdqu("xmm0", &QWP(0, $Xi));
+	&call(&label("pic_point"));
+&set_label("pic_point");
+	&blindpop("ebx");
+	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx"));
+
+	# This function only processes whole blocks.
+	&and($len, -16);
+
+	# Reverse input bytes to deserialize. We maintain the running
+	# total in xmm0.
+	&pshufb("xmm0", "xmm7");
+
+	# Iterate over each block. On entry to each iteration, xmm3 is zero.
+	&pxor("xmm3", "xmm3");
+&set_label("loop_ghash");
+	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx"));
+
+	# Incorporate the next block of input.
+	&movdqu("xmm1", &QWP(0, $in));
+	&pshufb("xmm1", "xmm7");	# Reverse bytes.
+	&pxor("xmm0", "xmm1");
+
+	# Split each byte into low (xmm0) and high (xmm1) halves.
+	&movdqa("xmm1", "xmm2");
+	&pandn("xmm1", "xmm0");
+	&psrld("xmm1", 4);
+	&pand("xmm0", "xmm2");
+
+	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
+	# that, due to bit reversal, xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	&pxor("xmm2", "xmm2");
+	# xmm3 is already zero at this point.
+
+	# We must reduce at least once every 7 rows, so divide into three
+	# chunks.
+	&process_rows(5);
+	&process_rows(5);
+	&process_rows(6);
+
+	&movdqa("xmm0", "xmm2");
+
+	# Rewind $Htable for the next iteration.
+	&lea($Htable, &DWP(-256, $Htable));
+
+	# Advance input and continue.
+	&lea($in, &DWP(16, $in));
+	&sub($len, 16);
+	&jnz(&label("loop_ghash"));
+
+	# Reverse bytes and store the result.
+	&pshufb("xmm0", "xmm7");
+	&movdqu(&QWP(0, $Xi), "xmm0");
+
+	# Zero any registers which contain secrets.
+	&pxor("xmm0", "xmm0");
+	&pxor("xmm1", "xmm1");
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+	&pxor("xmm4", "xmm4");
+	&pxor("xmm5", "xmm5");
+	&pxor("xmm6", "xmm6");
+&function_end("gcm_ghash_ssse3");
+
+# reverse_bytes is a permutation which, if applied with pshufb, reverses the
+# bytes in an XMM register.
+&set_label("reverse_bytes", 16);
+&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+# low4_mask is an XMM mask which selects the low four bits of each byte.
+&set_label("low4_mask", 16);
+&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
+
+&asm_finish();
+
+close STDOUT;
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index f92f6750..ca077acc 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -241,7 +241,7 @@ void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
 // still in L1 cache after encryption pass...
 #define GHASH_CHUNK (3 * 1024)
 
-#if defined(GHASH_ASM_X86_64)
+#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
 void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
   // Run the existing 4-bit version.
   gcm_init_4bit(Htable, Xi);
@@ -266,7 +266,7 @@ void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
     }
   }
 }
-#endif  // GHASH_ASM_X86_64
+#endif  // GHASH_ASM_X86_64 || GHASH_ASM_X86
 
 #ifdef GCM_FUNCREF_4BIT
 #undef GCM_MUL
@@ -321,6 +321,12 @@ void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
     *out_hash = gcm_ghash_clmul;
     return;
   }
+  if (gcm_ssse3_capable()) {
+    gcm_init_ssse3(out_table, H.u);
+    *out_mult = gcm_gmult_ssse3;
+    *out_hash = gcm_ghash_ssse3;
+    return;
+  }
 #elif defined(GHASH_ASM_ARM)
   if (gcm_pmull_capable()) {
     gcm_init_v8(out_table, H.u);
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index 47ecd29d..b2e805c1 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -148,7 +148,7 @@ TEST(GCMTest, ABI) {
   }
 #endif  // GHASH_ASM_X86
 
-#if defined(GHASH_ASM_X86_64)
+#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
   if (gcm_ssse3_capable()) {
     CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
@@ -156,9 +156,7 @@ TEST(GCMTest, ABI) {
       CHECK_ABI_SEH(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks);
     }
   }
-#endif  // GHASH_ASM_X86_64
 
-#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
     CHECK_ABI_SEH(gcm_init_clmul, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_clmul, X, Htable);
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 5f9d035b..9a081ebd 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -282,13 +282,6 @@ void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len);
 
-#if defined(OPENSSL_X86_64)
-#define GHASH_ASM_X86_64
-void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
-                   size_t len);
-
 OPENSSL_INLINE char gcm_ssse3_capable(void) {
   return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
 }
@@ -300,6 +293,13 @@ void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
                      size_t len);
 
+#if defined(OPENSSL_X86_64)
+#define GHASH_ASM_X86_64
+void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+                   size_t len);
+
 #define AESNI_GCM
 size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi);
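
Note: the following standalone C sketch is not part of the patch; the names
pshufb_model and transpose_htable are invented for illustration only. It models
the two data-layout ideas the assembly relies on: pshufb acting as sixteen
parallel 4-bit table lookups, and the 16x16 transposition of the precomputed
table of multiples of H described in the header comment of ghash-ssse3-x86.pl.

#include <stdint.h>

/* Scalar model of pshufb: each destination byte selects a table byte using the
 * low four bits of the corresponding index byte; an index byte with its high
 * bit set selects zero. One row of the transposed Htable can therefore be
 * looked up for all sixteen nibble positions at once. */
void pshufb_model(uint8_t dst[16], const uint8_t table[16],
                  const uint8_t idx[16]) {
  for (int i = 0; i < 16; i++) {
    dst[i] = (idx[i] & 0x80) ? 0 : table[idx[i] & 0x0f];
  }
}

/* The 4-bit table (16 multiples of H, 16 bytes each) viewed as a 16x16 byte
 * matrix and transposed: row i of the output holds byte i of every multiple,
 * which is exactly the shape a single pshufb lookup can index. */
void transpose_htable(uint8_t out[16][16], const uint8_t in[16][16]) {
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 16; j++) {
      out[i][j] = in[j][i];
    }
  }
}

The row-by-row loop in process_rows combines such lookups with an 8-bit shift
of the accumulator per row and a final carry fold for the reduction polynomial
1 + x + x^2 + x^7, which is why at most seven rows may be processed between
reductions.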