boringssl/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl

Commit (2019-01-09): Add a constant-time pshufb-based GHASH implementation.

We currently require clmul instructions for constant-time GHASH on x86_64.
Otherwise, it falls back to a variable-time 4-bit table implementation.
However, a significant proportion of clients lack these instructions.

Inspired by vpaes, we can use pshufb and a slightly different order of
incorporating the bits to make a constant-time GHASH. This requires SSSE3,
which is very common. Benchmarking old machines we had on hand, it appears to
be a no-op on Sandy Bridge and a small slowdown for Penryn.

Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz):

(Note: these numbers are before 16-byte-aligning the table. That was an
improvement on Penryn, so it's possible Sandy Bridge is now better.)

Before:
Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s
Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s
Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s
Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s
Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s
Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s

After:
Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%]
Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%]
Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%]
Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%]
Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%]
Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%]

Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz):

Before:
Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s
Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s
Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s
Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s
Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s
Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s

After:
Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%]
Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%]
Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%]
Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%]
Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%]
Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%]

Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb
Reviewed-on: https://boringssl-review.googlesource.com/c/34267
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
#!/usr/bin/env perl
# Copyright (c) 2019, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and indexes them up into the table. Visually,
# it multiplies as in the schoolbook multiplication diagram below, but with
# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
# it incorporates the terms labeled '1' by indexing the most significant term
# of X into the table. Then it shifts and repeats for '2' and so on.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        666666
#       555555
#      444444
#     333333
#    222222
#   111111
#
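# As a toy illustration of this variable-time strategy, the following Perl
# sketch multiplies 16-bit polynomials over GF(2) with a 4-bit table. It is a
# minimal sketch for exposition only: the generator never calls it, and the
# name is hypothetical.
sub _demo_windowed_mul {
    my ($h, $x) = @_;    # 16-bit polynomials over GF(2), as plain integers
    # Precompute the multiples of $h by every 4-bit polynomial, as the real
    # code precomputes the 16 128-bit multiples of H.
    my @table = (0) x 16;
    for my $i (0 .. 15) {
        for my $bit (0 .. 3) {
            $table[$i] ^= $h << $bit if ($i >> $bit) & 1;
        }
    }
    # Index each 4-bit window of $x into the table, most significant window
    # first (the '1' row above), shifting the accumulator by 4 between rows.
    # The secret-dependent table index (a memory lookup in the real code) is
    # exactly what makes this variant variable-time.
    my $acc = 0;
    for (my $shift = 12; $shift >= 0; $shift -= 4) {
        $acc = ($acc << 4) ^ $table[($x >> $shift) & 0xf];
    }
    return $acc;    # the carry-less product, at most 31 bits wide
}
#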
# This implementation changes the order. We treat the table as a 16×16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        224466
#       113355
#      224466
#     113355
#    224466
#   113355
#
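# In Perl terms, treating an XMM register as an array of 16 bytes, pshufb is a
# parallel table lookup, and a transposed row is one byte sliced out of every
# multiple. A minimal sketch under those assumptions (hypothetical names,
# exposition only):
sub _demo_pshufb {
    my ($tbl, $idx) = @_;    # each: a reference to 16 byte values
    # Each index byte selects a byte of the table register. We assume indices
    # 0..15, so pshufb's zero-the-lane case for set sign bits never fires.
    return [map { $tbl->[$_ & 0xf] } @$idx];
}
sub _demo_transposed_row {
    my ($multiples, $r) = @_;    # 16 multiples of H, each 16 bytes; row index
    # Row $r holds byte $r of all 16 multiples, so a 4-bit index selects one
    # multiple's contribution at byte position $r.
    return [map { $_->[$r] } @$multiples];
}
# One row step issues two of these lookups: one indexed by the 16 low nibbles
# of X (the '1133..' terms) and one by the 16 high nibbles (the '2244..'
# terms), since each 4-bit index returns a full 8-bit table byte.
#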
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a byte
# as first. Bytes are little-endian, and bits are big-endian. We reverse the
# bytes in XMM registers for a consistent bit and byte ordering, but this means
# the least significant bit is the most significant coefficient and vice versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move bits from the MSB toward the LSB and correspond to
# increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.
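#
# Concretely, a multiple's column index is the 4-bit bit-reversal of its
# coefficient window. A hypothetical helper, for exposition only:
sub _demo_bitrev4 {
    my ($n) = @_;
    my $r = 0;
    for my $bit (0 .. 3) {
        $r |= 1 << (3 - $bit) if ($n >> $bit) & 1;
    }
    return $r;
}
# _demo_bitrev4(0b0001) == 0b1000, matching H*1 in column 0b1000, and
# _demo_bitrev4(0b1000) == 0b0001, matching H*x^3 in column 0b0001.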
use strict;
my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;
my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
                                        ("%rdi", "%rsi", "%rdx", "%rcx");
my $code = <<____;
.text
# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
.type gcm_gmult_ssse3, \@abi-omnipotent
.globl gcm_gmult_ssse3
.align 16
gcm_gmult_ssse3:
.cfi_startproc
.Lgmult_seh_begin:
____
$code .= <<____ if ($win64);
subq \$40, %rsp
.Lgmult_seh_allocstack:
movdqa %xmm6, (%rsp)
.Lgmult_seh_save_xmm6:
movdqa %xmm10, 16(%rsp)
.Lgmult_seh_save_xmm10:
.Lgmult_seh_prolog_end:
____
$code .= <<____;
movdqu ($Xi), %xmm0
movdqa .Lreverse_bytes(%rip), %xmm10
movdqa .Llow4_mask(%rip), %xmm2
# Reverse input bytes to deserialize.
pshufb %xmm10, %xmm0
# Split each byte into low (%xmm0) and high (%xmm1) halves.
movdqa %xmm2, %xmm1
pandn %xmm0, %xmm1
psrld \$4, %xmm1
pand %xmm2, %xmm0
# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
# that, due to bit reversal, %xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
____
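# A byte-level Perl model of the split above (hypothetical name, exposition
# only). Because pandn leaves only each byte's high nibble, the psrld by 4
# afterwards cannot drag stray bits across byte boundaries within a dword.
sub _demo_split_nibbles {
    my ($block) = @_;    # a reference to the 16 bytes of deserialized X
    my (@lo, @hi);
    for my $b (@$block) {
        push @lo, $b & 0x0f;           # pand with .Llow4_mask
        push @hi, ($b >> 4) & 0x0f;    # pandn with the mask, then psrld by 4
    }
    return (\@lo, \@hi);
}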
my $call_counter = 0;
# process_rows returns assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
sub process_rows {
my ($rows) = @_;
$call_counter++;
# Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
# and psrlq shifts the two 64-bit halves separately. Each row produces 8
# bits of carry, and the reduction needs an additional 7-bit shift. This
# must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
# at a time.
die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
return <<____;
movq \$$rows, %rax
.Loop_row_$call_counter:
movdqa ($Htable), %xmm4
leaq 16($Htable), $Htable
# Right-shift %xmm2 and %xmm3, as a 256-bit pair, by 8 bits (one byte).
movdqa %xmm2, %xmm6
palignr \$1, %xmm3, %xmm6
movdqa %xmm6, %xmm3
psrldq \$1, %xmm2
# Load the next table row and index the low and high bits of the input.
# Note the low (respectively, high) half corresponds to more
# (respectively, less) significant coefficients.
movdqa %xmm4, %xmm5
pshufb %xmm0, %xmm4
pshufb %xmm1, %xmm5
# Add the high half (%xmm5) without shifting.
pxor %xmm5, %xmm2
# Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
# add into the carry register (%xmm3).
movdqa %xmm4, %xmm5
psllq \$60, %xmm5
movdqa %xmm5, %xmm6
pslldq \$8, %xmm6
pxor %xmm6, %xmm3
# Next, add into %xmm2.
psrldq \$8, %xmm5
pxor %xmm5, %xmm2
psrlq \$4, %xmm4
pxor %xmm4, %xmm2
subq \$1, %rax
jnz .Loop_row_$call_counter
# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
# x^7, so we shift and XOR four times.
pxor %xmm3, %xmm2 # x^0 = 0
psrlq \$1, %xmm3
pxor %xmm3, %xmm2 # x^1 = x
psrlq \$1, %xmm3
pxor %xmm3, %xmm2 # x^(1+1) = x^2
psrlq \$5, %xmm3
pxor %xmm3, %xmm2 # x^(1+1+5) = x^7
pxor %xmm3, %xmm3
____
}
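# The following Math::BigInt model mirrors one iteration of the loop above and
# the trailing reduction, with $acc standing in for %xmm2 and $carry for
# %xmm3. These are hypothetical helpers for exposition; the generator does not
# use them.
use Math::BigInt;
sub _demo_row_step {
    my ($acc, $carry, $hi_term, $lo_term) = @_;    # 128-bit Math::BigInt values
    # Right-shift the 256-bit pair ($acc:$carry) by 8 bits, as the
    # palignr/psrldq pair does.
    $carry = $carry->copy->brsft(8)
                   ->bior($acc->copy->band(0xff)->blsft(120));
    $acc = $acc->copy->brsft(8);
    # Add the high-nibble lookup directly, and the low-nibble lookup shifted
    # right by 4 bits with the shifted-out bits landing at the top of $carry.
    $acc->bxor($hi_term);
    $carry->bxor($lo_term->copy->band(0xf)->blsft(124));
    $acc->bxor($lo_term->copy->brsft(4));
    return ($acc, $carry);
}
sub _demo_reduce {
    my ($acc, $carry) = @_;
    # One shift-and-XOR per term of 1 + x + x^2 + x^7. The carry occupies
    # only the top bits of the register, which is why the per-qword psrlq
    # shift in the assembly is safe.
    for my $shift (0, 1, 1, 5) {
        $carry = $carry->copy->brsft($shift);
        $acc = $acc->copy->bxor($carry);
    }
    return $acc;
}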
# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);
$code .= <<____;
# Store the result. Reverse bytes to serialize.
pshufb %xmm10, %xmm2
movdqu %xmm2, ($Xi)
# Zero any registers which contain secrets.
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
pxor %xmm6, %xmm6
____
$code .= <<____ if ($win64);
movdqa (%rsp), %xmm6
movdqa 16(%rsp), %xmm10
addq \$40, %rsp
____
$code .= <<____;
ret
.Lgmult_seh_end:
.cfi_endproc
.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
____
$code .= <<____;
# gcm_ghash_ssse3 incorporates |len| bytes from |in| into |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
# size_t len);
.type gcm_ghash_ssse3, \@abi-omnipotent
.globl gcm_ghash_ssse3
.align 16
gcm_ghash_ssse3:
.Lghash_seh_begin:
.cfi_startproc
____
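# In Perl terms, the contract is: for each 16-byte block, Xi = (Xi xor block)
# * H in GF(2^128). A sketch of that loop, taking the multiply as a coderef so
# it stays self-contained (hypothetical name, exposition only):
sub _demo_ghash_update {
    my ($mul_by_h, $xi, $in) = @_;    # $xi: 16-byte string; $in: input bytes
    # Whole blocks only, as the "andq" by -16 below enforces.
    for (my $off = 0; $off + 16 <= length($in); $off += 16) {
        $xi ^= substr($in, $off, 16);    # incorporate the next block
        $xi = $mul_by_h->($xi);          # the per-block gcm_gmult step
    }
    return $xi;
}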
$code .= <<____ if ($win64);
subq \$56, %rsp
.Lghash_seh_allocstack:
movdqa %xmm6, (%rsp)
.Lghash_seh_save_xmm6:
movdqa %xmm10, 16(%rsp)
.Lghash_seh_save_xmm10:
movdqa %xmm11, 32(%rsp)
.Lghash_seh_save_xmm11:
.Lghash_seh_prolog_end:
____
$code .= <<____;
movdqu ($Xi), %xmm0
movdqa .Lreverse_bytes(%rip), %xmm10
movdqa .Llow4_mask(%rip), %xmm11
# This function only processes whole blocks.
andq \$-16, $len
# Reverse input bytes to deserialize. We maintain the running
# total in %xmm0.
pshufb %xmm10, %xmm0
# Iterate over each block. On entry to each iteration, %xmm3 is zero.
pxor %xmm3, %xmm3
.Loop_ghash:
# Incorporate the next block of input.
movdqu ($in), %xmm1
pshufb %xmm10, %xmm1 # Reverse bytes.
pxor %xmm1, %xmm0
# Split each byte into low (%xmm0) and high (%xmm1) halves.
movdqa %xmm11, %xmm1
pandn %xmm0, %xmm1
psrld \$4, %xmm1
pand %xmm11, %xmm0
# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
# that, due to bit reversal, %xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
pxor %xmm2, %xmm2
# %xmm3 is already zero at this point.
____
# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);
$code .= <<____;
movdqa %xmm2, %xmm0
# Rewind $Htable for the next iteration.
leaq -256($Htable), $Htable
# Advance input and continue.
leaq 16($in), $in
subq \$16, $len
jnz .Loop_ghash
# Reverse bytes and store the result.
pshufb %xmm10, %xmm0
movdqu %xmm0, ($Xi)
# Zero any registers which contain secrets.
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
pxor %xmm6, %xmm6
____
$code .= <<____ if ($win64);
movdqa (%rsp), %xmm6
movdqa 16(%rsp), %xmm10
movdqa 32(%rsp), %xmm11
addq \$56, %rsp
____
$code .= <<____;
ret
.Lghash_seh_end:
.cfi_endproc
.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
.align 16
# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
.Lreverse_bytes:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
.Llow4_mask:
.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
____
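# Under the _demo_pshufb model above, .Lreverse_bytes is simply the
# permutation [15 .. 0] (hypothetical helper, exposition only):
sub _demo_reverse_bytes {
    my ($bytes) = @_;    # a reference to 16 byte values
    return _demo_pshufb($bytes, [reverse(0 .. 15)]);
}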
if ($win64) {
# Add unwind metadata for SEH.
#
# TODO(davidben): This is all manual right now. Once we've added SEH tests,
# add support for emitting these in x86_64-xlate.pl, probably based on MASM
# and Yasm's unwind directives, and unify with CFI. Then upstream it to
# replace the error-prone and non-standard custom handlers.
# See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017
my $UWOP_ALLOC_SMALL = 2;
my $UWOP_SAVE_XMM128 = 8;
$code .= <<____;
.section .pdata
.align 4
.rva .Lgmult_seh_begin
.rva .Lgmult_seh_end
.rva .Lgmult_seh_info
.rva .Lghash_seh_begin
.rva .Lghash_seh_end
.rva .Lghash_seh_info
.section .xdata
.align 8
.Lgmult_seh_info:
.byte 1 # version 1, no flags
.byte .Lgmult_seh_prolog_end-.Lgmult_seh_begin
.byte 5 # num_slots = 1 + 2 + 2
.byte 0 # no frame register
.byte .Lgmult_seh_save_xmm10-.Lgmult_seh_begin
.byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
.value 1
.byte .Lgmult_seh_save_xmm6-.Lgmult_seh_begin
.byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
.value 0
.byte .Lgmult_seh_allocstack-.Lgmult_seh_begin
.byte @{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]}
.align 8
.Lghash_seh_info:
.byte 1 # version 1, no flags
.byte .Lghash_seh_prolog_end-.Lghash_seh_begin
.byte 7 # num_slots = 1 + 2 + 2 + 2
.byte 0 # no frame register
.byte .Lghash_seh_save_xmm11-.Lghash_seh_begin
.byte @{[$UWOP_SAVE_XMM128 | (11 << 4)]}
.value 2
.byte .Lghash_seh_save_xmm10-.Lghash_seh_begin
.byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
.value 1
.byte .Lghash_seh_save_xmm6-.Lghash_seh_begin
.byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
.value 0
.byte .Lghash_seh_allocstack-.Lghash_seh_begin
.byte @{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]}
____
}
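# For reference, each UWOP_SAVE_XMM128 record above takes two slots: an
# (offset, op|reg) byte pair plus a 16-bit operand holding the save offset
# divided by 16, which is how "movdqa %xmm10, 16(%rsp)" becomes ".value 1" and
# why num_slots is 1 + 2 + 2 (+ 2). A hypothetical emitter, exposition only:
sub _demo_save_xmm128 {
    my ($save_label, $begin_label, $reg, $stack_offset) = @_;
    my $UWOP_SAVE_XMM128 = 8;
    return ".byte $save_label-$begin_label\n" .
           ".byte @{[$UWOP_SAVE_XMM128 | ($reg << 4)]}\n" .
           ".value @{[$stack_offset / 16]}\n";
}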
print $code;
close STDOUT;