#!/usr/bin/env perl
# Copyright (c) 2019, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and looks them up in the table. Visually, it
# multiplies as in the schoolbook multiplication diagram below, but with more
# terms. (Each term is 4 bits, so there are 32 terms in each row.) First it
# incorporates the terms labeled '1' by indexing the most significant term of X
# into the table. Then it shifts and repeats for '2' and so on.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        666666
#       555555
#      444444
#     333333
#    222222
#   111111
#
# This implementation changes the order. We treat the table as a 16×16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        224466
#       113355
#      224466
#     113355
#    224466
#   113355
#
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a
# byte as first. Bytes are little-endian, and bits are big-endian. We reverse
# the bytes in XMM registers for a consistent bit and byte ordering, but this
# means the least significant bit is the most significant coefficient and vice
# versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move from the MSB to the LSB and correspond to
# increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.
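# As an illustration only (this sub is never called when generating the
# assembly below, and its name is ours), the following Perl sketch applies the
# 4-bit windowed strategy described above to 8-bit polynomials over GF(2),
# using a 16-entry table of 4-bit multiples of h analogous to the 16 128-bit
# rows of |Htable|. The real code additionally transposes the table, indexes it
# with pshufb, and reverses bits; this sketch keeps the natural bit order.
sub clmul8_windowed_example {
  my ($h, $x) = @_;
  # Precompute all 16 4-bit multiples of h: $table[$i] = $i * $h in GF(2)[x].
  my @table = (0) x 16;
  for my $i (0 .. 15) {
    for my $bit (0 .. 3) {
      $table[$i] ^= $h << $bit if $i & (1 << $bit);
    }
  }
  # Incorporate the most significant 4-bit window of x (the terms labeled '1'
  # in the first diagram), shift, then incorporate the low window ('2').
  my $result = $table[($x >> 4) & 0xf];
  $result <<= 4;
  $result ^= $table[$x & 0xf];
  return $result;  # Up to 15 bits; reduction is omitted in this sketch.
}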
use strict;

my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

my ($Xi, $Htable, $in, $len) =
    $win64 ? ("%rcx", "%rdx", "%r8", "%r9") : ("%rdi", "%rsi", "%rdx", "%rcx");

my $code = <<____;
.text

# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
#
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
.type gcm_gmult_ssse3, \@abi-omnipotent
.globl gcm_gmult_ssse3
.align 16
gcm_gmult_ssse3:
.cfi_startproc
.Lgmult_seh_begin:
____

$code .= <<____ if ($win64);
        subq \$40, %rsp
.Lgmult_seh_allocstack:
        movdqa %xmm6, (%rsp)
.Lgmult_seh_save_xmm6:
        movdqa %xmm10, 16(%rsp)
.Lgmult_seh_save_xmm10:
.Lgmult_seh_prolog_end:
____

$code .= <<____;
        movdqu ($Xi), %xmm0
        movdqa .Lreverse_bytes(%rip), %xmm10
        movdqa .Llow4_mask(%rip), %xmm2

        # Reverse input bytes to deserialize.
        pshufb %xmm10, %xmm0

        # Split each byte into low (%xmm0) and high (%xmm1) halves.
        movdqa %xmm2, %xmm1
        pandn %xmm0, %xmm1
        psrld \$4, %xmm1
        pand %xmm2, %xmm0

        # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
        # that, due to bit reversal, %xmm3 contains bits that fall off when
        # right-shifting, not left-shifting.
        pxor %xmm2, %xmm2
        pxor %xmm3, %xmm3
____

my $call_counter = 0;
# process_rows returns assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
# low and high halves of the input. The result so far is passed in %xmm2.
# %xmm3 must be zero. On output, $Htable is advanced to the next row and %xmm2
# is updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
sub process_rows {
  my ($rows) = @_;
  $call_counter++;

  # Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
  # and psrlq shifts the two 64-bit halves separately. Each row produces 8
  # bits of carry, and the reduction needs an additional 7-bit shift. This
  # must fit in 64 bits so the reduction can use psrlq. This allows up to 7
  # rows at a time.
  die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);

  return <<____;
        movq \$$rows, %rax
.Loop_row_$call_counter:
        movdqa ($Htable), %xmm4
        leaq 16($Htable), $Htable

        # Right-shift %xmm2 and %xmm3 by 8 bits (one byte).
        movdqa %xmm2, %xmm6
        palignr \$1, %xmm3, %xmm6
        movdqa %xmm6, %xmm3
        psrldq \$1, %xmm2

        # Load the next table row and index the low and high bits of the input.
        # Note the low (respectively, high) half corresponds to more
        # (respectively, less) significant coefficients.
        movdqa %xmm4, %xmm5
        pshufb %xmm0, %xmm4
        pshufb %xmm1, %xmm5

        # Add the high half (%xmm5) without shifting.
        pxor %xmm5, %xmm2

        # Add the low half (%xmm4). This must be right-shifted by 4 bits.
        # First, add into the carry register (%xmm3).
        movdqa %xmm4, %xmm5
        psllq \$60, %xmm5
        movdqa %xmm5, %xmm6
        pslldq \$8, %xmm6
        pxor %xmm6, %xmm3

        # Next, add into %xmm2.
        psrldq \$8, %xmm5
        pxor %xmm5, %xmm2
        psrlq \$4, %xmm4
        pxor %xmm4, %xmm2

        subq \$1, %rax
        jnz .Loop_row_$call_counter

        # Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
        # x^7, so we shift and XOR four times.
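        # The XORs below happen at cumulative shifts of 0, 1, 2, and 7 bits,
        # one per polynomial term, reached with incremental psrlq shifts of 1,
        # 1, and 5 bits.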
        pxor %xmm3, %xmm2	# x^0 = 0
        psrlq \$1, %xmm3
        pxor %xmm3, %xmm2	# x^1 = x
        psrlq \$1, %xmm3
        pxor %xmm3, %xmm2	# x^(1+1) = x^2
        psrlq \$5, %xmm3
        pxor %xmm3, %xmm2	# x^(1+1+5) = x^7
        pxor %xmm3, %xmm3
____
}

# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);

$code .= <<____;
        # Store the result. Reverse bytes to serialize.
        pshufb %xmm10, %xmm2
        movdqu %xmm2, ($Xi)

        # Zero any registers which contain secrets.
        pxor %xmm0, %xmm0
        pxor %xmm1, %xmm1
        pxor %xmm2, %xmm2
        pxor %xmm3, %xmm3
        pxor %xmm4, %xmm4
        pxor %xmm5, %xmm5
        pxor %xmm6, %xmm6
____

$code .= <<____ if ($win64);
        movdqa (%rsp), %xmm6
        movdqa 16(%rsp), %xmm10
        addq \$40, %rsp
____

$code .= <<____;
        ret
.Lgmult_seh_end:
.cfi_endproc
.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
____

$code .= <<____;
# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
#
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
#                      size_t len);
.type gcm_ghash_ssse3, \@abi-omnipotent
.globl gcm_ghash_ssse3
.align 16
gcm_ghash_ssse3:
.Lghash_seh_begin:
.cfi_startproc
____

$code .= <<____ if ($win64);
        subq \$56, %rsp
.Lghash_seh_allocstack:
        movdqa %xmm6, (%rsp)
.Lghash_seh_save_xmm6:
        movdqa %xmm10, 16(%rsp)
.Lghash_seh_save_xmm10:
        movdqa %xmm11, 32(%rsp)
.Lghash_seh_save_xmm11:
.Lghash_seh_prolog_end:
____

$code .= <<____;
        movdqu ($Xi), %xmm0
        movdqa .Lreverse_bytes(%rip), %xmm10
        movdqa .Llow4_mask(%rip), %xmm11

        # This function only processes whole blocks.
        andq \$-16, $len

        # Reverse input bytes to deserialize. We maintain the running
        # total in %xmm0.
        pshufb %xmm10, %xmm0

        # Iterate over each block. On entry to each iteration, %xmm3 is zero.
        pxor %xmm3, %xmm3
.Loop_ghash:
        # Incorporate the next block of input.
        movdqu ($in), %xmm1
        pshufb %xmm10, %xmm1	# Reverse bytes.
        pxor %xmm1, %xmm0

        # Split each byte into low (%xmm0) and high (%xmm1) halves.
        movdqa %xmm11, %xmm1
        pandn %xmm0, %xmm1
        psrld \$4, %xmm1
        pand %xmm11, %xmm0

        # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
        # that, due to bit reversal, %xmm3 contains bits that fall off when
        # right-shifting, not left-shifting.
        pxor %xmm2, %xmm2
        # %xmm3 is already zero at this point.
____

# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);

$code .= <<____;
        movdqa %xmm2, %xmm0

        # Rewind $Htable for the next iteration.
        leaq -256($Htable), $Htable

        # Advance input and continue.
        leaq 16($in), $in
        subq \$16, $len
        jnz .Loop_ghash

        # Reverse bytes and store the result.
        pshufb %xmm10, %xmm0
        movdqu %xmm0, ($Xi)

        # Zero any registers which contain secrets.
        pxor %xmm0, %xmm0
        pxor %xmm1, %xmm1
        pxor %xmm2, %xmm2
        pxor %xmm3, %xmm3
        pxor %xmm4, %xmm4
        pxor %xmm5, %xmm5
        pxor %xmm6, %xmm6
____

$code .= <<____ if ($win64);
        movdqa (%rsp), %xmm6
        movdqa 16(%rsp), %xmm10
        movdqa 32(%rsp), %xmm11
        addq \$56, %rsp
____

$code .= <<____;
        ret
.Lghash_seh_end:
.cfi_endproc
.size gcm_ghash_ssse3,.-gcm_ghash_ssse3

.align 16
# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
.Lreverse_bytes:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
.Llow4_mask:
        .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
____

if ($win64) {
  # Add unwind metadata for SEH.
  #
  # TODO(davidben): This is all manual right now. Once we've added SEH tests,
  # add support for emitting these in x86_64-xlate.pl, probably based on MASM
  # and Yasm's unwind directives, and unify with CFI. Then upstream it to
  # replace the error-prone and non-standard custom handlers.

  # See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017
  my $UWOP_ALLOC_SMALL = 2;
  my $UWOP_SAVE_XMM128 = 8;

  $code .= <<____;
.section .pdata
.align 4
        .rva .Lgmult_seh_begin
        .rva .Lgmult_seh_end
        .rva .Lgmult_seh_info

        .rva .Lghash_seh_begin
        .rva .Lghash_seh_end
        .rva .Lghash_seh_info

.section .xdata
.align 8
.Lgmult_seh_info:
        .byte 1	# version 1, no flags
        .byte .Lgmult_seh_prolog_end-.Lgmult_seh_begin
        .byte 5	# num_slots = 1 + 2 + 2
        .byte 0	# no frame register

        .byte .Lgmult_seh_save_xmm10-.Lgmult_seh_begin
        .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
        .value 1

        .byte .Lgmult_seh_save_xmm6-.Lgmult_seh_begin
        .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
        .value 0

        .byte .Lgmult_seh_allocstack-.Lgmult_seh_begin
        .byte @{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]}

.align 8
.Lghash_seh_info:
        .byte 1	# version 1, no flags
        .byte .Lghash_seh_prolog_end-.Lghash_seh_begin
        .byte 7	# num_slots = 1 + 2 + 2 + 2
        .byte 0	# no frame register

        .byte .Lghash_seh_save_xmm11-.Lghash_seh_begin
        .byte @{[$UWOP_SAVE_XMM128 | (11 << 4)]}
        .value 2

        .byte .Lghash_seh_save_xmm10-.Lghash_seh_begin
        .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
        .value 1

        .byte .Lghash_seh_save_xmm6-.Lghash_seh_begin
        .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
        .value 0

        .byte .Lghash_seh_allocstack-.Lghash_seh_begin
        .byte @{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]}
____
}

print $code;
close STDOUT;
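# Illustrative sketch only (this helper is not used above, and its name is
# ours): each UWOP_SAVE_XMM128 entry above packs the prolog offset label, then
# a byte with the operation in the low nibble and the XMM register number in
# the high nibble, followed by a 16-bit slot holding the stack offset divided
# by 16. A helper composing one such entry could look like this:
sub uwop_save_xmm128_example {
  my ($save_label, $begin_label, $reg, $offset) = @_;
  die "XMM save offset must be 16-byte aligned" if $offset % 16;
  my $UWOP_SAVE_XMM128 = 8;
  return <<____;
        .byte $save_label-$begin_label
        .byte @{[$UWOP_SAVE_XMM128 | ($reg << 4)]}
        .value @{[$offset / 16]}
____
}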