Use Barrett reduction in CBC processing rather than tricks.

Division isn't constant-time on Intel chips, so the code was adding a
large multiple of md_size to try to force the operation to always take
the maximum amount of time.
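
To see why adding that multiple preserved the result, here is a tiny
standalone demo; md_size and the input value are arbitrary picks for
illustration, not values from the code:

  #include <stdio.h>

  /* Demonstrates the identity the old trick relied on: for even m,
   * (m >> 1) << 24 is an exact multiple of m (it equals m << 23), so
   * adding it doesn't change a value mod m but makes the dividend
   * large. */
  int main(void) {
    unsigned md_size = 20; /* e.g. SHA-1; hypothetical test value */
    unsigned x = 137;      /* stands in for mac_start - scan_start */
    unsigned div_spoiler = md_size >> 1;
    div_spoiler <<= (sizeof(div_spoiler) - 1) * 8; /* << 24 for 32-bit unsigned */
    printf("%u %u\n", x % md_size, (div_spoiler + x) % md_size); /* both 17 */
    return 0;
  }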

I'm less convinced, these days, that compilers won't get smart enough
to optimise that away, so use Barrett reduction instead.
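
For illustration, a minimal standalone sketch of the replacement for
the md_size == 20 (SHA-1) case; ct_ge_mask below is a simplified
stand-in for BoringSSL's constant_time_ge, not code from the tree:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Branch-free mask: all ones if a >= b, zero otherwise (simplified
   * stand-in for BoringSSL's constant_time_ge). */
  static uint32_t ct_ge_mask(uint32_t a, uint32_t b) {
    return (uint32_t)((((uint64_t)a - b) >> 63) - 1);
  }

  /* x mod 20 without a division: 1/20 is approximated as 25/512 and
   * the quotient error is fixed up by one conditional subtraction.
   * Exact for 0 <= x <= 853. */
  static uint32_t barrett_mod20(uint32_t x) {
    uint32_t q = (x * 25) >> 9;   /* floor(x * 25 / 512), <= floor(x / 20) */
    uint32_t r = x - q * 20;      /* 0 <= r < 40 over the stated range */
    r -= 20 & ct_ge_mask(r, 20);  /* subtract 20 iff r >= 20, branch-free */
    return r;
  }

  int main(void) {
    for (uint32_t x = 0; x <= 853; x++) {
      assert(barrett_mod20(x) == x % 20);
    }
    printf("barrett_mod20 matches %% over [0, 853]\n");
    return 0;
  }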

Change-Id: Ib8c514192682a2fcb4b1fb7e7c6dd1301d9888d0
Reviewed-on: https://boringssl-review.googlesource.com/6906
Reviewed-by: David Benjamin <davidben@chromium.org>
Reviewed-by: Adam Langley <alangley@gmail.com>
commit 54a8d7c14f
parent 91b2501f02
Author: Adam Langley
Date: 2016-01-15 11:16:41 -08:00


@@ -146,7 +146,6 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, unsigned md_size,
    * the MAC's position can only vary by 255 bytes. */
   unsigned scan_start = 0;
   unsigned i, j;
-  unsigned div_spoiler;
   unsigned rotate_offset;
 
   assert(orig_len >= in_len);
@@ -161,16 +160,46 @@
   if (orig_len > md_size + 255 + 1) {
     scan_start = orig_len - (md_size + 255 + 1);
   }
-  /* div_spoiler contains a multiple of md_size that is used to cause the
-   * modulo operation to be constant time. Without this, the time varies
-   * based on the amount of padding when running on Intel chips at least.
-   *
-   * The aim of right-shifting md_size is so that the compiler doesn't
-   * figure out that it can remove div_spoiler as that would require it
-   * to prove that md_size is always even, which I hope is beyond it. */
-  div_spoiler = md_size >> 1;
-  div_spoiler <<= (sizeof(div_spoiler) - 1) * 8;
-  rotate_offset = (div_spoiler + mac_start - scan_start) % md_size;
+  /* Ideally the next statement would be:
+   *
+   *   rotate_offset = (mac_start - scan_start) % md_size;
+   *
+   * However, division is not a constant-time operation (at least on Intel
+   * chips). Thus we enumerate the possible values of md_size and handle each
+   * separately. The value of |md_size| is public information (it's determined
+   * by the cipher suite in the ServerHello) so our timing can vary based on
+   * its value. */
+
+  rotate_offset = mac_start - scan_start;
+  /* rotate_offset can be, at most, 255 (bytes of padding) + 1 (padding length)
+   * + md_size = 256 + 48 (since SHA-384 is the largest hash) = 304. */
+  assert(rotate_offset <= 304);
+
+  if (md_size == 16) {
+    rotate_offset &= 15;
+  } else if (md_size == 20) {
+    /* 1/20 is approximated as 25/512 and then Barrett reduction is used.
+     * Analytically, this is correct for 0 <= rotate_offset <= 853. */
+    unsigned q = (rotate_offset * 25) >> 9;
+    rotate_offset -= q * 20;
+    rotate_offset -=
+        constant_time_select(constant_time_ge(rotate_offset, 20), 20, 0);
+  } else if (md_size == 32) {
+    rotate_offset &= 31;
+  } else if (md_size == 48) {
+    /* 1/48 is approximated as 10/512 and then Barrett reduction is used.
+     * Analytically, this is correct for 0 <= rotate_offset <= 768. */
+    unsigned q = (rotate_offset * 10) >> 9;
+    rotate_offset -= q * 48;
+    rotate_offset -=
+        constant_time_select(constant_time_ge(rotate_offset, 48), 48, 0);
+  } else {
+    /* This should be impossible therefore this path doesn't run in constant
+     * time. */
+    assert(0);
+    rotate_offset = rotate_offset % md_size;
+  }
 
   memset(rotated_mac, 0, md_size);
   for (i = scan_start, j = 0; i < orig_len; i++) {
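
The correctness windows quoted in the new comments (853 for the 25/512
approximation, 768 for the 10/512 one) are small enough to check
exhaustively. A throwaway harness along these lines, using plain % as
the reference since it only runs offline, confirms both:

  #include <stdint.h>
  #include <stdio.h>

  /* Exhaustively verify both Barrett approximations against the plain
   * modulo operator over their analytically claimed ranges. */
  int main(void) {
    for (uint32_t x = 0; x <= 853; x++) { /* md_size == 20 path */
      uint32_t q = (x * 25) >> 9;
      uint32_t r = x - q * 20;
      if (r >= 20) {
        r -= 20; /* a branch is fine in an offline test */
      }
      if (r != x % 20) {
        printf("mod-20 approximation fails at %u\n", x);
        return 1;
      }
    }
    for (uint32_t x = 0; x <= 768; x++) { /* md_size == 48 path */
      uint32_t q = (x * 10) >> 9;
      uint32_t r = x - q * 48;
      if (r >= 48) {
        r -= 48;
      }
      if (r != x % 48) {
        printf("mod-48 approximation fails at %u\n", x);
        return 1;
      }
    }
    printf("both approximations verified\n");
    return 0;
  }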