2018-02-01 21:49:18 +00:00
|
|
|
/*
|
|
|
|
* Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
|
|
* Copyright (c) 2012, Intel Corporation. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* Licensed under the OpenSSL license (the "License"). You may not use
|
|
|
|
* this file except in compliance with the License. You can obtain a copy
|
|
|
|
* in the file LICENSE in the source distribution or at
|
|
|
|
* https://www.openssl.org/source/license.html
|
|
|
|
*
|
|
|
|
* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
|
|
|
|
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
|
|
|
|
* (2) University of Haifa, Israel
|
|
|
|
*/
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2018-02-13 20:14:30 +00:00
|
|
|
#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H
|
|
|
|
#define OPENSSL_HEADER_BN_RSAZ_EXP_H
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
#include <openssl/bn.h>
|
2019-01-26 20:57:51 +00:00
|
|
|
#include <openssl/cpu.h>
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2018-06-04 18:16:16 +01:00
|
|
|
#include "internal.h"
|
|
|
|
|
2019-01-26 20:57:51 +00:00
|
|
|
#if defined(__cplusplus)
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
|
|
|
|
#define RSAZ_ENABLED
|
|
|
|
|
|
|
|
|
2018-02-13 20:14:30 +00:00
|
|
|
// RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent|
|
|
|
|
// modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have
|
|
|
|
// the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|,
|
Replace |alloca| in |BN_mod_exp_mont_consttime|.
|alloca| is dangerous and poorly specified, according to any
description of |alloca|. It's also hard for some analysis tools to
reason about.
The code here assumed |alloca| is a macro, which isn't a valid
assumption. Depending on which headers are included and what
toolchain is being used, |alloca| may or may not be defined as a macro,
and this might change over time if/when toolchains are updated. Or, we
might be doing static analysis and/or dynamic analysis with a different
configuration w.r.t. the availability of |alloca| than production
builds use.
Regardless, the |alloca| code path only kicked in when the inputs are
840 bits or smaller. Since the multi-prime RSA support was removed, for
interesting RSA key sizes the input will be at least 1024 bits and this
code path won't be triggered since powerbufLen will be larger than 3072
bytes in those cases. ECC inversion via Fermat's Little Theorem has its
own constant-time exponentiation so there are no cases where smaller
inputs need to be fast.
The RSAZ code avoids the |OPENSSL_malloc| for 2048-bit RSA keys.
Increasingly the RSAZ code won't be used though, since it will be
skipped over on Broadwell+ CPUs. Generalize the RSAZ stack allocation
to work for non-RSAZ code paths. In order to ensure this doesn't cause
too much stack usage on platforms where RSAZ wasn't already being used,
only do so on x86-64, which already has this large stack size
requirement due to RSAZ.
This change will make it easier to refactor |BN_mod_exp_mont_consttime|
to do that more safely and in a way that's more compatible with various
analysis tools.
This is also a step towards eliminating the |uintptr_t|-based alignment
hack.
Since this change increases the number of times |OPENSSL_free| is
skipped, I've added an explicit |OPENSSL_cleanse| to ensure the
zeroization is done. This should be done regardless of the other changes
here.
Change-Id: I8a161ce2720a26127e85fff7513f394883e50b2e
Reviewed-on: https://boringssl-review.googlesource.com/28584
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
2018-05-17 04:24:20 +01:00
|
|
|
// respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a
|
|
|
|
// temporary buffer that must be aligned to |MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH|
|
|
|
|
// bytes.
|
2018-02-13 20:14:30 +00:00
|
|
|
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],
|
|
|
|
const BN_ULONG exponent[16],
|
|
|
|
const BN_ULONG m_norm[16], const BN_ULONG RR[16],
|
Replace |alloca| in |BN_mod_exp_mont_consttime|.
|alloca| is dangerous and poorly specified, according to any
description of |alloca|. It's also hard for some analysis tools to
reason about.
The code here assumed |alloca| is a macro, which isn't a valid
assumption. Depending on which headers are included and what
toolchain is being used, |alloca| may or may not be defined as a macro,
and this might change over time if/when toolchains are updated. Or, we
might be doing static analysis and/or dynamic analysis with a different
configuration w.r.t. the availability of |alloca| than production
builds use.
Regardless, the |alloca| code path only kicked in when the inputs are
840 bits or smaller. Since the multi-prime RSA support was removed, for
interesting RSA key sizes the input will be at least 1024 bits and this
code path won't be triggered since powerbufLen will be larger than 3072
bytes in those cases. ECC inversion via Fermat's Little Theorem has its
own constant-time exponentiation so there are no cases where smaller
inputs need to be fast.
The RSAZ code avoids the |OPENSSL_malloc| for 2048-bit RSA keys.
Increasingly the RSAZ code won't be used though, since it will be
skipped over on Broadwell+ CPUs. Generalize the RSAZ stack allocation
to work for non-RSAZ code paths. In order to ensure this doesn't cause
too much stack usage on platforms where RSAZ wasn't already being used,
only do so on x86-64, which already has this large stack size
requirement due to RSAZ.
This change will make it easier to refactor |BN_mod_exp_mont_consttime|
to do that more safely and in a way that's more compatible with various
analysis tools.
This is also a step towards eliminating the |uintptr_t|-based alignment
hack.
Since this change increases the number of times |OPENSSL_free| is
skipped, I've added an explicit |OPENSSL_cleanse| to ensure the
zeroization is done. This should be done regardless of the other changes
here.
Change-Id: I8a161ce2720a26127e85fff7513f394883e50b2e
Reviewed-on: https://boringssl-review.googlesource.com/28584
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
2018-05-17 04:24:20 +01:00
|
|
|
BN_ULONG k0,
|
|
|
|
BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);
|
2018-02-13 20:14:30 +00:00
|
|
|
|
2019-01-26 20:57:51 +00:00
|
|
|
OPENSSL_INLINE int rsaz_avx2_capable(void) {
|
|
|
|
const uint32_t *cap = OPENSSL_ia32cap_get();
|
|
|
|
return (cap[2] & (1 << 5)) != 0; // AVX2
|
|
|
|
}
|
|
|
|
|
|
|
|
OPENSSL_INLINE int rsaz_avx2_preferred(void) {
|
|
|
|
const uint32_t *cap = OPENSSL_ia32cap_get();
|
|
|
|
static const uint32_t kBMI2AndADX = (1 << 8) | (1 << 19);
|
|
|
|
if ((cap[2] & kBMI2AndADX) == kBMI2AndADX) {
|
|
|
|
// If BMI2 and ADX are available, x86_64-mont5.pl is faster.
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return (cap[2] & (1 << 5)) != 0; // AVX2
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Assembly functions.
|
|
|
|
|
|
|
|
// RSAZ represents 1024-bit integers using unsaturated 29-bit limbs stored in
|
|
|
|
// 64-bit integers. This requires 36 limbs, which are padded up to 40.
|
|
|
|
//
|
|
|
|
// See crypto/bn/asm/rsaz-avx2.pl for further details.
|
|
|
|
|
|
|
|
// rsaz_1024_norm2red_avx2 converts |norm| from |BIGNUM| to RSAZ representation
|
|
|
|
// and writes the result to |red|.
|
|
|
|
void rsaz_1024_norm2red_avx2(BN_ULONG red[40], const BN_ULONG norm[16]);
|
|
|
|
|
|
|
|
// rsaz_1024_mul_avx2 computes |a| * |b| mod |n| and writes the result to |ret|.
|
|
|
|
// Inputs and outputs are in Montgomery form, using RSAZ's representation. |k|
|
|
|
|
// is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|.
|
|
|
|
void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
|
|
|
|
const BN_ULONG b[40], const BN_ULONG n[40], BN_ULONG k);
|
|
|
|
|
|
|
|
// rsaz_1024_sqr_avx2 computes |a|^(2^|count|) mod |n| and writes the result to
|
|
|
|
// |ret|. Inputs and outputs are in Montgomery form, using RSAZ's
|
|
|
|
// representation. |k| is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|.
|
|
|
|
void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
|
|
|
|
const BN_ULONG n[40], BN_ULONG k, int count);
|
|
|
|
|
|
|
|
// rsaz_1024_scatter5_avx2 stores |val| at index |i| of |tbl|. |i| must be
|
|
|
|
// non-negative and at most 31. Note the table only uses 18 |BN_ULONG|s per entry
|
|
|
|
// instead of 40. It packs two 29-bit limbs into each |BN_ULONG| and only stores
|
|
|
|
// 36 limbs rather than the padded 40.
|
|
|
|
void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40],
|
|
|
|
int i);
|
|
|
|
|
|
|
|
// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|.
|
|
|
|
void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18],
|
|
|
|
int i);
|
|
|
|
|
|
|
|
// rsaz_1024_red2norm_avx2 converts |red| from RSAZ to |BIGNUM| representation
|
|
|
|
// and writes the result to |norm|.
|
|
|
|
void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], const BN_ULONG red[40]);
|
|
|
|
|
|
|
|
|
|
|
|
#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64
|
|
|
|
|
|
|
|
#if defined(__cplusplus)
|
|
|
|
} // extern "C"
|
|
|
|
#endif
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2018-02-13 20:14:30 +00:00
|
|
|
#endif // OPENSSL_HEADER_BN_RSAZ_EXP_H
|