diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c index b07111e0..5187f4a1 100644 --- a/crypto/fipsmodule/bn/exponentiation.c +++ b/crypto/fipsmodule/bn/exponentiation.c @@ -914,9 +914,6 @@ static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx, return 1; } -// BN_mod_exp_mont_conttime is based on the assumption that the L1 data cache -// line width of the target processor is at least the following value. -#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH (64) #define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK \ (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1) @@ -1004,6 +1001,14 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, // implementation assumes it can use |top| to size R. int top = mont->N.width; +#if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) + // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code + // paths. If we were to use separate static buffers for each then there is + // some chance that both large buffers would be allocated on the stack, + // causing the stack space requirement to be truly huge (~10KB). + alignas(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH) BN_ULONG + storage[MOD_EXP_CTIME_STORAGE_LEN]; +#endif #ifdef RSAZ_ENABLED // If the size of the operands allow it, perform the optimized // RSAZ exponentiation. For further information see @@ -1013,7 +1018,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (!bn_wexpand(rr, 16)) { goto err; } - RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]); + RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], + storage); rr->width = 16; rr->neg = 0; ret = 1; @@ -1037,27 +1043,24 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, powerbufLen += sizeof(m->d[0]) * (top * numPowers + ((2 * top) > numPowers ? 
(2 * top) : numPowers)); -#ifdef alloca - if (powerbufLen < 3072) { - powerbufFree = alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); - } else + +#if defined(OPENSSL_BN_ASM_MONT5) + if ((size_t)powerbufLen <= sizeof(storage)) { + powerbuf = (unsigned char *)storage; + } + // |storage| is more than large enough to handle 1024-bit inputs. + assert(powerbuf != NULL || top * BN_BITS2 > 1024); #endif - { - if ((powerbufFree = OPENSSL_malloc( - powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL) { + if (powerbuf == NULL) { + powerbufFree = + OPENSSL_malloc(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); + if (powerbufFree == NULL) { goto err; } + powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree); } - - powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree); OPENSSL_memset(powerbuf, 0, powerbufLen); -#ifdef alloca - if (powerbufLen < 3072) { - powerbufFree = NULL; - } -#endif - // lay down tmp and am right after powers table tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers); am.d = tmp.d + top; @@ -1264,6 +1267,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, err: BN_MONT_CTX_free(new_mont); + if (powerbuf != NULL && powerbufFree == NULL) { + OPENSSL_cleanse(powerbuf, powerbufLen); + } OPENSSL_free(powerbufFree); return (ret); } diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h index ab2a9256..9796831f 100644 --- a/crypto/fipsmodule/bn/internal.h +++ b/crypto/fipsmodule/bn/internal.h @@ -185,6 +185,16 @@ extern "C" { #error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT" #endif +// |BN_mod_exp_mont_consttime| is based on the assumption that the L1 data +// cache line width of the target processor is at least the following value. +#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH 64 + +// The number of |BN_ULONG|s needed for the |BN_mod_exp_mont_consttime| stack- +// allocated storage buffer. 
The buffer is just the right size for the RSAZ +// code path and is about 1KB larger than what's necessary (4480 bytes) for +// 1024-bit inputs. +#define MOD_EXP_CTIME_STORAGE_LEN \ + (((320u * 3u) + (32u * 9u * 16u)) / sizeof(BN_ULONG)) #define STATIC_BIGNUM(x) \ { \ diff --git a/crypto/fipsmodule/bn/rsaz_exp.c b/crypto/fipsmodule/bn/rsaz_exp.c index 97c58bad..79a04e2e 100644 --- a/crypto/fipsmodule/bn/rsaz_exp.c +++ b/crypto/fipsmodule/bn/rsaz_exp.c @@ -45,8 +45,13 @@ alignas(64) static const BN_ULONG two80[40] = { void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], const BN_ULONG base_norm[16], const BN_ULONG exponent[16], - const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0) { - alignas(64) uint8_t storage[(320 * 3) + (32 * 9 * 16)]; // 5.5KB + const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0, + BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]) { + OPENSSL_COMPILE_ASSERT(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH % 64 == 0, + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH_is_large_enough); + unsigned char *storage = (unsigned char *)storage_words; + assert((uintptr_t)storage % 64 == 0); + unsigned char *a_inv, *m, *result, *table_s = storage + (320 * 3), *R2 = table_s; // borrow if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) { diff --git a/crypto/fipsmodule/bn/rsaz_exp.h b/crypto/fipsmodule/bn/rsaz_exp.h index af973a5c..4819fdcd 100644 --- a/crypto/fipsmodule/bn/rsaz_exp.h +++ b/crypto/fipsmodule/bn/rsaz_exp.h @@ -20,11 +20,14 @@ // RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent| // modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have // the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|, -// respectively, extracted from |m_norm|'s |BN_MONT_CTX|. +// respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a +// temporary buffer that must be aligned to |MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH| +// bytes.
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16], const BN_ULONG exponent[16], const BN_ULONG m_norm[16], const BN_ULONG RR[16], - BN_ULONG k0); + BN_ULONG k0, + BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]); // rsaz_avx2_eligible returns one if |RSAZ_1024_mod_exp_avx2| should be used and // zero otherwise.