Addition was not preserving inputs' property of being fully reduced.
Thanks to Brian Smith for reporting this.
(Imported from upstream's b62b2454fa
and
d3034d31e7c04b334dd245504dd4f56e513ca115.)
See also the discussion in this thread:
https://mta.openssl.org/pipermail/openssl-dev/2016-August/008179.html
Change-Id: I3731f949e2e2ef539dec656c58f1820cc09a56a6
Reviewed-on: https://boringssl-review.googlesource.com/11409
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
kris/onging/CECPQ3_patch15
@@ -94,6 +94,7 @@ ecp_nistz256_mul_by_2: | |||
push %r13 | |||
mov 8*0($a_ptr), $a0 | |||
xor $t4,$t4 | |||
mov 8*1($a_ptr), $a1 | |||
add $a0, $a0 # a0:a3+a0:a3 | |||
mov 8*2($a_ptr), $a2 | |||
@@ -104,7 +105,7 @@ ecp_nistz256_mul_by_2: | |||
adc $a2, $a2 | |||
adc $a3, $a3 | |||
mov $a1, $t1 | |||
sbb $t4, $t4 | |||
adc \$0, $t4 | |||
sub 8*0($a_ptr), $a0 | |||
mov $a2, $t2 | |||
@@ -112,14 +113,14 @@ ecp_nistz256_mul_by_2: | |||
sbb 8*2($a_ptr), $a2 | |||
mov $a3, $t3 | |||
sbb 8*3($a_ptr), $a3 | |||
test $t4, $t4 | |||
sbb \$0, $t4 | |||
cmovz $t0, $a0 | |||
cmovz $t1, $a1 | |||
cmovc $t0, $a0 | |||
cmovc $t1, $a1 | |||
mov $a0, 8*0($r_ptr) | |||
cmovz $t2, $a2 | |||
cmovc $t2, $a2 | |||
mov $a1, 8*1($r_ptr) | |||
cmovz $t3, $a3 | |||
cmovc $t3, $a3 | |||
mov $a2, 8*2($r_ptr) | |||
mov $a3, 8*3($r_ptr) | |||
@@ -1570,13 +1571,14 @@ $code.=<<___; | |||
.type __ecp_nistz256_add_toq,\@abi-omnipotent | |||
.align 32 | |||
__ecp_nistz256_add_toq: | |||
xor $t4,$t4 | |||
add 8*0($b_ptr), $a0 | |||
adc 8*1($b_ptr), $a1 | |||
mov $a0, $t0 | |||
adc 8*2($b_ptr), $a2 | |||
adc 8*3($b_ptr), $a3 | |||
mov $a1, $t1 | |||
sbb $t4, $t4 | |||
adc \$0, $t4 | |||
sub \$-1, $a0 | |||
mov $a2, $t2 | |||
@@ -1584,14 +1586,14 @@ __ecp_nistz256_add_toq: | |||
sbb \$0, $a2 | |||
mov $a3, $t3 | |||
sbb $poly3, $a3 | |||
test $t4, $t4 | |||
sbb \$0, $t4 | |||
cmovz $t0, $a0 | |||
cmovz $t1, $a1 | |||
cmovc $t0, $a0 | |||
cmovc $t1, $a1 | |||
mov $a0, 8*0($r_ptr) | |||
cmovz $t2, $a2 | |||
cmovc $t2, $a2 | |||
mov $a1, 8*1($r_ptr) | |||
cmovz $t3, $a3 | |||
cmovc $t3, $a3 | |||
mov $a2, 8*2($r_ptr) | |||
mov $a3, 8*3($r_ptr) | |||
@@ -1659,13 +1661,14 @@ __ecp_nistz256_subq: | |||
.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent | |||
.align 32 | |||
__ecp_nistz256_mul_by_2q: | |||
xor $t4, $t4 | |||
add $a0, $a0 # a0:a3+a0:a3 | |||
adc $a1, $a1 | |||
mov $a0, $t0 | |||
adc $a2, $a2 | |||
adc $a3, $a3 | |||
mov $a1, $t1 | |||
sbb $t4, $t4 | |||
adc \$0, $t4 | |||
sub \$-1, $a0 | |||
mov $a2, $t2 | |||
@@ -1673,14 +1676,14 @@ __ecp_nistz256_mul_by_2q: | |||
sbb \$0, $a2 | |||
mov $a3, $t3 | |||
sbb $poly3, $a3 | |||
test $t4, $t4 | |||
sbb \$0, $t4 | |||
cmovz $t0, $a0 | |||
cmovz $t1, $a1 | |||
cmovc $t0, $a0 | |||
cmovc $t1, $a1 | |||
mov $a0, 8*0($r_ptr) | |||
cmovz $t2, $a2 | |||
cmovc $t2, $a2 | |||
mov $a1, 8*1($r_ptr) | |||
cmovz $t3, $a3 | |||
cmovc $t3, $a3 | |||
mov $a2, 8*2($r_ptr) | |||
mov $a3, 8*3($r_ptr) | |||
@@ -2135,6 +2138,7 @@ $code.=<<___; | |||
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | |||
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | |||
xor $t4, $t4 | |||
add $acc0, $acc0 # a0:a3+a0:a3 | |||
lea $Rsqr(%rsp), $a_ptr | |||
adc $acc1, $acc1 | |||
@@ -2142,7 +2146,7 @@ $code.=<<___; | |||
adc $acc2, $acc2 | |||
adc $acc3, $acc3 | |||
mov $acc1, $t1 | |||
sbb $t4, $t4 | |||
adc \$0, $t4 | |||
sub \$-1, $acc0 | |||
mov $acc2, $t2 | |||
@@ -2150,15 +2154,15 @@ $code.=<<___; | |||
sbb \$0, $acc2 | |||
mov $acc3, $t3 | |||
sbb $poly3, $acc3 | |||
test $t4, $t4 | |||
sbb \$0, $t4 | |||
cmovz $t0, $acc0 | |||
cmovc $t0, $acc0 | |||
mov 8*0($a_ptr), $t0 | |||
cmovz $t1, $acc1 | |||
cmovc $t1, $acc1 | |||
mov 8*1($a_ptr), $t1 | |||
cmovz $t2, $acc2 | |||
cmovc $t2, $acc2 | |||
mov 8*2($a_ptr), $t2 | |||
cmovz $t3, $acc3 | |||
cmovc $t3, $acc3 | |||
mov 8*3($a_ptr), $t3 | |||
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | |||
@@ -2440,6 +2444,7 @@ $code.=<<___; | |||
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 | |||
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); | |||
xor $t4, $t4 | |||
add $acc0, $acc0 # a0:a3+a0:a3 | |||
lea $Rsqr(%rsp), $a_ptr | |||
adc $acc1, $acc1 | |||
@@ -2447,7 +2452,7 @@ $code.=<<___; | |||
adc $acc2, $acc2 | |||
adc $acc3, $acc3 | |||
mov $acc1, $t1 | |||
sbb $t4, $t4 | |||
adc \$0, $t4 | |||
sub \$-1, $acc0 | |||
mov $acc2, $t2 | |||
@@ -2455,15 +2460,15 @@ $code.=<<___; | |||
sbb \$0, $acc2 | |||
mov $acc3, $t3 | |||
sbb $poly3, $acc3 | |||
test $t4, $t4 | |||
sbb \$0, $t4 | |||
cmovz $t0, $acc0 | |||
cmovc $t0, $acc0 | |||
mov 8*0($a_ptr), $t0 | |||
cmovz $t1, $acc1 | |||
cmovc $t1, $acc1 | |||
mov 8*1($a_ptr), $t1 | |||
cmovz $t2, $acc2 | |||
cmovc $t2, $acc2 | |||
mov 8*2($a_ptr), $t2 | |||
cmovz $t3, $acc3 | |||
cmovc $t3, $acc3 | |||
mov 8*3($a_ptr), $t3 | |||
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); | |||
@@ -2615,14 +2620,14 @@ __ecp_nistz256_add_tox: | |||
sbb \$0, $a2 | |||
mov $a3, $t3 | |||
sbb $poly3, $a3 | |||
sbb \$0, $t4 | |||
bt \$0, $t4 | |||
cmovnc $t0, $a0 | |||
cmovnc $t1, $a1 | |||
cmovc $t0, $a0 | |||
cmovc $t1, $a1 | |||
mov $a0, 8*0($r_ptr) | |||
cmovnc $t2, $a2 | |||
cmovc $t2, $a2 | |||
mov $a1, 8*1($r_ptr) | |||
cmovnc $t3, $a3 | |||
cmovc $t3, $a3 | |||
mov $a2, 8*2($r_ptr) | |||
mov $a3, 8*3($r_ptr) | |||
@@ -2710,14 +2715,14 @@ __ecp_nistz256_mul_by_2x: | |||
sbb \$0, $a2 | |||
mov $a3, $t3 | |||
sbb $poly3, $a3 | |||
sbb \$0, $t4 | |||
bt \$0, $t4 | |||
cmovnc $t0, $a0 | |||
cmovnc $t1, $a1 | |||
cmovc $t0, $a0 | |||
cmovc $t1, $a1 | |||
mov $a0, 8*0($r_ptr) | |||
cmovnc $t2, $a2 | |||
cmovc $t2, $a2 | |||
mov $a1, 8*1($r_ptr) | |||
cmovnc $t3, $a3 | |||
cmovc $t3, $a3 | |||
mov $a2, 8*2($r_ptr) | |||
mov $a3, 8*3($r_ptr) | |||
@@ -54,24 +54,16 @@ typedef struct { | |||
typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; | |||
/* Arithmetic on field elements using Almost Montgomery Multiplication. The | |||
* "almost" means, in particular, that the inputs and outputs of these | |||
* functions are in the range [0, 2**BN_BITS2), not [0, P). Only | |||
* |ecp_nistz256_from_mont| outputs a fully reduced value in [0, P). Almost | |||
* Montgomery Arithmetic is described clearly in "Efficient Software | |||
* Implementations of Modular Exponentiation" by Shay Gueron. */ | |||
/* Modular neg: res = -a mod P, where res is not fully reduced. */ | |||
/* Modular neg: res = -a mod P. */ | |||
void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); | |||
/* Montgomery mul: res = a*b*2^-256 mod P, where res is not fully reduced. */ | |||
/* Montgomery mul: res = a*b*2^-256 mod P. */ | |||
void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], | |||
const BN_ULONG a[P256_LIMBS], | |||
const BN_ULONG b[P256_LIMBS]); | |||
/* Montgomery sqr: res = a*a*2^-256 mod P, where res is not fully reduced. */ | |||
/* Montgomery sqr: res = a*a*2^-256 mod P. */ | |||
void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], | |||
const BN_ULONG a[P256_LIMBS]); | |||
/* Convert a number from Montgomery domain, by multiplying with 1, where res | |||
* will be fully reduced mod P. */ | |||
/* Convert a number from Montgomery domain, by multiplying with 1. */ | |||
void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], | |||
const BN_ULONG in[P256_LIMBS]); | |||
@@ -527,10 +519,8 @@ static int ecp_nistz256_get_affine(const EC_GROUP *group, const EC_POINT *point, | |||
ecp_nistz256_mod_inverse(z_inv3, point_z); | |||
ecp_nistz256_sqr_mont(z_inv2, z_inv3); | |||
/* Unlike the |BN_mod_mul_montgomery|-based implementation, we cannot factor | |||
* out the two calls to |ecp_nistz256_from_mont| into one call, because | |||
* |ecp_nistz256_from_mont| must be the last operation to ensure that the | |||
* result is fully reduced mod P. */ | |||
/* TODO(davidben): The two calls to |ecp_nistz256_from_mont| may be factored | |||
* into one call now that other operations also reduce mod P. */ | |||
if (x != NULL) { | |||
BN_ULONG x_aff[P256_LIMBS]; | |||