
Perform stricter reduction in p256-x86_64-asm.pl.

Addition was not preserving the inputs' property of being fully reduced.

Thanks to Brian Smith for reporting this.

(Imported from upstream's b62b2454fa and
d3034d31e7c04b334dd245504dd4f56e513ca115.)

See also this thread:
https://mta.openssl.org/pipermail/openssl-dev/2016-August/008179.html

Change-Id: I3731f949e2e2ef539dec656c58f1820cc09a56a6
Reviewed-on: https://boringssl-review.googlesource.com/11409
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
David Benjamin authored 8 years ago, committed by Adam Langley
Commit 28d1dc8c51
2 changed files with 51 additions and 56 deletions:
  1. crypto/ec/asm/p256-x86_64-asm.pl (+45, -40)
  2. crypto/ec/p256-x86_64.c (+6, -16)
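
In short: the affected helpers previously subtracted the modulus only when the addition carried out of 256 bits, so a sum landing in [P, 2^256) was returned unreduced. The fixed sequences widen the sum by its carry bit (xor $t4,$t4 ... adc \$0,$t4), fold the borrow of the trial subtraction of P into that bit (sbb \$0,$t4), and select with cmovc, keeping the unreduced sum only when it is genuinely below P. Below is a minimal C model of the corrected pattern; the function name and the use of unsigned __int128 are illustrative assumptions for this sketch, not code from either project.

#include <stdint.h>

/* P-256 prime, little-endian 64-bit limbs. */
static const uint64_t P[4] = {0xffffffffffffffffu, 0x00000000ffffffffu,
                              0x0000000000000000u, 0xffffffff00000001u};

/* res = (a + b) mod P, fully reduced provided a, b < P. Mirrors the fixed
 * asm: the addition's carry is kept as a fifth limb (xor/adc $0), the
 * borrow of (carry:sum) - P is folded against it (sbb $0), and the cmovc
 * keeps the unreduced sum only when carry:sum < P. */
static void p256_add_model(uint64_t res[4], const uint64_t a[4],
                           const uint64_t b[4]) {
  uint64_t sum[4], sub[4];
  unsigned __int128 acc = 0;
  for (int i = 0; i < 4; i++) {
    acc += (unsigned __int128)a[i] + b[i];
    sum[i] = (uint64_t)acc;
    acc >>= 64;
  }
  uint64_t carry = (uint64_t)acc; /* fifth limb: 0 or 1 */

  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {
    unsigned __int128 d = (unsigned __int128)sum[i] - P[i] - borrow;
    sub[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1; /* borrow out of this limb */
  }
  /* Borrow of the full 5-limb subtraction: set iff carry:sum < P. */
  uint64_t keep_sum = (uint64_t)(borrow > carry);

  uint64_t mask = 0 - keep_sum; /* all-ones if the sum was already < P */
  for (int i = 0; i < 4; i++) {
    res[i] = (sub[i] & ~mask) | (sum[i] & mask);
  }
}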

crypto/ec/asm/p256-x86_64-asm.pl (+45, -40)

@@ -94,6 +94,7 @@ ecp_nistz256_mul_by_2:
 	push	%r13

 	mov	8*0($a_ptr), $a0
+	xor	$t4,$t4
 	mov	8*1($a_ptr), $a1
 	add	$a0, $a0		# a0:a3+a0:a3
 	mov	8*2($a_ptr), $a2
@@ -104,7 +105,7 @@ ecp_nistz256_mul_by_2:
 	adc	$a2, $a2
 	adc	$a3, $a3
 	mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4

 	sub	8*0($a_ptr), $a0
 	mov	$a2, $t2
@@ -112,14 +113,14 @@ ecp_nistz256_mul_by_2:
 	sbb	8*2($a_ptr), $a2
 	mov	$a3, $t3
 	sbb	8*3($a_ptr), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4

-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)

@@ -1570,13 +1571,14 @@ $code.=<<___;
 .type	__ecp_nistz256_add_toq,\@abi-omnipotent
 .align	32
 __ecp_nistz256_add_toq:
+	xor	$t4,$t4
 	add	8*0($b_ptr), $a0
 	adc	8*1($b_ptr), $a1
 	mov	$a0, $t0
 	adc	8*2($b_ptr), $a2
 	adc	8*3($b_ptr), $a3
 	mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4

 	sub	\$-1, $a0
 	mov	$a2, $t2
@@ -1584,14 +1586,14 @@ __ecp_nistz256_add_toq:
 	sbb	\$0, $a2
 	mov	$a3, $t3
 	sbb	$poly3, $a3
-	test	$t4, $t4
+	sbb	\$0, $t4

-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)

@@ -1659,13 +1661,14 @@ __ecp_nistz256_subq:
 .type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
 .align	32
 __ecp_nistz256_mul_by_2q:
+	xor	$t4, $t4
 	add	$a0, $a0		# a0:a3+a0:a3
 	adc	$a1, $a1
 	mov	$a0, $t0
 	adc	$a2, $a2
 	adc	$a3, $a3
 	mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4

 	sub	\$-1, $a0
 	mov	$a2, $t2
@@ -1673,14 +1676,14 @@ __ecp_nistz256_mul_by_2q:
 	sbb	\$0, $a2
 	mov	$a3, $t3
 	sbb	$poly3, $a3
-	test	$t4, $t4
+	sbb	\$0, $t4

-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)

@@ -2135,6 +2138,7 @@ $code.=<<___;
 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

+	xor	$t4, $t4
 	add	$acc0, $acc0		# a0:a3+a0:a3
 	lea	$Rsqr(%rsp), $a_ptr
 	adc	$acc1, $acc1
@@ -2142,7 +2146,7 @@ $code.=<<___;
 	adc	$acc2, $acc2
 	adc	$acc3, $acc3
 	mov	$acc1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4

 	sub	\$-1, $acc0
 	mov	$acc2, $t2
@@ -2150,15 +2154,15 @@ $code.=<<___;
 	sbb	\$0, $acc2
 	mov	$acc3, $t3
 	sbb	$poly3, $acc3
-	test	$t4, $t4
+	sbb	\$0, $t4

-	cmovz	$t0, $acc0
+	cmovc	$t0, $acc0
 	mov	8*0($a_ptr), $t0
-	cmovz	$t1, $acc1
+	cmovc	$t1, $acc1
 	mov	8*1($a_ptr), $t1
-	cmovz	$t2, $acc2
+	cmovc	$t2, $acc2
 	mov	8*2($a_ptr), $t2
-	cmovz	$t3, $acc3
+	cmovc	$t3, $acc3
 	mov	8*3($a_ptr), $t3

 	call	__ecp_nistz256_sub$x	# p256_sub(res_x, Rsqr, Hsqr);
@@ -2440,6 +2444,7 @@ $code.=<<___;
 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

+	xor	$t4, $t4
 	add	$acc0, $acc0		# a0:a3+a0:a3
 	lea	$Rsqr(%rsp), $a_ptr
 	adc	$acc1, $acc1
@@ -2447,7 +2452,7 @@ $code.=<<___;
 	adc	$acc2, $acc2
 	adc	$acc3, $acc3
 	mov	$acc1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4

 	sub	\$-1, $acc0
 	mov	$acc2, $t2
@@ -2455,15 +2460,15 @@ $code.=<<___;
 	sbb	\$0, $acc2
 	mov	$acc3, $t3
 	sbb	$poly3, $acc3
-	test	$t4, $t4
+	sbb	\$0, $t4

-	cmovz	$t0, $acc0
+	cmovc	$t0, $acc0
 	mov	8*0($a_ptr), $t0
-	cmovz	$t1, $acc1
+	cmovc	$t1, $acc1
 	mov	8*1($a_ptr), $t1
-	cmovz	$t2, $acc2
+	cmovc	$t2, $acc2
 	mov	8*2($a_ptr), $t2
-	cmovz	$t3, $acc3
+	cmovc	$t3, $acc3
 	mov	8*3($a_ptr), $t3

 	call	__ecp_nistz256_sub$x	# p256_sub(res_x, Rsqr, Hsqr);
@@ -2615,14 +2620,14 @@ __ecp_nistz256_add_tox:
 	sbb	\$0, $a2
 	mov	$a3, $t3
 	sbb	$poly3, $a3
+	sbb	\$0, $t4

-	bt	\$0, $t4
-	cmovnc	$t0, $a0
-	cmovnc	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovnc	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovnc	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)

@@ -2710,14 +2715,14 @@ __ecp_nistz256_mul_by_2x:
 	sbb	\$0, $a2
 	mov	$a3, $t3
 	sbb	$poly3, $a3
+	sbb	\$0, $t4

-	bt	\$0, $t4
-	cmovnc	$t0, $a0
-	cmovnc	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovnc	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovnc	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)


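To make the failure mode concrete before moving to the C-side comment updates: with a single-limb analogue, the old selection (reduce only on the addition's carry, the cmovz path) passes a sum in [p, 2^64) through unreduced, while the new selection (reduce unless the trial subtraction borrows past the carry, the cmovc path) fully reduces it. A hypothetical, self-contained demonstration; the one-limb modulus and variable names are illustrative, not values from either project.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* One-limb "modulus" standing in for P, chosen so a sum can land in
   * [p, 2^64) without the addition carrying out of 64 bits. */
  const uint64_t p = 0xfffffffffffffff1u;
  uint64_t a = p - 2, b = 3; /* a + b = p + 1 */

  uint64_t sum = a + b;
  int carry = sum < a; /* 0 here: no 64-bit overflow */

  /* Old selection: subtract p only when the addition carried. */
  uint64_t old_res = carry ? sum - p : sum; /* p + 1: not reduced */

  /* New selection: subtract p, keep the sum only if carry:sum < p. */
  uint64_t diff = sum - p;
  int keep_sum = (sum < p) && !carry;
  uint64_t new_res = keep_sum ? sum : diff; /* 1: fully reduced */

  printf("old: 0x%llx  new: 0x%llx\n", (unsigned long long)old_res,
         (unsigned long long)new_res);
  return 0;
}

Here the old path returns p + 1 while the new path returns 1.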
crypto/ec/p256-x86_64.c (+6, -16)

@@ -54,24 +54,16 @@ typedef struct {

 typedef P256_POINT_AFFINE PRECOMP256_ROW[64];

-/* Arithmetic on field elements using Almost Montgomery Multiplication. The
- * "almost" means, in particular, that the inputs and outputs of these
- * functions are in the range [0, 2**BN_BITS2), not [0, P). Only
- * |ecp_nistz256_from_mont| outputs a fully reduced value in [0, P). Almost
- * Montgomery Arithmetic is described clearly in "Efficient Software
- * Implementations of Modular Exponentiation" by Shay Gueron. */
-
-/* Modular neg: res = -a mod P, where res is not fully reduced. */
+/* Modular neg: res = -a mod P. */
 void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
-/* Montgomery mul: res = a*b*2^-256 mod P, where res is not fully reduced. */
+/* Montgomery mul: res = a*b*2^-256 mod P. */
 void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
                            const BN_ULONG a[P256_LIMBS],
                            const BN_ULONG b[P256_LIMBS]);
-/* Montgomery sqr: res = a*a*2^-256 mod P, where res is not fully reduced. */
+/* Montgomery sqr: res = a*a*2^-256 mod P. */
 void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
                            const BN_ULONG a[P256_LIMBS]);
-/* Convert a number from Montgomery domain, by multiplying with 1, where res
- * will be fully reduced mod P. */
+/* Convert a number from Montgomery domain, by multiplying with 1. */
 void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
                             const BN_ULONG in[P256_LIMBS]);

@@ -527,10 +519,8 @@ static int ecp_nistz256_get_affine(const EC_GROUP *group, const EC_POINT *point,
   ecp_nistz256_mod_inverse(z_inv3, point_z);
   ecp_nistz256_sqr_mont(z_inv2, z_inv3);

-  /* Unlike the |BN_mod_mul_montgomery|-based implementation, we cannot factor
-   * out the two calls to |ecp_nistz256_from_mont| into one call, because
-   * |ecp_nistz256_from_mont| must be the last operation to ensure that the
-   * result is fully reduced mod P. */
+  /* TODO(davidben): The two calls to |ecp_nistz256_from_mont| may be factored
+   * into one call now that other operations also reduce mod P. */

   if (x != NULL) {
     BN_ULONG x_aff[P256_LIMBS];

