Implement fe_sq2_tt with fe_sq_tt.

fiat-crypto only generates fe_mul and fe_sq, but our original Ed25519
implementation also had fe_sq2 for computing 2*f^2. Previously, we
implemented it with an inlined, hand-edited copy of fe_mul.

Instead, we can implement it with fe_sq and fe_add. Performance-wise,
this does not appear to regress; if anything, it may be slightly faster.

Before (clang, run for 10 seconds):
Did 243000 Ed25519 key generation operations in 10025910us (24237.2 ops/sec)
Did 250000 Ed25519 signing operations in 10035580us (24911.4 ops/sec)
Did 73305 Ed25519 verify operations in 10071101us (7278.7 ops/sec)
Did 184000 Curve25519 base-point multiplication operations in 10040138us (18326.4 ops/sec)
Did 186000 Curve25519 arbitrary point multiplication operations in 10052721us (18502.5 ops/sec)

After (clang, run for 10 seconds):
Did 242424 Ed25519 key generation operations in 10013117us (24210.6 ops/sec)
Did 253000 Ed25519 signing operations in 10011744us (25270.3 ops/sec)
Did 73899 Ed25519 verify operations in 10048040us (7354.6 ops/sec)
Did 194000 Curve25519 base-point multiplication operations in 10005389us (19389.6 ops/sec)
Did 195000 Curve25519 arbitrary point multiplication operations in 10028443us (19444.7 ops/sec)

Before (clang + OPENSSL_SMALL, run for 10 seconds):
Did 144000 Ed25519 key generation operations in 10019344us (14372.2 ops/sec)
Did 146000 Ed25519 signing operations in 10011653us (14583.0 ops/sec)
Did 74052 Ed25519 verify operations in 10005789us (7400.9 ops/sec)
Did 150000 Curve25519 base-point multiplication operations in 10007468us (14988.8 ops/sec)
Did 91392 Curve25519 arbitrary point multiplication operations in 10057678us (9086.8 ops/sec)

After (clang + OPENSSL_SMALL, run for 10 seconds):
Did 144000 Ed25519 key generation operations in 10066724us (14304.6 ops/sec)
Did 148000 Ed25519 signing operations in 10062043us (14708.7 ops/sec)
Did 74820 Ed25519 verify operations in 10058557us (7438.4 ops/sec)
Did 151000 Curve25519 base-point multiplication operations in 10063492us (15004.7 ops/sec)
Did 90402 Curve25519 arbitrary point multiplication operations in 10049141us (8996.0 ops/sec)

Change-Id: I31e9f61833492c3ff2dfd78e1dee5e06f43c850f
Reviewed-on: https://boringssl-review.googlesource.com/24724
Reviewed-by: Adam Langley <agl@google.com>
This commit is contained in:
David Benjamin 2018-01-09 10:35:50 -05:00 committed by Adam Langley
parent a7bc94489f
commit 186df3a655

View File

@ -820,126 +820,14 @@ static int fe_isnegative(const fe *f) {
return s[0] & 1;
}
// fe_sq2_impl computes out = 2 * in1^2 on 10-limb field elements.
//
// NOTE: based on fiat-crypto fe_mul, edited for in2=2*in1
//
// The body is a fully unrolled multiplication of in1 by 2*in1: the x5..x21
// values are the limbs of in1 and the x23..x39 values are the same limbs
// doubled, standing in for the second operand. Limbs alternate between 26 and
// 25 bits, as the shift/mask pairs in the carry chain below show.
// assert_fe_loose and assert_fe are defined elsewhere; presumably they check
// the limb bounds of the input and output respectively.
static void fe_sq2_impl(uint32_t out[10], const uint32_t in1[10]) {
assert_fe_loose(in1);
// First operand: the limbs of in1 (x5 = in1[0] ... x21 = in1[8], x20 = in1[9]).
{ const uint32_t x20 = in1[9];
{ const uint32_t x21 = in1[8];
{ const uint32_t x19 = in1[7];
{ const uint32_t x17 = in1[6];
{ const uint32_t x15 = in1[5];
{ const uint32_t x13 = in1[4];
{ const uint32_t x11 = in1[3];
{ const uint32_t x9 = in1[2];
{ const uint32_t x7 = in1[1];
{ const uint32_t x5 = in1[0];
// Second operand: every limb of in1 doubled (x23 = 2*in1[0] ... x38 = 2*in1[9]).
{ const uint32_t x38 = 2*in1[9];
{ const uint32_t x39 = 2*in1[8];
{ const uint32_t x37 = 2*in1[7];
{ const uint32_t x35 = 2*in1[6];
{ const uint32_t x33 = 2*in1[5];
{ const uint32_t x31 = 2*in1[4];
{ const uint32_t x29 = 2*in1[3];
{ const uint32_t x27 = 2*in1[2];
{ const uint32_t x25 = 2*in1[1];
{ const uint32_t x23 = 2*in1[0];
// x40..x58 are the 19 columns of the schoolbook product, accumulated in 64
// bits. Terms in which both limb indices are odd carry an extra factor of 2.
{ uint64_t x40 = ((uint64_t)x23 * x5);
{ uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5));
{ uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5));
{ uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5));
{ uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5));
{ uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5));
{ uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5));
{ uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5));
{ uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5));
{ uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5));
{ uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9));
{ uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9));
{ uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13));
{ uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13));
{ uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17));
{ uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17));
{ uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19))));
{ uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21));
{ uint64_t x58 = ((uint64_t)(0x2 * x38) * x20);
// Fold the high columns x50..x58 into the low columns x40..x48, each scaled by
// 19 (computed as (v << 4) + (v << 1) + v). This is the wrap-around for
// reduction modulo 2^255 - 19. Column x49 has no high partner and is used as is.
{ uint64_t x59 = (x48 + (x58 << 0x4));
{ uint64_t x60 = (x59 + (x58 << 0x1));
{ uint64_t x61 = (x60 + x58);
{ uint64_t x62 = (x47 + (x57 << 0x4));
{ uint64_t x63 = (x62 + (x57 << 0x1));
{ uint64_t x64 = (x63 + x57);
{ uint64_t x65 = (x46 + (x56 << 0x4));
{ uint64_t x66 = (x65 + (x56 << 0x1));
{ uint64_t x67 = (x66 + x56);
{ uint64_t x68 = (x45 + (x55 << 0x4));
{ uint64_t x69 = (x68 + (x55 << 0x1));
{ uint64_t x70 = (x69 + x55);
{ uint64_t x71 = (x44 + (x54 << 0x4));
{ uint64_t x72 = (x71 + (x54 << 0x1));
{ uint64_t x73 = (x72 + x54);
{ uint64_t x74 = (x43 + (x53 << 0x4));
{ uint64_t x75 = (x74 + (x53 << 0x1));
{ uint64_t x76 = (x75 + x53);
{ uint64_t x77 = (x42 + (x52 << 0x4));
{ uint64_t x78 = (x77 + (x52 << 0x1));
{ uint64_t x79 = (x78 + x52);
{ uint64_t x80 = (x41 + (x51 << 0x4));
{ uint64_t x81 = (x80 + (x51 << 0x1));
{ uint64_t x82 = (x81 + x51);
{ uint64_t x83 = (x40 + (x50 << 0x4));
{ uint64_t x84 = (x83 + (x50 << 0x1));
{ uint64_t x85 = (x84 + x50);
// Carry chain from limb 0 up to limb 9: keep the low 26 (0x1a) or 25 (0x19)
// bits of each column and add the remaining high bits to the next column.
{ uint64_t x86 = (x85 >> 0x1a);
{ uint32_t x87 = ((uint32_t)x85 & 0x3ffffff);
{ uint64_t x88 = (x86 + x82);
{ uint64_t x89 = (x88 >> 0x19);
{ uint32_t x90 = ((uint32_t)x88 & 0x1ffffff);
{ uint64_t x91 = (x89 + x79);
{ uint64_t x92 = (x91 >> 0x1a);
{ uint32_t x93 = ((uint32_t)x91 & 0x3ffffff);
{ uint64_t x94 = (x92 + x76);
{ uint64_t x95 = (x94 >> 0x19);
{ uint32_t x96 = ((uint32_t)x94 & 0x1ffffff);
{ uint64_t x97 = (x95 + x73);
{ uint64_t x98 = (x97 >> 0x1a);
{ uint32_t x99 = ((uint32_t)x97 & 0x3ffffff);
{ uint64_t x100 = (x98 + x70);
{ uint64_t x101 = (x100 >> 0x19);
{ uint32_t x102 = ((uint32_t)x100 & 0x1ffffff);
{ uint64_t x103 = (x101 + x67);
{ uint64_t x104 = (x103 >> 0x1a);
{ uint32_t x105 = ((uint32_t)x103 & 0x3ffffff);
{ uint64_t x106 = (x104 + x64);
{ uint64_t x107 = (x106 >> 0x19);
{ uint32_t x108 = ((uint32_t)x106 & 0x1ffffff);
{ uint64_t x109 = (x107 + x61);
{ uint64_t x110 = (x109 >> 0x1a);
{ uint32_t x111 = ((uint32_t)x109 & 0x3ffffff);
{ uint64_t x112 = (x110 + x49);
{ uint64_t x113 = (x112 >> 0x19);
{ uint32_t x114 = ((uint32_t)x112 & 0x1ffffff);
// The carry out of limb 9 wraps around into limb 0, multiplied by 19 (0x13).
{ uint64_t x115 = (x87 + (0x13 * x113));
{ uint32_t x116 = (uint32_t) (x115 >> 0x1a);
{ uint32_t x117 = ((uint32_t)x115 & 0x3ffffff);
// Propagate the wrapped carry through limb 1; whatever is left is added to
// limb 2 below without a further mask.
{ uint32_t x118 = (x116 + x90);
{ uint32_t x119 = (x118 >> 0x19);
{ uint32_t x120 = (x118 & 0x1ffffff);
out[0] = x117;
out[1] = x120;
out[2] = (x119 + x93);
out[3] = x96;
out[4] = x99;
out[5] = x102;
out[6] = x105;
out[7] = x108;
out[8] = x111;
out[9] = x114;
}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
assert_fe(out);
}
// fe_sq2_tt sets h = 2 * f^2.
//
// The result is built from the generic field primitives: square f with
// fe_sq_tt, then add the square to itself. No dedicated squaring-and-doubling
// routine is called; a leftover call to one would be wasted work (its result
// is overwritten by fe_sq_tt) and would corrupt the input whenever h and f
// alias.
static void fe_sq2_tt(fe *h, const fe *f) {
  // h = f^2
  fe_sq_tt(h, f);

  // h = h + h. fe_add writes into an fe_loose temporary, which fe_carry then
  // converts back into an fe stored in h.
  fe_loose tmp;
  fe_add(&tmp, h, h);
  fe_carry(h, &tmp);
}
static void fe_pow22523(fe *out, const fe *z) {