Implement fe_sq2_tt with fe_sq_tt.
fiat-crypto only generates fe_mul and fe_sq, but the original Ed25519 implementation we had also had fe_sq2 for computing 2*f^2. Previously, we inlined a modified version of fe_mul. Instead, we can implement it with fe_sq and fe_add.

Performance-wise, this does not seem to regress. If anything, it makes it faster?

Before (clang, run for 10 seconds):
Did 243000 Ed25519 key generation operations in 10025910us (24237.2 ops/sec)
Did 250000 Ed25519 signing operations in 10035580us (24911.4 ops/sec)
Did 73305 Ed25519 verify operations in 10071101us (7278.7 ops/sec)
Did 184000 Curve25519 base-point multiplication operations in 10040138us (18326.4 ops/sec)
Did 186000 Curve25519 arbitrary point multiplication operations in 10052721us (18502.5 ops/sec)

After (clang, run for 10 seconds):
Did 242424 Ed25519 key generation operations in 10013117us (24210.6 ops/sec)
Did 253000 Ed25519 signing operations in 10011744us (25270.3 ops/sec)
Did 73899 Ed25519 verify operations in 10048040us (7354.6 ops/sec)
Did 194000 Curve25519 base-point multiplication operations in 10005389us (19389.6 ops/sec)
Did 195000 Curve25519 arbitrary point multiplication operations in 10028443us (19444.7 ops/sec)

Before (clang + OPENSSL_SMALL, run for 10 seconds):
Did 144000 Ed25519 key generation operations in 10019344us (14372.2 ops/sec)
Did 146000 Ed25519 signing operations in 10011653us (14583.0 ops/sec)
Did 74052 Ed25519 verify operations in 10005789us (7400.9 ops/sec)
Did 150000 Curve25519 base-point multiplication operations in 10007468us (14988.8 ops/sec)
Did 91392 Curve25519 arbitrary point multiplication operations in 10057678us (9086.8 ops/sec)

After (clang + OPENSSL_SMALL, run for 10 seconds):
Did 144000 Ed25519 key generation operations in 10066724us (14304.6 ops/sec)
Did 148000 Ed25519 signing operations in 10062043us (14708.7 ops/sec)
Did 74820 Ed25519 verify operations in 10058557us (7438.4 ops/sec)
Did 151000 Curve25519 base-point multiplication operations in 10063492us (15004.7 ops/sec)
Did 90402 Curve25519 arbitrary point multiplication operations in 10049141us (8996.0 ops/sec)

Change-Id: I31e9f61833492c3ff2dfd78e1dee5e06f43c850f
Reviewed-on: https://boringssl-review.googlesource.com/24724
Reviewed-by: Adam Langley <agl@google.com>
This commit is contained in:
parent
a7bc94489f
commit
186df3a655
126
third_party/fiat/curve25519.c
vendored
126
third_party/fiat/curve25519.c
vendored
@ -820,126 +820,14 @@ static int fe_isnegative(const fe *f) {
|
||||
return s[0] & 1;
|
||||
}
|
||||
|
||||
// NOTE: based on fiat-crypto's fe_mul, specialized for in2 = 2*in1, i.e. it
// computes 2*in1^2.
|
||||
// fe_sq2_impl writes to |out| the ten-limb product of |in1| and 2*|in1|,
// i.e. 2*in1^2. Limbs alternate between 26 and 25 bits, as the alternating
// 0x1a/0x19 shifts and 0x3ffffff/0x1ffffff masks in the carry chain show, for
// 255 bits in total; high product columns are folded back in with weight 19,
// which corresponds to reduction modulo 2^255 - 19.
//
// assert_fe_loose and assert_fe are defined elsewhere; presumably they check
// the limb bounds of the input and output respectively -- confirm.
static void fe_sq2_impl(uint32_t out[10], const uint32_t in1[10]) {
|
||||
assert_fe_loose(in1);
|
||||
// Load the ten input limbs in1[9] .. in1[0].
{ const uint32_t x20 = in1[9];
|
||||
{ const uint32_t x21 = in1[8];
|
||||
{ const uint32_t x19 = in1[7];
|
||||
{ const uint32_t x17 = in1[6];
|
||||
{ const uint32_t x15 = in1[5];
|
||||
{ const uint32_t x13 = in1[4];
|
||||
{ const uint32_t x11 = in1[3];
|
||||
{ const uint32_t x9 = in1[2];
|
||||
{ const uint32_t x7 = in1[1];
|
||||
{ const uint32_t x5 = in1[0];
|
||||
// Doubled copies of the same limbs: these play the role of the second
// multiplicand, 2*in1. The doubling is done in 32 bits.
{ const uint32_t x38 = 2*in1[9];
|
||||
{ const uint32_t x39 = 2*in1[8];
|
||||
{ const uint32_t x37 = 2*in1[7];
|
||||
{ const uint32_t x35 = 2*in1[6];
|
||||
{ const uint32_t x33 = 2*in1[5];
|
||||
{ const uint32_t x31 = 2*in1[4];
|
||||
{ const uint32_t x29 = 2*in1[3];
|
||||
{ const uint32_t x27 = 2*in1[2];
|
||||
{ const uint32_t x25 = 2*in1[1];
|
||||
{ const uint32_t x23 = 2*in1[0];
|
||||
// 64-bit product columns x40 .. x58 (column k sums the limb products whose
// indices add up to k). Terms that pair two odd-indexed limbs carry an extra
// factor of 0x2; presumably this compensates for the odd limbs being one bit
// narrower -- confirm.
{ uint64_t x40 = ((uint64_t)x23 * x5);
|
||||
{ uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5));
|
||||
// NOTE(review): (0x2 * x25) is evaluated in 32 bits before widening, so this
// relies on in1[1] being small enough not to wrap -- presumably guaranteed by
// the bounds assert_fe_loose checks; confirm.
{ uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5));
|
||||
{ uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5));
|
||||
{ uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5));
|
||||
{ uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5));
|
||||
{ uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5));
|
||||
{ uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5));
|
||||
{ uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5));
|
||||
{ uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5));
|
||||
{ uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9));
|
||||
{ uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9));
|
||||
{ uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13));
|
||||
{ uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13));
|
||||
{ uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17));
|
||||
{ uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17));
|
||||
{ uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19))));
|
||||
{ uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21));
|
||||
{ uint64_t x58 = ((uint64_t)(0x2 * x38) * x20);
|
||||
// Fold the high columns x50 .. x58 into the low columns x40 .. x48. Each
// group of three statements adds 19 * v, computed as (v << 4) + (v << 1) + v.
// Column x49 has no high partner and is used as-is.
{ uint64_t x59 = (x48 + (x58 << 0x4));
|
||||
{ uint64_t x60 = (x59 + (x58 << 0x1));
|
||||
{ uint64_t x61 = (x60 + x58);
|
||||
{ uint64_t x62 = (x47 + (x57 << 0x4));
|
||||
{ uint64_t x63 = (x62 + (x57 << 0x1));
|
||||
{ uint64_t x64 = (x63 + x57);
|
||||
{ uint64_t x65 = (x46 + (x56 << 0x4));
|
||||
{ uint64_t x66 = (x65 + (x56 << 0x1));
|
||||
{ uint64_t x67 = (x66 + x56);
|
||||
{ uint64_t x68 = (x45 + (x55 << 0x4));
|
||||
{ uint64_t x69 = (x68 + (x55 << 0x1));
|
||||
{ uint64_t x70 = (x69 + x55);
|
||||
{ uint64_t x71 = (x44 + (x54 << 0x4));
|
||||
{ uint64_t x72 = (x71 + (x54 << 0x1));
|
||||
{ uint64_t x73 = (x72 + x54);
|
||||
{ uint64_t x74 = (x43 + (x53 << 0x4));
|
||||
{ uint64_t x75 = (x74 + (x53 << 0x1));
|
||||
{ uint64_t x76 = (x75 + x53);
|
||||
{ uint64_t x77 = (x42 + (x52 << 0x4));
|
||||
{ uint64_t x78 = (x77 + (x52 << 0x1));
|
||||
{ uint64_t x79 = (x78 + x52);
|
||||
{ uint64_t x80 = (x41 + (x51 << 0x4));
|
||||
{ uint64_t x81 = (x80 + (x51 << 0x1));
|
||||
{ uint64_t x82 = (x81 + x51);
|
||||
{ uint64_t x83 = (x40 + (x50 << 0x4));
|
||||
{ uint64_t x84 = (x83 + (x50 << 0x1));
|
||||
{ uint64_t x85 = (x84 + x50);
|
||||
// Carry chain over the ten folded columns, lowest first: each step keeps the
// low 26 or 25 bits as the limb and adds the remaining high bits to the next
// column.
{ uint64_t x86 = (x85 >> 0x1a);
|
||||
{ uint32_t x87 = ((uint32_t)x85 & 0x3ffffff);
|
||||
{ uint64_t x88 = (x86 + x82);
|
||||
{ uint64_t x89 = (x88 >> 0x19);
|
||||
{ uint32_t x90 = ((uint32_t)x88 & 0x1ffffff);
|
||||
{ uint64_t x91 = (x89 + x79);
|
||||
{ uint64_t x92 = (x91 >> 0x1a);
|
||||
{ uint32_t x93 = ((uint32_t)x91 & 0x3ffffff);
|
||||
{ uint64_t x94 = (x92 + x76);
|
||||
{ uint64_t x95 = (x94 >> 0x19);
|
||||
{ uint32_t x96 = ((uint32_t)x94 & 0x1ffffff);
|
||||
{ uint64_t x97 = (x95 + x73);
|
||||
{ uint64_t x98 = (x97 >> 0x1a);
|
||||
{ uint32_t x99 = ((uint32_t)x97 & 0x3ffffff);
|
||||
{ uint64_t x100 = (x98 + x70);
|
||||
{ uint64_t x101 = (x100 >> 0x19);
|
||||
{ uint32_t x102 = ((uint32_t)x100 & 0x1ffffff);
|
||||
{ uint64_t x103 = (x101 + x67);
|
||||
{ uint64_t x104 = (x103 >> 0x1a);
|
||||
{ uint32_t x105 = ((uint32_t)x103 & 0x3ffffff);
|
||||
{ uint64_t x106 = (x104 + x64);
|
||||
{ uint64_t x107 = (x106 >> 0x19);
|
||||
{ uint32_t x108 = ((uint32_t)x106 & 0x1ffffff);
|
||||
{ uint64_t x109 = (x107 + x61);
|
||||
{ uint64_t x110 = (x109 >> 0x1a);
|
||||
{ uint32_t x111 = ((uint32_t)x109 & 0x3ffffff);
|
||||
{ uint64_t x112 = (x110 + x49);
|
||||
{ uint64_t x113 = (x112 >> 0x19);
|
||||
{ uint32_t x114 = ((uint32_t)x112 & 0x1ffffff);
|
||||
// The carry out of the top limb wraps around: it is multiplied by 0x13 (19)
// and added to limb 0, followed by two more short carry steps into limbs 1
// and 2.
{ uint64_t x115 = (x87 + (0x13 * x113));
|
||||
{ uint32_t x116 = (uint32_t) (x115 >> 0x1a);
|
||||
{ uint32_t x117 = ((uint32_t)x115 & 0x3ffffff);
|
||||
{ uint32_t x118 = (x116 + x90);
|
||||
{ uint32_t x119 = (x118 >> 0x19);
|
||||
{ uint32_t x120 = (x118 & 0x1ffffff);
|
||||
out[0] = x117;
|
||||
out[1] = x120;
|
||||
// out[2] absorbs the final carry and is not re-masked afterwards.
out[2] = (x119 + x93);
|
||||
out[3] = x96;
|
||||
out[4] = x99;
|
||||
out[5] = x102;
|
||||
out[6] = x105;
|
||||
out[7] = x108;
|
||||
out[8] = x111;
|
||||
out[9] = x114;
|
||||
}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
|
||||
assert_fe(out);
|
||||
}
|
||||
|
||||
// fe_sq2_tt sets |h| to 2*f^2. Per the inline comments below, fe_sq_tt first
// stores f^2 in |h|, fe_add then adds |h| to itself into the loose temporary
// |tmp|, and fe_carry writes the carried result back into |h|.
//
// NOTE(review): this listing looks like a rendered diff. The fe_sq2_impl call
// is presumably the removed line and the fe_sq_tt/fe_add/fe_carry sequence its
// replacement -- confirm against the actual file before relying on this text.
static void fe_sq2_tt(fe *h, const fe *f) {
|
||||
fe_sq2_impl(h->v, f->v);
|
||||
// h = f^2
|
||||
fe_sq_tt(h, f);
|
||||
|
||||
// h = h + h
|
||||
// The sum goes through an fe_loose temporary because fe_add writes an
// fe_loose; fe_carry converts it back to an fe.
fe_loose tmp;
|
||||
fe_add(&tmp, h, h);
|
||||
fe_carry(h, &tmp);
|
||||
}
|
||||
|
||||
static void fe_pow22523(fe *out, const fe *z) {
|
||||
|
Loading…
Reference in New Issue
Block a user