Remove the arch-specific HOST_c2l/HOST_l2c implementations.

These do not appear to have much discernable effect on performance. Three
comparison runs:

Before:
Did 5414000 SHA-1 (16 bytes) operations in 1000009us (5413951.3 ops/sec): 86.6 MB/s
Did 1607000 SHA-1 (256 bytes) operations in 1000403us (1606352.6 ops/sec): 411.2 MB/s
Did 70000 SHA-1 (8192 bytes) operations in 1014426us (69004.5 ops/sec): 565.3 MB/s
Did 2991000 SHA-256 (16 bytes) operations in 1000204us (2990390.0 ops/sec): 47.8 MB/s
Did 741000 SHA-256 (256 bytes) operations in 1000371us (740725.2 ops/sec): 189.6 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1019327us (30412.2 ops/sec): 249.1 MB/s
Did 2340000 SHA-512 (16 bytes) operations in 1000312us (2339270.1 ops/sec): 37.4 MB/s
Did 880000 SHA-512 (256 bytes) operations in 1000879us (879227.2 ops/sec): 225.1 MB/s
Did 44000 SHA-512 (8192 bytes) operations in 1013355us (43420.1 ops/sec): 355.7 MB/s
After:
Did 5259000 SHA-1 (16 bytes) operations in 1000013us (5258931.6 ops/sec): 84.1 MB/s
Did 1547000 SHA-1 (256 bytes) operations in 1000011us (1546983.0 ops/sec): 396.0 MB/s
Did 69000 SHA-1 (8192 bytes) operations in 1001089us (68924.9 ops/sec): 564.6 MB/s
Did 2984000 SHA-256 (16 bytes) operations in 1000207us (2983382.4 ops/sec): 47.7 MB/s
Did 734000 SHA-256 (256 bytes) operations in 1000317us (733767.4 ops/sec): 187.8 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1021065us (30360.5 ops/sec): 248.7 MB/s
Did 2324000 SHA-512 (16 bytes) operations in 1000116us (2323730.4 ops/sec): 37.2 MB/s
Did 828000 SHA-512 (256 bytes) operations in 1001046us (827134.8 ops/sec): 211.7 MB/s
Did 43000 SHA-512 (8192 bytes) operations in 1003381us (42855.1 ops/sec): 351.1 MB/s

---

Before:
Did 5415000 SHA-1 (16 bytes) operations in 1000055us (5414702.2 ops/sec): 86.6 MB/s
Did 1604000 SHA-1 (256 bytes) operations in 1000524us (1603159.9 ops/sec): 410.4 MB/s
Did 71000 SHA-1 (8192 bytes) operations in 1007686us (70458.5 ops/sec): 577.2 MB/s
Did 2984000 SHA-256 (16 bytes) operations in 1000472us (2982592.2 ops/sec): 47.7 MB/s
Did 738000 SHA-256 (256 bytes) operations in 1000885us (737347.4 ops/sec): 188.8 MB/s
Did 30000 SHA-256 (8192 bytes) operations in 1020475us (29398.1 ops/sec): 240.8 MB/s
Did 2297000 SHA-512 (16 bytes) operations in 1000391us (2296102.2 ops/sec): 36.7 MB/s
Did 882000 SHA-512 (256 bytes) operations in 1000389us (881657.0 ops/sec): 225.7 MB/s
Did 43000 SHA-512 (8192 bytes) operations in 1001313us (42943.6 ops/sec): 351.8 MB/s
After:
Did 5228000 SHA-1 (16 bytes) operations in 1000035us (5227817.0 ops/sec): 83.6 MB/s
Did 1575000 SHA-1 (256 bytes) operations in 1000410us (1574354.5 ops/sec): 403.0 MB/s
Did 69000 SHA-1 (8192 bytes) operations in 1004180us (68712.8 ops/sec): 562.9 MB/s
Did 2884000 SHA-256 (16 bytes) operations in 1000093us (2883731.8 ops/sec): 46.1 MB/s
Did 718000 SHA-256 (256 bytes) operations in 1000413us (717703.6 ops/sec): 183.7 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1030257us (30089.6 ops/sec): 246.5 MB/s
Did 2286000 SHA-512 (16 bytes) operations in 1000172us (2285606.9 ops/sec): 36.6 MB/s
Did 979000 SHA-512 (256 bytes) operations in 1000384us (978624.2 ops/sec): 250.5 MB/s
Did 47000 SHA-512 (8192 bytes) operations in 1017846us (46175.9 ops/sec): 378.3 MB/s

---

Before:
Did 5429000 SHA-1 (16 bytes) operations in 1000104us (5428435.4 ops/sec): 86.9 MB/s
Did 1604000 SHA-1 (256 bytes) operations in 1000473us (1603241.7 ops/sec): 410.4 MB/s
Did 69000 SHA-1 (8192 bytes) operations in 1002621us (68819.6 ops/sec): 563.8 MB/s
Did 3021000 SHA-256 (16 bytes) operations in 1000152us (3020540.9 ops/sec): 48.3 MB/s
Did 735000 SHA-256 (256 bytes) operations in 1000048us (734964.7 ops/sec): 188.2 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1019902us (30395.1 ops/sec): 249.0 MB/s
Did 2301000 SHA-512 (16 bytes) operations in 1000207us (2300523.8 ops/sec): 36.8 MB/s
Did 881000 SHA-512 (256 bytes) operations in 1001122us (880012.6 ops/sec): 225.3 MB/s
Did 44000 SHA-512 (8192 bytes) operations in 1015313us (43336.4 ops/sec): 355.0 MB/s
After:
Did 5264000 SHA-1 (16 bytes) operations in 1000061us (5263678.9 ops/sec): 84.2 MB/s
Did 1587000 SHA-1 (256 bytes) operations in 1000293us (1586535.1 ops/sec): 406.2 MB/s
Did 71000 SHA-1 (8192 bytes) operations in 1007587us (70465.4 ops/sec): 577.3 MB/s
Did 2967000 SHA-256 (16 bytes) operations in 1000240us (2966288.1 ops/sec): 47.5 MB/s
Did 737000 SHA-256 (256 bytes) operations in 1000874us (736356.4 ops/sec): 188.5 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1019630us (30403.2 ops/sec): 249.1 MB/s
Did 2326000 SHA-512 (16 bytes) operations in 1000413us (2325039.8 ops/sec): 37.2 MB/s
Did 885000 SHA-512 (256 bytes) operations in 1000253us (884776.2 ops/sec): 226.5 MB/s
Did 44000 SHA-512 (8192 bytes) operations in 1013216us (43426.1 ops/sec): 355.7 MB/s

Change-Id: Ifd4500f4e9f41ffc0f73542141e8888b4d7f1e0b
Reviewed-on: https://boringssl-review.googlesource.com/6652
Reviewed-by: Adam Langley <alangley@gmail.com>
This commit is contained in:
David Benjamin 2015-12-07 13:32:56 -05:00 committed by Adam Langley
parent 8f2d4e344c
commit b04c905da9

View File

@ -140,89 +140,29 @@ extern "C" {
#if defined(DATA_ORDER_IS_BIG_ENDIAN)
#if !defined(PEDANTIC) && defined(__GNUC__) && __GNUC__ >= 2 && \
!defined(OPENSSL_NO_ASM)
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
/* The first macro gives a ~30-40% performance improvement in SHA-256 compiled
* with gcc on P4. This can only be done on x86, where unaligned data fetches
* are possible. */
#define HOST_c2l(c, l) \
(void)({ \
uint32_t r = *((const uint32_t *)(c)); \
__asm__("bswapl %0" : "=r"(r) : "0"(r)); \
(c) += 4; \
(l) = r; \
})
#define HOST_l2c(l, c) \
(void)({ \
uint32_t r = (l); \
__asm__("bswapl %0" : "=r"(r) : "0"(r)); \
*((uint32_t *)(c)) = r; \
(c) += 4; \
r; \
})
#elif defined(__aarch64__) && defined(__BYTE_ORDER__)
#if defined(__ORDER_LITTLE_ENDIAN__) && \
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define HOST_c2l(c, l) \
(void)({ \
uint32_t r; \
__asm__("rev %w0, %w1" : "=r"(r) : "r"(*((const uint32_t *)(c)))); \
(c) += 4; \
(l) = r; \
})
#define HOST_l2c(l, c) \
(void)({ \
uint32_t r; \
__asm__("rev %w0, %w1" : "=r"(r) : "r"((uint32_t)(l))); \
*((uint32_t *)(c)) = r; \
(c) += 4; \
r; \
})
#elif defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define HOST_c2l(c, l) (void)((l) = *((const uint32_t *)(c)), (c) += 4)
#define HOST_l2c(l, c) (*((uint32_t *)(c)) = (l), (c) += 4, (l))
#endif /* __aarch64__ && __BYTE_ORDER__ */
#endif /* ARCH */
#endif /* !PEDANTIC && GNUC && !NO_ASM */
#ifndef HOST_c2l
#define HOST_c2l(c, l) \
(void)(l = (((uint32_t)(*((c)++))) << 24), \
l |= (((uint32_t)(*((c)++))) << 16), \
l |= (((uint32_t)(*((c)++))) << 8), l |= (((uint32_t)(*((c)++)))))
#endif
#ifndef HOST_l2c
#define HOST_l2c(l, c) \
(void)(*((c)++) = (uint8_t)(((l) >> 24) & 0xff), \
*((c)++) = (uint8_t)(((l) >> 16) & 0xff), \
*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
*((c)++) = (uint8_t)(((l)) & 0xff))
#endif
#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
/* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
#define HOST_c2l(c, l) (void)((l) = *((const uint32_t *)(c)), (c) += 4)
#define HOST_l2c(l, c) (void)(*((uint32_t *)(c)) = (l), (c) += 4, l)
#endif /* OPENSSL_X86 || OPENSSL_X86_64 */
#ifndef HOST_c2l
#define HOST_c2l(c, l) \
(void)(l = (((uint32_t)(*((c)++)))), l |= (((uint32_t)(*((c)++))) << 8), \
l |= (((uint32_t)(*((c)++))) << 16), \
l |= (((uint32_t)(*((c)++))) << 24))
#endif
#ifndef HOST_l2c
#define HOST_l2c(l, c) \
(void)(*((c)++) = (uint8_t)(((l)) & 0xff), \
*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
*((c)++) = (uint8_t)(((l) >> 16) & 0xff), \
*((c)++) = (uint8_t)(((l) >> 24) & 0xff))
#endif
#endif /* DATA_ORDER */