1
0
kopie van https://github.com/henrydcase/nobs.git synced 2024-11-22 15:18:57 +00:00

PERF: sidh-p503: Split sub and add into 2 uops instead of 3 (#8)

The performance improvement comes from the fact that on Skylake
"add mem, reg" splits into 2 uops - one arithmetic uop and another one
for loading the value from memory (operands are written source first,
destination last, so here the register is the destination).
However, changing the operand order to "add reg, mem", where memory is
the destination, splits into 3 uops: one for the arithmetic op, one for
the load and an additional one for storing the result back.
Using separate instructions for loading and storing helps to parallelize
execution (the load/store and arithmetic instructions are executed in
parallel where possible).

For details, see: https://www.agner.org/optimize/instruction_tables.pdf

New: BenchmarkFp503StrongReduce-4    300000000            5.57 ns/op
Old: BenchmarkFp503StrongReduce-4    200000000            8.60 ns/op

This improves just one function, but more functions can be improved in the same way.
This commit is contained in:
Henry Case 2018-11-18 20:50:41 +00:00 gecommit door Kris Kwiatkowski
bovenliggende e9ddb6fb45
commit ea2ffa2d61

Bestand weergeven

@ -54,14 +54,14 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
MOVQ P503_7, R13
// Set x <- x - p
SUBQ R8, ( 0)(REG_P1)
SBBQ R8, ( 8)(REG_P1)
SBBQ R8, (16)(REG_P1)
SBBQ R9, (24)(REG_P1)
SBBQ R10, (32)(REG_P1)
SBBQ R11, (40)(REG_P1)
SBBQ R12, (48)(REG_P1)
SBBQ R13, (56)(REG_P1)
MOVQ ( 0)(REG_P1), CX; SUBQ R8, CX; MOVQ CX, ( 0)(REG_P1)
MOVQ ( 8)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, ( 8)(REG_P1)
MOVQ (16)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, (16)(REG_P1)
MOVQ (24)(REG_P1), CX; SBBQ R9, CX; MOVQ CX, (24)(REG_P1)
MOVQ (32)(REG_P1), CX; SBBQ R10, CX; MOVQ CX, (32)(REG_P1)
MOVQ (40)(REG_P1), CX; SBBQ R11, CX; MOVQ CX, (40)(REG_P1)
MOVQ (48)(REG_P1), CX; SBBQ R12, CX; MOVQ CX, (48)(REG_P1)
MOVQ (56)(REG_P1), CX; SBBQ R13, CX; MOVQ CX, (56)(REG_P1)
// Save carry flag indicating x-p < 0 as a mask
SBBQ $0, AX
@ -74,14 +74,14 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
ANDQ AX, R12
ANDQ AX, R13
ADDQ R8, ( 0)(REG_P1)
ADCQ R8, ( 8)(REG_P1)
ADCQ R8, (16)(REG_P1)
ADCQ R9, (24)(REG_P1)
ADCQ R10,(32)(REG_P1)
ADCQ R11,(40)(REG_P1)
ADCQ R12,(48)(REG_P1)
ADCQ R13,(56)(REG_P1)
MOVQ ( 0)(REG_P1), CX; ADDQ R8, CX; MOVQ CX, ( 0)(REG_P1)
MOVQ ( 8)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, ( 8)(REG_P1)
MOVQ (16)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, (16)(REG_P1)
MOVQ (24)(REG_P1), CX; ADCQ R9, CX; MOVQ CX, (24)(REG_P1)
MOVQ (32)(REG_P1), CX; ADCQ R10, CX; MOVQ CX, (32)(REG_P1)
MOVQ (40)(REG_P1), CX; ADCQ R11, CX; MOVQ CX, (40)(REG_P1)
MOVQ (48)(REG_P1), CX; ADCQ R12, CX; MOVQ CX, (48)(REG_P1)
MOVQ (56)(REG_P1), CX; ADCQ R13, CX; MOVQ CX, (56)(REG_P1)
RET