镜像来自
https://github.com/henrydcase/nobs.git
synced 2024-11-26 00:51:22 +00:00
PERF: sidh-p503: Split sub and add into 2 uops instead of 3 (#8)
The performance improvement comes from the fact that on Skylake "add mem, reg" splits into 2 uops: one arithmetic uop and another one for loading the value from memory. However, changing the operand order to "add reg, mem" splits into 3 uops: one for the arithmetic op, one for the load, and an additional one for storing the result back. Using separate instructions for loading/storing helps to parallelize execution (load/store and arithmetic instructions are executed in parallel if possible). For details, see: https://www.agner.org/optimize/instruction_tables.pdf New: BenchmarkFp503StrongReduce-4 300000000 5.57 ns/op Old: BenchmarkFp503StrongReduce-4 200000000 8.60 ns/op This just improves one function, but more functions can be improved.
This commit is contained in:
父節點
e9ddb6fb45
當前提交
ea2ffa2d61
@ -54,14 +54,14 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
|
||||
MOVQ P503_7, R13
|
||||
|
||||
// Set x <- x - p
|
||||
SUBQ R8, ( 0)(REG_P1)
|
||||
SBBQ R8, ( 8)(REG_P1)
|
||||
SBBQ R8, (16)(REG_P1)
|
||||
SBBQ R9, (24)(REG_P1)
|
||||
SBBQ R10, (32)(REG_P1)
|
||||
SBBQ R11, (40)(REG_P1)
|
||||
SBBQ R12, (48)(REG_P1)
|
||||
SBBQ R13, (56)(REG_P1)
|
||||
MOVQ ( 0)(REG_P1), CX; SUBQ R8, CX; MOVQ CX, ( 0)(REG_P1)
|
||||
MOVQ ( 8)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, ( 8)(REG_P1)
|
||||
MOVQ (16)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, (16)(REG_P1)
|
||||
MOVQ (24)(REG_P1), CX; SBBQ R9, CX; MOVQ CX, (24)(REG_P1)
|
||||
MOVQ (32)(REG_P1), CX; SBBQ R10, CX; MOVQ CX, (32)(REG_P1)
|
||||
MOVQ (40)(REG_P1), CX; SBBQ R11, CX; MOVQ CX, (40)(REG_P1)
|
||||
MOVQ (48)(REG_P1), CX; SBBQ R12, CX; MOVQ CX, (48)(REG_P1)
|
||||
MOVQ (56)(REG_P1), CX; SBBQ R13, CX; MOVQ CX, (56)(REG_P1)
|
||||
|
||||
// Save carry flag indicating x-p < 0 as a mask
|
||||
SBBQ $0, AX
|
||||
@ -74,14 +74,14 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
|
||||
ANDQ AX, R12
|
||||
ANDQ AX, R13
|
||||
|
||||
ADDQ R8, ( 0)(REG_P1)
|
||||
ADCQ R8, ( 8)(REG_P1)
|
||||
ADCQ R8, (16)(REG_P1)
|
||||
ADCQ R9, (24)(REG_P1)
|
||||
ADCQ R10,(32)(REG_P1)
|
||||
ADCQ R11,(40)(REG_P1)
|
||||
ADCQ R12,(48)(REG_P1)
|
||||
ADCQ R13,(56)(REG_P1)
|
||||
MOVQ ( 0)(REG_P1), CX; ADDQ R8, CX; MOVQ CX, ( 0)(REG_P1)
|
||||
MOVQ ( 8)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, ( 8)(REG_P1)
|
||||
MOVQ (16)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, (16)(REG_P1)
|
||||
MOVQ (24)(REG_P1), CX; ADCQ R9, CX; MOVQ CX, (24)(REG_P1)
|
||||
MOVQ (32)(REG_P1), CX; ADCQ R10, CX; MOVQ CX, (32)(REG_P1)
|
||||
MOVQ (40)(REG_P1), CX; ADCQ R11, CX; MOVQ CX, (40)(REG_P1)
|
||||
MOVQ (48)(REG_P1), CX; ADCQ R12, CX; MOVQ CX, (48)(REG_P1)
|
||||
MOVQ (56)(REG_P1), CX; ADCQ R13, CX; MOVQ CX, (56)(REG_P1)
|
||||
|
||||
RET
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user