From 272e3d8f6cd1f89052d7fd44966ce2636b668a64 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Sun, 18 Nov 2018 20:28:32 +0000 Subject: [PATCH] PERF: sidh-p503: Split sub and add into 2 uops instead of 3 The performance improvement comes from the fact that on Skylake "add mem, reg" splits into 2 uops - one arithmetic uop and another one for loading a value from mem. However, changing the operand order to "add reg, mem" splits into 3 uops: one for the arithmetic op, one for the load and one additional one for storing the result back. Using separate instructions for loading/storing helps to parallelize execution (loads/stores and arithmetic instructions are executed in parallel when possible). For details, see: https://www.agner.org/optimize/instruction_tables.pdf New: BenchmarkFp503StrongReduce-4 300000000 5.57 ns/op Old: BenchmarkFp503StrongReduce-4 200000000 8.60 ns/op This just improves one function, but more functions can be improved --- dh/sidh/p503/arith_amd64.s | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/dh/sidh/p503/arith_amd64.s b/dh/sidh/p503/arith_amd64.s index 5968754..7876552 100644 --- a/dh/sidh/p503/arith_amd64.s +++ b/dh/sidh/p503/arith_amd64.s @@ -54,14 +54,14 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8 MOVQ P503_7, R13 // Set x <- x - p - SUBQ R8, ( 0)(REG_P1) - SBBQ R8, ( 8)(REG_P1) - SBBQ R8, (16)(REG_P1) - SBBQ R9, (24)(REG_P1) - SBBQ R10, (32)(REG_P1) - SBBQ R11, (40)(REG_P1) - SBBQ R12, (48)(REG_P1) - SBBQ R13, (56)(REG_P1) + MOVQ ( 0)(REG_P1), CX; SUBQ R8, CX; MOVQ CX, ( 0)(REG_P1) + MOVQ ( 8)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, ( 8)(REG_P1) + MOVQ (16)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, (16)(REG_P1) + MOVQ (24)(REG_P1), CX; SBBQ R9, CX; MOVQ CX, (24)(REG_P1) + MOVQ (32)(REG_P1), CX; SBBQ R10, CX; MOVQ CX, (32)(REG_P1) + MOVQ (40)(REG_P1), CX; SBBQ R11, CX; MOVQ CX, (40)(REG_P1) + MOVQ (48)(REG_P1), CX; SBBQ R12, CX; MOVQ CX, (48)(REG_P1) + MOVQ (56)(REG_P1), CX; SBBQ R13, CX; MOVQ CX, (56)(REG_P1) // 
Save carry flag indicating x-p < 0 as a mask SBBQ $0, AX @@ -74,14 +74,14 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8 ANDQ AX, R12 ANDQ AX, R13 - ADDQ R8, ( 0)(REG_P1) - ADCQ R8, ( 8)(REG_P1) - ADCQ R8, (16)(REG_P1) - ADCQ R9, (24)(REG_P1) - ADCQ R10,(32)(REG_P1) - ADCQ R11,(40)(REG_P1) - ADCQ R12,(48)(REG_P1) - ADCQ R13,(56)(REG_P1) + MOVQ ( 0)(REG_P1), CX; ADDQ R8, CX; MOVQ CX, ( 0)(REG_P1) + MOVQ ( 8)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, ( 8)(REG_P1) + MOVQ (16)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, (16)(REG_P1) + MOVQ (24)(REG_P1), CX; ADCQ R9, CX; MOVQ CX, (24)(REG_P1) + MOVQ (32)(REG_P1), CX; ADCQ R10, CX; MOVQ CX, (32)(REG_P1) + MOVQ (40)(REG_P1), CX; ADCQ R11, CX; MOVQ CX, (40)(REG_P1) + MOVQ (48)(REG_P1), CX; ADCQ R12, CX; MOVQ CX, (48)(REG_P1) + MOVQ (56)(REG_P1), CX; ADCQ R13, CX; MOVQ CX, (56)(REG_P1) RET