Sfoglia il codice sorgente

perf p503: improve performance by using branch predictor

On x86 there are several implementations of Montgomery reduction and
multiplication. At runtime, the library chooses the most performant
implementation according to information received from CPUID. The chosen
implementation is then assigned to a function pointer which gets CALL'd
during program execution.

The problem is a combination of the following: the function pointer points
to an assembly function, its arguments are themselves pointers to data, and
the arguments passed to the call are allocated on the stack before the call.

As a variable can't be tagged as go:noescape, the Go compiler will move
arguments passed to the call from the stack to the heap. This causes
significant performance degradation.

The solution is not to use a function pointer. Instead, both redc and mul
check CPU capabilities at runtime and JMP to the correct part of the
text. Thanks to branch prediction the cost of this solution is minimal
and smaller than that of a function call. This patch also removes all
heap allocations done by functions operating on the prime field.

The other goal of this patch is to remove x86-specific code from
arith_decl.go, which will also be compiled for ARM in the near future.

Results:
--------
benchmark                            old ns/op     new ns/op     delta
BenchmarkFp2ElementMul               428           194           -54.67%
BenchmarkFp2ElementInv               67353         34447         -48.86%
BenchmarkFp2ElementSquare            335           139           -58.51%
BenchmarkFp503MontgomeryReduce       26.3          22.8          -13.31%
BenchmarkSidhKeyAgreementP503        12451199      6402396       -48.58%
BenchmarkAliceKeyGenPubP503          7349333       3590954       -51.14%
BenchmarkBobKeyGenPubP503            8253676       4094141       -50.40%
BenchmarkSharedSecretAliceP503       5888022       2916821       -50.46%
BenchmarkSharedSecretBobP503         6908018       3436713       -50.25%

Comparison with P751:
---------------------
BenchmarkBobKeyGenPubP751           13616876
BenchmarkBobKeyGenPubP503            4094141
BenchmarkSharedSecretAliceP751       9870216
BenchmarkSharedSecretAliceP503       2916821

There is a cost - possibly a CMPB & JE each time mul and redc are called.
Also, the patch introduces the arith_amd64.go file, which keeps some
variables - this currently needs to be done for each field. It may be
possible to get rid of it at some point.
master
Kris Kwiatkowski 6 anni fa
committed by Kris Kwiatkowski
parent
commit
e6c10dbbe5
4 ha cambiato i file con 132 aggiunte e 91 eliminazioni
  1. +24
    -0
      p503/arith_amd64.go
  2. +39
    -33
      p503/arith_amd64.s
  3. +66
    -12
      p503/arith_amd64_test.go
  4. +3
    -46
      p503/arith_decl.go

+ 24
- 0
p503/arith_amd64.go Vedi File

@@ -0,0 +1,24 @@
// +build amd64,!noasm

package p503

import cpu "github.com/cloudflare/p751sidh/internal/utils"

// There are a couple of reasons for having these variables here:
// * to have access to them from assembly
// TODO(kk): Is there a way to access a variable from a different package?
// If there is, then this file could probably be moved to internal
// and we wouldn't need to have many copies of it
// * make it easy to vendor the library
// * make it possible to test all functionality
var useMULX bool
var useADXMULX bool

func recognizecpu() {
useMULX = cpu.HasBMI2
useADXMULX = cpu.HasADX && cpu.HasBMI2
}

// init runs recognizecpu at package initialization so that useMULX and
// useADXMULX are populated from the detected CPU features.
func init() {
recognizecpu()
}

+ 39
- 33
p503/arith_amd64.s Vedi File

@@ -354,7 +354,8 @@
// Template for calculating the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
// customized with schoolbook multiplication for 128 x 320-bit number.
// This macro reuses memory of IN value and *changes* it.
// This macro reuses memory of IN value and *changes* it. Smashes registers
// R[8-15], BX, CX
// Input:
// * IN: 1024-bit number to be reduced
// * MULS: either MULS_128x320_MULX or MULS_128x320_MULXADX
@@ -690,34 +691,23 @@ TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24

RET

TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
// Actual implementation
TEXT ·fp503Mul(SB), NOSPLIT, $104-24
MOVQ z+ 0(FP), CX
MOVQ x+ 8(FP), REG_P2
MOVQ y+16(FP), REG_P1
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
RET
MOVQ x+ 8(FP), REG_P1
MOVQ y+16(FP), REG_P2

TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
// Actual implementation
MOVQ z+ 0(FP), CX
MOVQ x+ 8(FP), REG_P2
MOVQ y+16(FP), REG_P1
MUL(CX, REG_P2, REG_P1, MULS256_MULX)
RET
// Check whether to use optimized implementation
CMPB ·useADXMULX(SB), $1
JE mul_with_mulx_adx
CMPB ·useMULX(SB), $1
JE mul_with_mulx

TEXT ·mul(SB), $96-24
// Uses variant of Karatsuba method.
// Generic x86 implementation (below) uses variant of Karatsuba method.
//
// Here we store the destination in CX instead of in REG_P3 because the
// multiplication instructions use DX as an implicit destination
// operand: MULQ $REG sets DX:AX <-- AX * $REG.

// Actual implementation
MOVQ z+0(FP), CX
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2

// RAX and RDX will be used for a mask (0-borrow)
XORQ AX, AX

@@ -1186,12 +1176,28 @@ TEXT ·mul(SB), $96-24
ADCQ $0, DX; MOVQ DX, (104)(CX)
ADCQ $0, DI; MOVQ DI, (112)(CX)
ADCQ $0, SI; MOVQ SI, (120)(CX)
RET

mul_with_mulx_adx:
// Mul implementation for CPUs supporting two independent carry chain
// (ADOX/ADCX) instructions and carry-less MULX multiplier
MUL(CX, REG_P1, REG_P2, MULS256_MULXADX)
RET

TEXT ·redc(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1
mul_with_mulx:
// Mul implementation for CPUs supporting carry-less MULX multiplier.
MUL(CX, REG_P1, REG_P2, MULS256_MULX)
RET

TEXT ·fp503MontgomeryReduce(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1

// Check whether to use optimized implementation
CMPB ·useADXMULX(SB), $1
JE redc_with_mulx_adx
CMPB ·useMULX(SB), $1
JE redc_with_mulx

MOVQ (REG_P1), R11
MOVQ P503P1_3, AX
@@ -1495,19 +1501,19 @@ TEXT ·redc(SB), $0-16
ADCQ $0, R10
ADDQ (120)(REG_P1), R10 // Z7
MOVQ R10, (56)(REG_P2) // Z7

RET

TEXT ·redcWithMULX(SB), $0-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
REDC(DI, SI, MULS_128x320_MULX)
redc_with_mulx_adx:
// Implementation of the Montgomery reduction for CPUs
// supporting two independent carry chain (ADOX/ADCX)
// instructions and carry-less MULX multiplier
REDC(REG_P2, REG_P1, MULS_128x320_MULXADX)
RET

TEXT ·redcWithMULXADX(SB), $0-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
REDC(DI, SI, MULS_128x320_MULXADX)
redc_with_mulx:
// Implementation of the Montgomery reduction for CPUs
// supporting carry-less MULX multiplier.
REDC(REG_P2, REG_P1, MULS_128x320_MULX)
RET

TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24


+ 66
- 12
p503/arith_amd64_test.go Vedi File

@@ -10,13 +10,32 @@ import (
"testing/quick"
)

// OptimFlag is a bit flag used by the tests to select which implementation
// of the arithmetic routines is exercised.
type OptimFlag uint

// Implementation selectors. Every constant is declared with the OptimFlag
// type (previously only the first one was typed; the others were untyped
// integer constants). The names are kept as-is because callers refer to them.
const (
	// kUse_MUL selects neither optimized variant; the tests clear both
	// useMULX and useADXMULX when only this flag is given.
	kUse_MUL OptimFlag = 1 << iota
	// kUse_MULX makes the tests set useMULX.
	kUse_MULX
	// kUse_MULXADX makes the tests set useADXMULX.
	kUse_MULXADX
)

// Utility function used for testing Mul implementations. Tests caller provided
// mulFunc against mul()
func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) {
func testMul(t *testing.T, f1, f2 OptimFlag) {
doMulTest := func(multiplier, multiplicant FpElement) bool {
defer recognizecpu()
var resMulRef, resMulOptim FpElementX2
mul(&resMulRef, &multiplier, &multiplicant)
mulFunc(&resMulOptim, &multiplier, &multiplicant)

// Compute multiplier*multiplicant with first implementation
useMULX = (kUse_MULX & f1) == kUse_MULX
useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX
fp503Mul(&resMulOptim, &multiplier, &multiplicant)

// Compute multiplier*multiplicant with second implementation
useMULX = (kUse_MULX & f2) == kUse_MULX
useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX
fp503Mul(&resMulRef, &multiplier, &multiplicant)

// Compare results
return reflect.DeepEqual(resMulRef, resMulOptim)
}

@@ -27,13 +46,24 @@ func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) {

// Utility function used for testing REDC implementations. Tests caller provided
// redcFunc against redc()
func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) {
func testRedc(t *testing.T, f1, f2 OptimFlag) {
doRedcTest := func(aRR FpElementX2) bool {
var resRedcRef, resRedcOptim FpElement
defer recognizecpu()
var resRedcF1, resRedcF2 FpElement
var aRRcpy = aRR
redc(&resRedcRef, &aRR)
redcFunc(&resRedcOptim, &aRRcpy)
return reflect.DeepEqual(resRedcRef, resRedcOptim)

// Compute redc with first implementation
useMULX = (kUse_MULX & f1) == kUse_MULX
useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX
fp503MontgomeryReduce(&resRedcF1, &aRR)

// Compute redc with second implementation
useMULX = (kUse_MULX & f2) == kUse_MULX
useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX
fp503MontgomeryReduce(&resRedcF2, &aRRcpy)

// Compare results
return reflect.DeepEqual(resRedcF2, resRedcF1)
}

if err := quick.Check(doRedcTest, quickCheckConfig); err != nil {
@@ -43,32 +73,56 @@ func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) {

// Ensures correctness of implementation of mul operation which uses MULX
func TestMulWithMULX(t *testing.T) {
defer recognizecpu()
if !cpu.HasBMI2 {
t.Skip("MULX not supported by the platform")
}
testMul(t, mulWithMULX)
testMul(t, kUse_MULX, kUse_MUL)
}

// Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX
func TestMulWithMULXADX(t *testing.T) {
defer recognizecpu()
if !(cpu.HasADX && cpu.HasBMI2) {
t.Skip("MULX, ADCX and ADOX not supported by the platform")
}
testMul(t, mulWithMULXADX)
testMul(t, kUse_MULXADX, kUse_MUL)
}

// TestMulWithMULXADXAgainstMULX cross-checks the two optimized mul variants:
// the result computed with the MULX flag must equal the result computed with
// the MULX+ADOX/ADCX flag. The test is skipped unless the CPU reports both
// ADX and BMI2; CPU flags are re-detected on exit.
func TestMulWithMULXADXAgainstMULX(t *testing.T) {
	defer recognizecpu()
	if !cpu.HasADX || !cpu.HasBMI2 {
		t.Skip("MULX, ADCX and ADOX not supported by the platform")
	}
	testMul(t, kUse_MULX, kUse_MULXADX)
}

// Ensures correctness of Montgomery reduction implementation which uses MULX
func TestRedcWithMULX(t *testing.T) {
defer recognizecpu()
if !cpu.HasBMI2 {
t.Skip("MULX not supported by the platform")
}
testRedc(t, redcWithMULX)
testRedc(t, kUse_MULX, kUse_MUL)
}

// TestRedcWithMULXADX checks the Montgomery reduction selected by the
// MULX+ADX flag against the reduction run with neither optimization flag
// set. The test is skipped unless the CPU reports both ADX and BMI2; CPU
// flags are re-detected on exit.
func TestRedcWithMULXADX(t *testing.T) {
	defer recognizecpu()
	if !cpu.HasADX || !cpu.HasBMI2 {
		t.Skip("MULX, ADCX and ADOX not supported by the platform")
	}
	testRedc(t, kUse_MULXADX, kUse_MUL)
}

// Ensures correctness of Montgomery reduction implementation which uses MULX
// and ADX.
func TestRedcWithMULXADXAgainstMULX(t *testing.T) {
defer recognizecpu()
if !(cpu.HasADX && cpu.HasBMI2) {
t.Skip("MULX, ADCX and ADOX not supported by the platform")
}
testRedc(t, redcWithMULXADX)
testRedc(t, kUse_MULXADX, kUse_MULX)
}

+ 3
- 46
p503/arith_decl.go Vedi File

@@ -4,7 +4,6 @@ package p503

import (
. "github.com/cloudflare/sidh/internal/isogeny"
cpu "github.com/cloudflare/sidh/internal/utils"
)

// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
@@ -37,53 +36,11 @@ func fp503X2SubLazy(z, x, y *FpElementX2)
//go:noescape
func fp503StrongReduce(x *FpElement)

// Function pointer to function computing z = x * y.
// Concrete implementation depends on capabilities of the CPU which
// are resolved at runtime. CPUs with ADCX, ADOX and MULX support
// run most optimized implementation
var fp503Mul func(z *FpElementX2, x, y *FpElement)

// Mul implementattion for legacy CPUs
//go:noescape
func mul(z *FpElementX2, x, y *FpElement)

// Mul implementation for CPUs supporting carry-less MULX multiplier.
//go:noescape
func mulWithMULX(z *FpElementX2, x, y *FpElement)

// Mul implementation for CPUs supporting two independent carry chain
// (ADOX/ADCX) instructions and carry-less MULX multiplier
// Computes z = x * y.
//go:noescape
func mulWithMULXADX(z *FpElementX2, x, y *FpElement)
func fp503Mul(z *FpElementX2, x, y *FpElement)

// Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value
// of x may be changed. z=x not allowed.
var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2)

func redc(z *FpElement, x *FpElementX2)

// Mul implementation for CPUs supporting carry-less MULX multiplier.
//go:noescape
func redcWithMULX(z *FpElement, x *FpElementX2)

// Mul implementation for CPUs supporting two independent carry chain
// (ADOX/ADCX) instructions and carry-less MULX multiplier
//go:noescape
func redcWithMULXADX(z *FpElement, x *FpElementX2)

// On initialization, set the fp503Mul function pointer to the
// fastest implementation depending on CPU capabilities.
func init() {
if cpu.HasBMI2 {
if cpu.HasADX {
fp503Mul = mulWithMULXADX
fp503MontgomeryReduce = redcWithMULXADX
} else {
fp503Mul = mulWithMULX
fp503MontgomeryReduce = redcWithMULX
}
} else {
fp503Mul = mul
fp503MontgomeryReduce = redc
}
}
func fp503MontgomeryReduce(z *FpElement, x *FpElementX2)

Caricamento…
Annulla
Salva