Explorar el Código

perf p751: improve performance by using branch predictor

Similar change as below for P751

BenchmarkSidhKeyAgreementP751      23839233      20241485      -15.09%
BenchmarkAliceKeyGenPubP751        13733691      11180516      -18.59%
BenchmarkBobKeyGenPubP751          14804459      12889797      -12.93%
BenchmarkSharedSecretAliceP751     10649209      9155030       -14.03%
BenchmarkSharedSecretBobP751       12739399      10751825      -15.60%
master
Henry Case hace 6 años
committed by Kris Kwiatkowski
padre
commit
cf9e40db2b
Se han modificado 4 ficheros con 222 adiciones y 242 borrados
  1. +24
    -0
      p751/arith_amd64.go
  2. +147
    -168
      p751/arith_amd64.s
  3. +50
    -50
      p751/arith_amd64_test.go
  4. +1
    -24
      p751/arith_decl.go

+ 24
- 0
p751/arith_amd64.go Ver fichero

@@ -0,0 +1,24 @@
// +build amd64,!noasm

package p751

import cpu "github.com/cloudflare/p751sidh/internal/utils"

// The CPU-feature flags are kept in this package, rather than in a shared
// one, for a couple of reasons:
//  * the assembly routines need to be able to read them
//    TODO(kk): Is there a way to access a variable from a different package?
//    If there is, this file could probably be moved to internal and we
//    would not need to keep many copies of it.
//  * it keeps the library easy to vendor
//  * it lets the tests exercise every implementation
var (
	useMULX    bool
	useADXMULX bool
)

func recognizecpu() {
useMULX = cpu.HasBMI2
useADXMULX = cpu.HasADX && cpu.HasBMI2
}

// init sets the CPU-feature flags (useMULX, useADXMULX) when the package is
// initialized, by delegating to recognizecpu.
func init() {
recognizecpu()
}

+ 147
- 168
p751/arith_amd64.s Ver fichero

@@ -1607,177 +1607,143 @@ TEXT ·fp751Mul(SB), $96-24
ADCQ DX, T2 \
ADCQ AX, T4

// fp751MontgomeryReduceCommonPart1 accumulates a partial product into the
// value stored at M0. The three words at 48+C, 56+C and 64+C are added to the
// words at 40+M0..56+M0; the same carry chain then adds the words at
// 64+M0..120+M0 into T8, T0..T6. T7 is cleared up front so that it ends up
// holding the word at 128+M0 plus the incoming carry. Every sum is written
// back to M0 (MOVQ leaves the carry flag untouched, so stores may be
// interleaved with the chain), and the remaining carry is finally propagated
// through the words at 136+M0..184+M0.
// Clobbers AX, DX, T0-T9 and the flags; T9 is used only as a scratch register.
// NOTE(review): the partial product is presumably left in 48+C..64+C, T8 and
// T0..T6 by the multiplication macro invoked just before - confirm there.
#define fp751MontgomeryReduceCommonPart1(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
XORQ T7, T7 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, T9 \
ADDQ 40+M0, AX \
ADCQ 48+M0, DX \
ADCQ 56+M0, T9 \
MOVQ AX, 40+M0 \
MOVQ DX, 48+M0 \
MOVQ T9, 56+M0 \
ADCQ 64+M0, T8 \
ADCQ 72+M0, T0 \
ADCQ 80+M0, T1 \
ADCQ 88+M0, T2 \
ADCQ 96+M0, T3 \
ADCQ 104+M0, T4 \
ADCQ 112+M0, T5 \
ADCQ 120+M0, T6 \
ADCQ 128+M0, T7 \
MOVQ T8, 64+M0 \
MOVQ T0, 72+M0 \
MOVQ T1, 80+M0 \
MOVQ T2, 88+M0 \
MOVQ T3, 96+M0 \
MOVQ T4, 104+M0 \
MOVQ T5, 112+M0 \
MOVQ T6, 120+M0 \
MOVQ T7, 128+M0 \
MOVQ 136+M0, T0 \
MOVQ 144+M0, T1 \
MOVQ 152+M0, T2 \
MOVQ 160+M0, T3 \
MOVQ 168+M0, T4 \
MOVQ 176+M0, T5 \
MOVQ 184+M0, T6 \
ADCQ $0, T0 \
ADCQ $0, T1 \
ADCQ $0, T2 \
ADCQ $0, T3 \
ADCQ $0, T4 \
ADCQ $0, T5 \
ADCQ $0, T6 \
MOVQ T0, 136+M0 \
MOVQ T1, 144+M0 \
MOVQ T2, 152+M0 \
MOVQ T3, 160+M0 \
MOVQ T4, 168+M0 \
MOVQ T5, 176+M0 \
MOVQ T6, 184+M0

// fp751MontgomeryReduceCommonPart2 accumulates a second partial product into
// the value stored at M0, four words higher than Part1. The three words at
// 48+C, 56+C and 64+C are added to the words at 72+M0..88+M0 and written back;
// the same carry chain then adds the words at 96+M0..152+M0 into T8, T0..T6,
// with T7 (cleared up front) collecting the word at 160+M0 plus the carry.
// T8 is stored to 0+C as the first word of the final result, T0..T7 are
// written back to 104+M0..160+M0, and the remaining carry is propagated
// through the words at 168+M0..184+M0.
// Clobbers AX, DX, T0-T9 and the flags; T9 is used only as a scratch register.
// NOTE(review): the partial product is presumably left in 48+C..64+C, T8 and
// T0..T6 by the multiplication macro invoked just before - confirm there.
#define fp751MontgomeryReduceCommonPart2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
XORQ T7, T7 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, T9 \
ADDQ 72+M0, AX \
ADCQ 80+M0, DX \
ADCQ 88+M0, T9 \
MOVQ AX, 72+M0 \
MOVQ DX, 80+M0 \
MOVQ T9, 88+M0 \
ADCQ 96+M0, T8 \
ADCQ 104+M0, T0 \
ADCQ 112+M0, T1 \
ADCQ 120+M0, T2 \
ADCQ 128+M0, T3 \
ADCQ 136+M0, T4 \
ADCQ 144+M0, T5 \
ADCQ 152+M0, T6 \
ADCQ 160+M0, T7 \
MOVQ T8, 0+C \ // Final result c0
MOVQ T0, 104+M0 \
MOVQ T1, 112+M0 \
MOVQ T2, 120+M0 \
MOVQ T3, 128+M0 \
MOVQ T4, 136+M0 \
MOVQ T5, 144+M0 \
MOVQ T6, 152+M0 \
MOVQ T7, 160+M0 \
MOVQ 168+M0, T4 \
MOVQ 176+M0, T5 \
MOVQ 184+M0, T6 \
ADCQ $0, T4 \
ADCQ $0, T5 \
ADCQ $0, T6 \
MOVQ T4, 168+M0 \
MOVQ T5, 176+M0 \
MOVQ T6, 184+M0

// fp751MontgomeryReduceCommonPart3 adds the last partial product to the upper
// words of M0 and writes the sums to C as the remaining words of the final
// result. The three words at 48+C, 56+C and 64+C are added to the words at
// 104+M0..120+M0 and stored to 8+C..24+C; the same carry chain then adds the
// words at 128+M0..184+M0 into T8, T0..T6, which are stored to 32+C..88+C.
// Unlike Part1 and Part2, nothing is written back to M0 and T7 is not used.
// Clobbers AX, DX, T0-T6, T8, T9 and the flags.
// NOTE(review): the partial product is presumably left in 48+C..64+C, T8 and
// T0..T6 by the multiplication macro invoked just before - confirm there.
#define fp751MontgomeryReduceCommonPart3(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
MOVQ 48+C, AX \ // Final result c1:c11
MOVQ 56+C, DX \
MOVQ 64+C, T9 \
ADDQ 104+M0, AX \
ADCQ 112+M0, DX \
ADCQ 120+M0, T9 \
MOVQ AX, 8+C \
MOVQ DX, 16+C \
MOVQ T9, 24+C \
ADCQ 128+M0, T8 \
ADCQ 136+M0, T0 \
ADCQ 144+M0, T1 \
ADCQ 152+M0, T2 \
ADCQ 160+M0, T3 \
ADCQ 168+M0, T4 \
ADCQ 176+M0, T5 \
ADCQ 184+M0, T6 \
MOVQ T8, 32+C \
MOVQ T0, 40+C \
MOVQ T1, 48+C \
MOVQ T2, 56+C \
MOVQ T3, 64+C \
MOVQ T4, 72+C \
MOVQ T5, 80+C \
MOVQ T6, 88+C

// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 and ADX instruction set extensions are available.
TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16
// REDC is a template for the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. The template must be
// customized with a schoolbook multiplication of a 256-bit by a 448-bit number.
// The macro uses the memory of the input M0 as scratch space and *changes* it.
// The words at 48+C..64+C are additionally used as temporary storage between
// each multiplication and the accumulation that follows it.
// Smashes registers R[8-15], AX, BX, CX, DX, BP.
// Parameters:
// * C   : memory operand receiving the 768-bit result (offsets 0..88)
// * M0  : memory operand holding the 1536-bit number to be reduced
// * MULS: multiplication macro, either mul256x448bmi2 or mul256x448bmi2adx
#define REDC(C, M0, MULS) \
\ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
XORQ R15, R15 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, BX \
ADDQ 40+M0, AX \
ADCQ 48+M0, DX \
ADCQ 56+M0, BX \
MOVQ AX, 40+M0 \
MOVQ DX, 48+M0 \
MOVQ BX, 56+M0 \
ADCQ 64+M0, BP \
ADCQ 72+M0, R8 \
ADCQ 80+M0, R9 \
ADCQ 88+M0, R10 \
ADCQ 96+M0, R11 \
ADCQ 104+M0, R12 \
ADCQ 112+M0, R13 \
ADCQ 120+M0, R14 \
ADCQ 128+M0, R15 \
MOVQ BP, 64+M0 \
MOVQ R8, 72+M0 \
MOVQ R9, 80+M0 \
MOVQ R10, 88+M0 \
MOVQ R11, 96+M0 \
MOVQ R12, 104+M0 \
MOVQ R13, 112+M0 \
MOVQ R14, 120+M0 \
MOVQ R15, 128+M0 \
MOVQ 136+M0, R8 \
MOVQ 144+M0, R9 \
MOVQ 152+M0, R10 \
MOVQ 160+M0, R11 \
MOVQ 168+M0, R12 \
MOVQ 176+M0, R13 \
MOVQ 184+M0, R14 \
ADCQ $0, R8 \
ADCQ $0, R9 \
ADCQ $0, R10 \
ADCQ $0, R11 \
ADCQ $0, R12 \
ADCQ $0, R13 \
ADCQ $0, R14 \
MOVQ R8, 136+M0 \
MOVQ R9, 144+M0 \
MOVQ R10, 152+M0 \
MOVQ R11, 160+M0 \
MOVQ R12, 168+M0 \
MOVQ R13, 176+M0 \
MOVQ R14, 184+M0 \
\ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(32+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
XORQ R15, R15 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, BX \
ADDQ 72+M0, AX \
ADCQ 80+M0, DX \
ADCQ 88+M0, BX \
MOVQ AX, 72+M0 \
MOVQ DX, 80+M0 \
MOVQ BX, 88+M0 \
ADCQ 96+M0, BP \
ADCQ 104+M0, R8 \
ADCQ 112+M0, R9 \
ADCQ 120+M0, R10 \
ADCQ 128+M0, R11 \
ADCQ 136+M0, R12 \
ADCQ 144+M0, R13 \
ADCQ 152+M0, R14 \
ADCQ 160+M0, R15 \
MOVQ BP, 0+C \ // Final result c0
MOVQ R8, 104+M0 \
MOVQ R9, 112+M0 \
MOVQ R10, 120+M0 \
MOVQ R11, 128+M0 \
MOVQ R12, 136+M0 \
MOVQ R13, 144+M0 \
MOVQ R14, 152+M0 \
MOVQ R15, 160+M0 \
MOVQ 168+M0, R12 \
MOVQ 176+M0, R13 \
MOVQ 184+M0, R14 \
ADCQ $0, R12 \
ADCQ $0, R13 \
ADCQ $0, R14 \
MOVQ R12, 168+M0 \
MOVQ R13, 176+M0 \
MOVQ R14, 184+M0 \
\ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(64+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
MOVQ 48+C, AX \ // Final result c1:c11
MOVQ 56+C, DX \
MOVQ 64+C, BX \
ADDQ 104+M0, AX \
ADCQ 112+M0, DX \
ADCQ 120+M0, BX \
MOVQ AX, 8+C \
MOVQ DX, 16+C \
MOVQ BX, 24+C \
ADCQ 128+M0, BP \
ADCQ 136+M0, R8 \
ADCQ 144+M0, R9 \
ADCQ 152+M0, R10 \
ADCQ 160+M0, R11 \
ADCQ 168+M0, R12 \
ADCQ 176+M0, R13 \
ADCQ 184+M0, R14 \
MOVQ BP, 32+C \
MOVQ R8, 40+C \
MOVQ R9, 48+C \
MOVQ R10, 56+C \
MOVQ R11, 64+C \
MOVQ R12, 72+C \
MOVQ R13, 80+C \
MOVQ R14, 88+C

TEXT ·fp751MontgomeryReduce(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1

// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2adx(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2adx(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2adx(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

RET

// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 instruction set extension is available.
TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1

// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

RET

// This implements the straightforward Montgomery reduction algorithm without
// using specific instruction set extensions.
TEXT ·fp751MontgomeryReduceFallback(SB), $0-16

MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1
// Check whether to use an optimized implementation
CMPB ·useADXMULX(SB), $1
JE redc_with_mulx_adx
CMPB ·useMULX(SB), $1
JE redc_with_mulx

MOVQ (REG_P1), R11
MOVQ P751P1_5, AX
@@ -2379,7 +2345,20 @@ TEXT ·fp751MontgomeryReduceFallback(SB), $0-16
ADCQ $0, R10
ADDQ (184)(REG_P1), R10 // Z11
MOVQ R10, (88)(REG_P2) // Z11
RET

redc_with_mulx_adx:
// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 and ADX instruction set extensions are available.
REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2adx)
RET

redc_with_mulx:
// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 instruction set extension is available.
REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2)
RET

TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24


+ 50
- 50
p751/arith_amd64_test.go Ver fichero

@@ -10,66 +10,66 @@ import (
"testing/quick"
)

func TestFp751MontgomeryReduce(t *testing.T) {
// First make sure that at least one value with a known result reduces
// correctly as defined in TestPrimeFieldElementToBigInt.
fp751MontgomeryReduce = fp751MontgomeryReduceFallback
t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)
type OptimFlag uint

if !cpu.HasBMI2 {
return
}

fp751MontgomeryReduce = fp751MontgomeryReduceBMI2
t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)

// Also check that the BMI2 implementation produces the same results
// as the fallback implementation.
compareMontgomeryReduce := func(x, y primeFieldElement) bool {
var z, zbackup FpElementX2
var zred1, zred2 FpElement

fp751Mul(&z, &x.A, &y.A)
zbackup = z

fp751MontgomeryReduceFallback(&zred1, &z)
// z may be destroyed.
z = zbackup
fp751MontgomeryReduceBMI2(&zred2, &z)
// Flags selecting which Montgomery reduction implementation a test exercises.
// Every constant carries the OptimFlag type explicitly, so the flags do not
// rely on implicit conversion of untyped constants when they are combined,
// compared or passed around.
const (
	kUse_MUL     OptimFlag = 1 << 0 // neither MULX nor ADX is requested
	kUse_MULX    OptimFlag = 1 << 1 // request the MULX implementation
	kUse_MULXADX OptimFlag = 1 << 2 // request the MULX+ADX implementation
)

return zred1 == zred2
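// Utility function used for testing REDC implementations. Runs
// fp751MontgomeryReduce with the implementation selected by f1 and then with
// the one selected by f2, and checks that both produce the same result.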
func testRedc(t *testing.T, f1, f2 OptimFlag) {
doRedcTest := func(aRR FpElementX2) bool {
defer recognizecpu()
var resRedcF1, resRedcF2 FpElement
var aRRcpy = aRR

// Compute redc with first implementation
useMULX = (kUse_MULX & f1) == kUse_MULX
useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX
fp751MontgomeryReduce(&resRedcF1, &aRR)

// Compute redc with second implementation
useMULX = (kUse_MULX & f2) == kUse_MULX
useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX
fp751MontgomeryReduce(&resRedcF2, &aRRcpy)

// Compare results
return reflect.DeepEqual(resRedcF2, resRedcF1)
}

if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil {
if err := quick.Check(doRedcTest, quickCheckConfig); err != nil {
t.Error(err)
}
}

if !cpu.HasADX {
return
// TestRedcWithMULX ensures correctness of the Montgomery reduction
// implementation which uses MULX, by comparing it against the implementation
// selected by kUse_MUL. The test is skipped when BMI2 is not available.
func TestRedcWithMULX(t *testing.T) {
	defer recognizecpu()
	if hasMULX := cpu.HasBMI2; !hasMULX {
		t.Skip("MULX not supported by the platform")
	}
	testRedc(t, kUse_MULX, kUse_MUL)
}

fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX
t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)

// Check that the BMI2ADX implementation produces the same results as
// the BMI2 implementation. By transitivity, it should also produce the
// same results as the fallback implementation.
compareMontgomeryReduce = func(x, y primeFieldElement) bool {
var z, zbackup FpElementX2
var zred1, zred2 FpElement

fp751Mul(&z, &x.A, &y.A)
zbackup = z

fp751MontgomeryReduceBMI2(&zred1, &z)
// z may be destroyed.
z = zbackup
fp751MontgomeryReduceBMI2ADX(&zred2, &z)

return zred1 == zred2
// TestRedcWithMULXADX ensures correctness of the Montgomery reduction
// implementation which uses MULX and ADX, by comparing it against the
// implementation selected by kUse_MUL. The test is skipped unless both ADX
// and BMI2 are available.
func TestRedcWithMULXADX(t *testing.T) {
	defer recognizecpu()
	if !cpu.HasADX || !cpu.HasBMI2 {
		t.Skip("MULX, ADCX and ADOX not supported by the platform")
	}
	testRedc(t, kUse_MULXADX, kUse_MUL)
}

if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil {
t.Error(err)
// TestRedcWithMULXADXAgainstMULX ensures that the Montgomery reduction
// implementation which uses MULX and ADX agrees with the one which uses MULX
// only. The test is skipped unless both ADX and BMI2 are available.
func TestRedcWithMULXADXAgainstMULX(t *testing.T) {
	defer recognizecpu()
	if !cpu.HasADX || !cpu.HasBMI2 {
		t.Skip("MULX, ADCX and ADOX not supported by the platform")
	}
	testRedc(t, kUse_MULXADX, kUse_MULX)
}

+ 1
- 24
p751/arith_decl.go Ver fichero

@@ -4,7 +4,6 @@ package p751

import (
. "github.com/cloudflare/sidh/internal/isogeny"
cpu "github.com/cloudflare/sidh/internal/utils"
)

// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
@@ -41,31 +40,9 @@ func fp751Mul(z *FpElementX2, x, y *FpElement)
// fp751MontgomeryReduce implementations below.
// When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p).
// It may destroy the input value.
var fp751MontgomeryReduce func(z *FpElement, x *FpElementX2)

//go:noescape
func fp751MontgomeryReduceBMI2ADX(z *FpElement, x *FpElementX2)

//go:noescape
func fp751MontgomeryReduceBMI2(z *FpElement, x *FpElementX2)

//go:noescape
func fp751MontgomeryReduceFallback(z *FpElement, x *FpElementX2)
func fp751MontgomeryReduce(z *FpElement, x *FpElementX2)

// fp751StrongReduce reduces a field element in [0, 2*p) to one in [0,p).
// The reduction is performed in place on x.
//go:noescape
func fp751StrongReduce(x *FpElement)

// init points fp751MontgomeryReduce at the fastest implementation the CPU
// supports: the BMI2+ADX variant when both extensions are present, the BMI2
// variant when only BMI2 is present, and the fallback otherwise.
func init() {
	switch {
	case cpu.HasBMI2 && cpu.HasADX:
		fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX
	case cpu.HasBMI2:
		fp751MontgomeryReduce = fp751MontgomeryReduceBMI2
	default:
		fp751MontgomeryReduce = fp751MontgomeryReduceFallback
	}
}

Cargando…
Cancelar
Guardar