Sfoglia il codice sorgente

p751: speed up montgomery reduction with mulx/adox

In https://eprint.iacr.org/2017/1015.pdf a technique was described to
improve the performance of Montgomery reduction for Montgomery-friendly
moduli. This adds an implementation using the MULX, ADOX and ADCX
instructions, available in the BMI2 (since Haswell) and ADX (since
Broadwell) instruction set extensions.

Implementation uses golang.org/x/sys/cpu for CPU capabilities checking
at runtime.

Development done by: Ko- <ko@cloudflare.com>
trials/PERF
Henry Case 6 anni fa
parent
commit
30563cd6ac
4 ha cambiato i file con 489 aggiunte e 5 eliminazioni
  1. +3
    -1
      Makefile
  2. +382
    -1
      p751toolbox/field_amd64.s
  3. +73
    -0
      p751toolbox/field_amd64_test.go
  4. +31
    -3
      p751toolbox/field_decl.go

+ 3
- 1
Makefile Vedi File

@@ -4,6 +4,7 @@ PRJ_DIR = $(abspath $(dir $(MK_FILE_PATH)))
GOPATH_LOCAL = $(PRJ_DIR)/build GOPATH_LOCAL = $(PRJ_DIR)/build
GOPATH_DIR = github.com/cloudflare/p751sidh GOPATH_DIR = github.com/cloudflare/p751sidh
CSHAKE_PKG ?= github.com/henrydcase/nobs/hash/sha3 CSHAKE_PKG ?= github.com/henrydcase/nobs/hash/sha3
CPU_PKG = golang.org/x/sys/cpu
TARGETS = p751toolbox sidh sike TARGETS = p751toolbox sidh sike
GO ?= go GO ?= go
GOARCH ?= GOARCH ?=
@@ -30,7 +31,7 @@ clean:
rm -rf coverage*.txt rm -rf coverage*.txt


build_env: build_env:
GOPATH=$(GOPATH_LOCAL) $(GO) get $(CSHAKE_PKG)
GOPATH=$(GOPATH_LOCAL) $(GO) get $(CSHAKE_PKG) $(CPU_PKG)
mkdir -p $(GOPATH_LOCAL)/src/$(GOPATH_DIR) mkdir -p $(GOPATH_LOCAL)/src/$(GOPATH_DIR)
cp -rf etc $(GOPATH_LOCAL)/src/$(GOPATH_DIR) cp -rf etc $(GOPATH_LOCAL)/src/$(GOPATH_DIR)


@@ -59,3 +60,4 @@ bench: $(addprefix bench-, $(TARGETS))
cover: $(addprefix cover-, $(TARGETS)) cover: $(addprefix cover-, $(TARGETS))
install: $(addprefix install-, $(TARGETS)) install: $(addprefix install-, $(TARGETS))
test: $(addprefix test-, $(TARGETS)) test: $(addprefix test-, $(TARGETS))


+ 382
- 1
p751toolbox/field_amd64.s Vedi File

@@ -1393,7 +1393,388 @@ TEXT ·fp751Mul(SB), $96-24


RET RET


TEXT ·fp751MontgomeryReduce(SB), $0-16
// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 and ADX instruction set extensions
// are available.
#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
MULXQ M1+48(SB), T3, T2 \
MOVQ T1, 0+C \ // C0_final
XORQ AX, AX \
MULXQ M1+56(SB), T5, T4 \
ADOXQ T3, T0 \
ADOXQ T5, T2 \
MULXQ M1+64(SB), T3, T1 \
ADOXQ T3, T4 \
MULXQ M1+72(SB), T6, T5 \
ADOXQ T6, T1 \
MULXQ M1+80(SB), T7, T3 \
ADOXQ T7, T5 \
MULXQ M1+88(SB), T8, T6 \
ADOXQ T8, T3 \
ADOXQ AX, T6 \
\
MOVQ 8+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
ADCXQ T7, T0 \
MOVQ T0, 8+C \ // C1_final
ADCXQ T8, T2 \
MULXQ M1+48(SB), T8, T7 \
ADOXQ T8, T2 \
ADCXQ T7, T4 \
MULXQ M1+56(SB), T8, T0 \
ADOXQ T8, T4 \
ADCXQ T1, T0 \
MULXQ M1+64(SB), T7, T1 \
ADCXQ T5, T1 \
MULXQ M1+72(SB), T8, T5 \
ADCXQ T5, T3 \
MULXQ M1+80(SB), T9, T5 \
ADCXQ T5, T6 \
MULXQ M1+88(SB), DX, T5 \
ADCXQ AX, T5 \
\
ADOXQ T7, T0 \
ADOXQ T8, T1 \
ADOXQ T9, T3 \
ADOXQ DX, T6 \
ADOXQ AX, T5 \
\
MOVQ 16+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
ADCXQ T7, T2 \
MOVQ T2, 16+C \ // C2_final
ADCXQ T8, T4 \
MULXQ M1+48(SB), T7, T8 \
ADOXQ T7, T4 \
ADCXQ T8, T0 \
MULXQ M1+56(SB), T8, T2 \
ADOXQ T8, T0 \
ADCXQ T2, T1 \
MULXQ M1+64(SB), T7, T2 \
ADCXQ T2, T3 \
MULXQ M1+72(SB), T8, T2 \
ADCXQ T2, T6 \
MULXQ M1+80(SB), T9, T2 \
ADCXQ T2, T5 \
MULXQ M1+88(SB), DX, T2 \
ADCXQ AX, T2 \
\
ADOXQ T7, T1 \
ADOXQ T8, T3 \
ADOXQ T9, T6 \
ADOXQ DX, T5 \
ADOXQ AX, T2 \
\
MOVQ 24+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
ADCXQ T4, T7 \
ADCXQ T8, T0 \
MULXQ M1+48(SB), T10, T8 \
ADOXQ T10, T0 \
ADCXQ T8, T1 \
MULXQ M1+56(SB), T8, T4 \
ADOXQ T8, T1 \
ADCXQ T4, T3 \
MULXQ M1+64(SB), T10, T4 \
ADCXQ T4, T6 \
MULXQ M1+72(SB), T8, T4 \
ADCXQ T4, T5 \
MULXQ M1+80(SB), T9, T4 \
ADCXQ T4, T2 \
MULXQ M1+88(SB), DX, T4 \
ADCXQ AX, T4 \
\
ADOXQ T10, T3 \
ADOXQ T8, T6 \
ADOXQ T9, T5 \
ADOXQ DX, T2 \
ADOXQ AX, T4

// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 instruction set extension is
// available.
#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
MULXQ M1+48(SB), T3, T2 \
MOVQ T1, 0+C \ // C0_final
XORQ AX, AX \
MULXQ M1+56(SB), T5, T4 \
ADDQ T3, T0 \
ADCQ T5, T2 \
MULXQ M1+64(SB), T3, T1 \
ADCQ T3, T4 \
MULXQ M1+72(SB), T6, T5 \
ADCQ T6, T1 \
MULXQ M1+80(SB), T7, T3 \
ADCQ T7, T5 \
MULXQ M1+88(SB), T8, T6 \
ADCQ T8, T3 \
ADCQ AX, T6 \
\
MOVQ 8+M0, DX \
MULXQ M1+40(SB), T7, T8 \
ADDQ T7, T0 \
MOVQ T0, 8+C \ // C1_final
ADCQ T8, T2 \
MULXQ M1+48(SB), T8, T7 \
MOVQ T8, 32+C \
ADCQ T7, T4 \
MULXQ M1+56(SB), T8, T0 \
MOVQ T8, 40+C \
ADCQ T1, T0 \
MULXQ M1+64(SB), T7, T1 \
ADCQ T5, T1 \
MULXQ M1+72(SB), T8, T5 \
ADCQ T5, T3 \
MULXQ M1+80(SB), T9, T5 \
ADCQ T5, T6 \
MULXQ M1+88(SB), DX, T5 \
ADCQ AX, T5 \
\
XORQ AX, AX \
ADDQ 32+C, T2 \
ADCQ 40+C, T4 \
ADCQ T7, T0 \
ADCQ T8, T1 \
ADCQ T9, T3 \
ADCQ DX, T6 \
ADCQ AX, T5 \
\
MOVQ 16+M0, DX \
MULXQ M1+40(SB), T7, T8 \
ADDQ T7, T2 \
MOVQ T2, 16+C \ // C2_final
ADCQ T8, T4 \
MULXQ M1+48(SB), T7, T8 \
MOVQ T7, 32+C \
ADCQ T8, T0 \
MULXQ M1+56(SB), T8, T2 \
MOVQ T8, 40+C \
ADCQ T2, T1 \
MULXQ M1+64(SB), T7, T2 \
ADCQ T2, T3 \
MULXQ M1+72(SB), T8, T2 \
ADCQ T2, T6 \
MULXQ M1+80(SB), T9, T2 \
ADCQ T2, T5 \
MULXQ M1+88(SB), DX, T2 \
ADCQ AX, T2 \
\
XORQ AX, AX \
ADDQ 32+C, T4 \
ADCQ 40+C, T0 \
ADCQ T7, T1 \
ADCQ T8, T3 \
ADCQ T9, T6 \
ADCQ DX, T5 \
ADCQ AX, T2 \
\
MOVQ 24+M0, DX \
MULXQ M1+40(SB), T7, T8 \
ADDQ T4, T7 \
ADCQ T8, T0 \
MULXQ M1+48(SB), T10, T8 \
MOVQ T10, 32+C \
ADCQ T8, T1 \
MULXQ M1+56(SB), T8, T4 \
MOVQ T8, 40+C \
ADCQ T4, T3 \
MULXQ M1+64(SB), T10, T4 \
ADCQ T4, T6 \
MULXQ M1+72(SB), T8, T4 \
ADCQ T4, T5 \
MULXQ M1+80(SB), T9, T4 \
ADCQ T4, T2 \
MULXQ M1+88(SB), DX, T4 \
ADCQ AX, T4 \
\
XORQ AX, AX \
ADDQ 32+C, T0 \
ADCQ 40+C, T1 \
ADCQ T10, T3 \
ADCQ T8, T6 \
ADCQ T9, T5 \
ADCQ DX, T2 \
ADCQ AX, T4

#define fp751MontgomeryReduceCommonPart1(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
XORQ T7, T7 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, T9 \
ADDQ 40+M0, AX \
ADCQ 48+M0, DX \
ADCQ 56+M0, T9 \
MOVQ AX, 40+M0 \
MOVQ DX, 48+M0 \
MOVQ T9, 56+M0 \
ADCQ 64+M0, T8 \
ADCQ 72+M0, T0 \
ADCQ 80+M0, T1 \
ADCQ 88+M0, T2 \
ADCQ 96+M0, T3 \
ADCQ 104+M0, T4 \
ADCQ 112+M0, T5 \
ADCQ 120+M0, T6 \
ADCQ 128+M0, T7 \
MOVQ T8, 64+M0 \
MOVQ T0, 72+M0 \
MOVQ T1, 80+M0 \
MOVQ T2, 88+M0 \
MOVQ T3, 96+M0 \
MOVQ T4, 104+M0 \
MOVQ T5, 112+M0 \
MOVQ T6, 120+M0 \
MOVQ T7, 128+M0 \
MOVQ 136+M0, T0 \
MOVQ 144+M0, T1 \
MOVQ 152+M0, T2 \
MOVQ 160+M0, T3 \
MOVQ 168+M0, T4 \
MOVQ 176+M0, T5 \
MOVQ 184+M0, T6 \
ADCQ $0, T0 \
ADCQ $0, T1 \
ADCQ $0, T2 \
ADCQ $0, T3 \
ADCQ $0, T4 \
ADCQ $0, T5 \
ADCQ $0, T6 \
MOVQ T0, 136+M0 \
MOVQ T1, 144+M0 \
MOVQ T2, 152+M0 \
MOVQ T3, 160+M0 \
MOVQ T4, 168+M0 \
MOVQ T5, 176+M0 \
MOVQ T6, 184+M0

#define fp751MontgomeryReduceCommonPart2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
XORQ T7, T7 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, T9 \
ADDQ 72+M0, AX \
ADCQ 80+M0, DX \
ADCQ 88+M0, T9 \
MOVQ AX, 72+M0 \
MOVQ DX, 80+M0 \
MOVQ T9, 88+M0 \
ADCQ 96+M0, T8 \
ADCQ 104+M0, T0 \
ADCQ 112+M0, T1 \
ADCQ 120+M0, T2 \
ADCQ 128+M0, T3 \
ADCQ 136+M0, T4 \
ADCQ 144+M0, T5 \
ADCQ 152+M0, T6 \
ADCQ 160+M0, T7 \
MOVQ T8, 0+C \ // Final result c0
MOVQ T0, 104+M0 \
MOVQ T1, 112+M0 \
MOVQ T2, 120+M0 \
MOVQ T3, 128+M0 \
MOVQ T4, 136+M0 \
MOVQ T5, 144+M0 \
MOVQ T6, 152+M0 \
MOVQ T7, 160+M0 \
MOVQ 168+M0, T4 \
MOVQ 176+M0, T5 \
MOVQ 184+M0, T6 \
ADCQ $0, T4 \
ADCQ $0, T5 \
ADCQ $0, T6 \
MOVQ T4, 168+M0 \
MOVQ T5, 176+M0 \
MOVQ T6, 184+M0

#define fp751MontgomeryReduceCommonPart3(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
MOVQ 48+C, AX \ // Final result c1:c11
MOVQ 56+C, DX \
MOVQ 64+C, T9 \
ADDQ 104+M0, AX \
ADCQ 112+M0, DX \
ADCQ 120+M0, T9 \
MOVQ AX, 8+C \
MOVQ DX, 16+C \
MOVQ T9, 24+C \
ADCQ 128+M0, T8 \
ADCQ 136+M0, T0 \
ADCQ 144+M0, T1 \
ADCQ 152+M0, T2 \
ADCQ 160+M0, T3 \
ADCQ 168+M0, T4 \
ADCQ 176+M0, T5 \
ADCQ 184+M0, T6 \
MOVQ T8, 32+C \
MOVQ T0, 40+C \
MOVQ T1, 48+C \
MOVQ T2, 56+C \
MOVQ T3, 64+C \
MOVQ T4, 72+C \
MOVQ T5, 80+C \
MOVQ T6, 88+C

// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 and ADX instruction set extensions are available.
TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1

// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2adx(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2adx(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2adx(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

RET

// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 instruction set extension is available.
TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1

// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
mul256x448bmi2(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)

fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

RET

// This implements the straightforward Montgomery reduction algorithm without
// using specific instruction set extensions.
TEXT ·fp751MontgomeryReduceFallback(SB), $0-16


MOVQ z+0(FP), REG_P2 MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1 MOVQ x+8(FP), REG_P1


+ 73
- 0
p751toolbox/field_amd64_test.go Vedi File

@@ -0,0 +1,73 @@
// +build amd64,!noasm

package p751toolbox

import (
"golang.org/x/sys/cpu"
"testing"
"testing/quick"
)

func TestFp751MontgomeryReduce(t *testing.T) {
// First make sure that at least one value with a known result reduces
// correctly as defined in TestPrimeFieldElementToBigInt.
fp751MontgomeryReduce = fp751MontgomeryReduceFallback
t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)

if !cpu.X86.HasBMI2 {
return
}

fp751MontgomeryReduce = fp751MontgomeryReduceBMI2
t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)

// Also check that the BMI2 implementation produces the same results
// as the fallback implementation.
compareMontgomeryReduce := func(x, y PrimeFieldElement) bool {
var z, zbackup fp751X2
var zred1, zred2 Fp751Element

fp751Mul(&z, &x.A, &y.A)
zbackup = z

fp751MontgomeryReduceFallback(&zred1, &z)
// z may be destroyed.
z = zbackup
fp751MontgomeryReduceBMI2(&zred2, &z)

return zred1 == zred2
}

if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil {
t.Error(err)
}

if !cpu.X86.HasADX {
return
}

fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX
t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)

// Check that the BMI2ADX implementation produces the same results as
// the BMI2 implementation. By transitivity, it should also produce the
// same results as the fallback implementation.
compareMontgomeryReduce = func(x, y PrimeFieldElement) bool {
var z, zbackup fp751X2
var zred1, zred2 Fp751Element

fp751Mul(&z, &x.A, &y.A)
zbackup = z

fp751MontgomeryReduceBMI2(&zred1, &z)
// z may be destroyed.
z = zbackup
fp751MontgomeryReduceBMI2ADX(&zred2, &z)

return zred1 == zred2
}

if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil {
t.Error(err)
}
}

+ 31
- 3
p751toolbox/field_decl.go Vedi File

@@ -2,6 +2,10 @@


package p751toolbox package p751toolbox


import (
"golang.org/x/sys/cpu"
)

// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
// If choice is neither 0 nor 1 then behaviour is undefined. // If choice is neither 0 nor 1 then behaviour is undefined.
// This function executes in constant time. // This function executes in constant time.
@@ -32,11 +36,35 @@ func fp751X2SubLazy(z, x, y *fp751X2)
//go:noescape //go:noescape
func fp751Mul(z *fp751X2, x, y *Fp751Element) func fp751Mul(z *fp751X2, x, y *Fp751Element)


// Perform Montgomery reduction: set z = x R^{-1} (mod 2*p).
// Destroys the input value.
// Function pointer that should point to one of the
// fp751MontgomeryReduce implementations below.
// When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p).
// It may destroy the input value.
var fp751MontgomeryReduce func(z *Fp751Element, x *fp751X2)

//go:noescape
func fp751MontgomeryReduceBMI2ADX(z *Fp751Element, x *fp751X2)

//go:noescape //go:noescape
func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2)
func fp751MontgomeryReduceBMI2(z *Fp751Element, x *fp751X2)

//go:noescape
func fp751MontgomeryReduceFallback(z *Fp751Element, x *fp751X2)


// Reduce a field element in [0, 2*p) to one in [0,p). // Reduce a field element in [0, 2*p) to one in [0,p).
//go:noescape //go:noescape
func fp751StrongReduce(x *Fp751Element) func fp751StrongReduce(x *Fp751Element)

// On initialization, set the fp751MontgomeryReduce function pointer to the
// fastest implementation depending on CPU capabilities.
func init() {
if cpu.X86.HasBMI2 {
if cpu.X86.HasADX {
fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX
} else {
fp751MontgomeryReduce = fp751MontgomeryReduceBMI2
}
} else {
fp751MontgomeryReduce = fp751MontgomeryReduceFallback
}
}

Caricamento…
Annulla
Salva