소스 검색

WIP

WIP2

WIP
trials/PERF_vars
Henry Case 6 년 전
committed by Kris Kwiatkowski
부모
커밋
a3ccc0e275
4개의 변경된 파일81개의 추가작업 그리고 44개의 파일을 삭제
  1. +17
    -0
      p503/arith_amd64.go
  2. +37
    -22
      p503/arith_amd64.s
  3. +7
    -3
      p503/arith_amd64_test.go
  4. +20
    -19
      p503/arith_decl.go

+ 17
- 0
p503/arith_amd64.go 파일 보기

@@ -0,0 +1,17 @@
// +build amd64,!noasm

package p503

import cpu "github.com/cloudflare/p751sidh/internal/utils"

// CPU capability flags for this package. There are a couple of reasons for
// keeping these variables here:
// 1) to have access to them from assembly
// 2) to make it easy to vendor the library
// 3) to make it possible to test all functionalities

// useMULX is set in init from cpu.HasBMI2.
var useMULX bool
// useADXMULX is set in init and is true only when both cpu.HasADX and
// cpu.HasBMI2 are true.
var useADXMULX bool

func init() {
useMULX = cpu.HasBMI2
useADXMULX = cpu.HasADX && cpu.HasBMI2
}

+ 37
- 22
p503/arith_amd64.s 파일 보기

@@ -690,23 +690,26 @@ TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24

RET

TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
TEXT ·fp503Mul(SB), NOSPLIT, $104-24
// Actual implementation
MOVQ z+ 0(FP), CX
MOVQ x+ 8(FP), REG_P2
MOVQ y+16(FP), REG_P1
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
RET
MOVQ x+ 8(FP), REG_P1
MOVQ y+16(FP), REG_P2

TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
// Actual implementation
MOVQ z+ 0(FP), CX
MOVQ x+ 8(FP), REG_P2
MOVQ y+16(FP), REG_P1
MUL(CX, REG_P2, REG_P1, MULS256_MULX)
// CMPB ·useMULX(SB), $0
// JE mul
// CMPB ·useMULX(SB), $1
// JE mulx

// MOVB ·useADXMULX(SB), AX
// TESTB AX, AX
// JZ mulx_and_adx

MUL(CX, REG_P1, REG_P2, MULS256_MULXADX)
RET

TEXT ·mul(SB), $96-24
mul:
RET
// Uses variant of Karatsuba method.
//
// Here we store the destination in CX instead of in REG_P3 because the
@@ -1188,11 +1191,28 @@ TEXT ·mul(SB), $96-24
ADCQ $0, SI; MOVQ SI, (120)(CX)

RET
// Uses implementation optimized for CPUs supporting the carry-less
// multiplier (MULX) and two independent carry chains (ADOX/ADCX)
mulx_and_adx:
MUL(CX, REG_P1, REG_P2, MULS256_MULXADX)
RET
// Uses implementation optimized for CPUs supporting the carry-less multiplier (MULX)
mulx:
MUL(CX, REG_P1, REG_P2, MULS256_MULX)
RET

TEXT ·redc(SB), $0-16
TEXT ·fp503MontgomeryReduce(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1

REDC(REG_P2, REG_P1, MULS_128x320_MULX)
RET

CMPB ·useADXMULX(SB), $1
JE redc_with_mulx_adx
CMPB ·useMULX(SB), $1
JE redc_with_mulx

MOVQ (REG_P1), R11
MOVQ P503P1_3, AX
MULQ R11
@@ -1497,17 +1517,12 @@ TEXT ·redc(SB), $0-16
MOVQ R10, (56)(REG_P2) // Z7

RET

TEXT ·redcWithMULX(SB), $0-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
REDC(DI, SI, MULS_128x320_MULX)
redc_with_mulx:
REDC(REG_P2, REG_P1, MULS_128x320_MULX)
RET

TEXT ·redcWithMULXADX(SB), $0-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
REDC(DI, SI, MULS_128x320_MULXADX)
redc_with_mulx_adx:
REDC(REG_P2, REG_P1, MULS_128x320_MULXADX)
RET

TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24


+ 7
- 3
p503/arith_amd64_test.go 파일 보기

@@ -2,14 +2,16 @@

package p503

/*
import (
. "github.com/cloudflare/p751sidh/internal/isogeny"
cpu "github.com/cloudflare/p751sidh/internal/utils"
// cpu "github.com/cloudflare/p751sidh/internal/utils"
"reflect"
"testing"
"testing/quick"
)

*/
/*
// Utility function used for testing Mul implementations. Tests caller provided
// mulFunc against mul()
func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) {
@@ -40,7 +42,8 @@ func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) {
t.Error(err)
}
}

*/
/*
// Ensures correctness of implementation of mul operation which uses MULX
func TestMulWithMULX(t *testing.T) {
if !cpu.HasBMI2 {
@@ -72,3 +75,4 @@ func TestRedcWithMULXADX(t *testing.T) {
}
testRedc(t, redcWithMULXADX)
}
*/

+ 20
- 19
p503/arith_decl.go 파일 보기

@@ -4,7 +4,6 @@ package p503

import (
. "github.com/cloudflare/p751sidh/internal/isogeny"
cpu "github.com/cloudflare/p751sidh/internal/utils"
)

// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
@@ -41,24 +40,24 @@ func fp503StrongReduce(x *FpElement)
// Concrete implementation depends on capabilities of the CPU which
// are resolved at runtime. CPUs with ADCX, ADOX and MULX support
// run most optimized implementation
var fp503Mul func(z *FpElementX2, x, y *FpElement)
// Mul implementation for legacy CPUs
//go:noescape
func mul(z *FpElementX2, x, y *FpElement)
// Mul implementation for CPUs supporting carry-less MULX multiplier.
//go:noescape
func mulWithMULX(z *FpElementX2, x, y *FpElement)
// Mul implementation for CPUs supporting two independent carry chain
// (ADOX/ADCX) instructions and carry-less MULX multiplier
//go:noescape
func mulWithMULXADX(z *FpElementX2, x, y *FpElement)
func fp503Mul(z *FpElementX2, x, y *FpElement)
// // Mul implementation for legacy CPUs
// //go:noescape
// func mul(z *FpElementX2, x, y *FpElement)
//
// // Mul implementation for CPUs supporting carry-less MULX multiplier.
// //go:noescape
// func mulWithMULX(z *FpElementX2, x, y *FpElement)
//
// // Mul implementation for CPUs supporting two independent carry chain
// // (ADOX/ADCX) instructions and carry-less MULX multiplier
// //go:noescape
// func mulWithMULXADX(z *FpElementX2, x, y *FpElement)

// Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value
// of x may be changed. z=x not allowed.
var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2)
func fp503MontgomeryReduce(z *FpElement, x *FpElementX2)

func redc(z *FpElement, x *FpElementX2)

@@ -71,19 +70,21 @@ func redcWithMULX(z *FpElement, x *FpElementX2)
//go:noescape
func redcWithMULXADX(z *FpElement, x *FpElementX2)

/*
// On initialization, set the fp503Mul function pointer to the
// fastest implementation depending on CPU capabilities.
func init() {
if cpu.HasBMI2 {
if cpu.HasADX {
fp503Mul = mulWithMULXADX
// fp503Mul = mulWithMULXADX
fp503MontgomeryReduce = redcWithMULXADX
} else {
fp503Mul = mulWithMULX
// fp503Mul = mulWithMULX
fp503MontgomeryReduce = redcWithMULX
}
} else {
fp503Mul = mul
// fp503Mul = mul
fp503MontgomeryReduce = redc
}
}
*/

불러오는 중...
취소
저장