Сравнить коммиты

...

3 коммитов

Автор SHA1 Сообщение Дата
  Kris Kwiatkowski 543bbcf84c WIP 6 лет назад
  Kris Kwiatkowski 5bce13a8cc WIP3 6 лет назад
  Kris Kwiatkowski 79fd1f4668 WIP 6 лет назад
4 измененных файлов: 66 добавлений и 21 удалений
  1. +2
    -2
      Makefile
  2. +28
    -0
      p503/arith_amd64.s
  3. +27
    -5
      p503/arith_decl.go
  4. +9
    -14
      p503/field_ops.go

+ 2
- 2
Makefile Просмотреть файл

@@ -12,7 +12,7 @@ OPTS ?=
OPTS_TAGS ?= -tags=noasm
NOASM ?=
# -run="NonExistent" is set to make sure tests are not run before benchmarking
BENCH_OPTS ?= -bench=. -run="NonExistent"
BENCH_OPTS ?= -bench=. -run="NonExistent" -benchmem
# whether to be verbose
V ?= 1

@@ -22,7 +22,7 @@ endif

ifeq ($(V),1)
OPTS += -v # Be verbose
BENCH_OPTS += -gcflags=-m # Show results from inlining
BENCH_OPTS += -gcflags="-m -m" # Show results from inlining
endif

all: test


+ 28
- 0
p503/arith_amd64.s Просмотреть файл

@@ -698,6 +698,29 @@ TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
RET

// fp503Mul is declared on the Go side as
//   func fp503Mul(z *FpElementX2, x, y *FpElement)
// It loads the three pointer arguments and expands the MUL macro with one of
// two multiplier variants, selected at run time from the package-level
// HasBMI2 byte.
//
// NOTE(review): the dispatch tests HasBMI2 only. When the flag is set the
// MULS256_MULXADX variant is used; when it is clear the MULS256_MULX variant
// is used. The earlier Go-side dispatch chose the MULXADX routine only when
// both BMI2 and ADX were reported, and fell back to the generic `mul` when
// BMI2 was absent. Confirm that neither path here can execute instructions
// the CPU does not support.
TEXT ·fp503Mul(SB), NOSPLIT, $104-24
// Load arguments: CX = z (result), REG_P2 = x, REG_P1 = y.
MOVQ z+ 0(FP), CX
MOVQ x+ 8(FP), REG_P2
MOVQ y+16(FP), REG_P1
// Branch on the Go variable HasBMI2 (compared as a single byte against 1).
CMPB ·HasBMI2(SB), $1
JE mulWithMULXADX
JMP mulWithMULX
// Path taken when HasBMI2 == 1.
mulWithMULXADX:
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
RET
// Path taken when HasBMI2 != 1.
mulWithMULX:
MUL(CX, REG_P2, REG_P1, MULS256_MULX)
RET

// fp503MulXXX is declared on the Go side as
//   func fp503MulXXX(z, x, y []uint64)
// The 72-byte argument area holds three slice headers; only the first word of
// each (offsets 0, 24 and 48, i.e. the data pointers) is read. Slice lengths
// and capacities are never inspected here.
//
// NOTE(review): always expands MUL with MULS256_MULXADX; there is no run-time
// CPU capability check in this routine. Confirm callers guarantee support.
TEXT ·fp503MulXXX(SB), NOSPLIT, $104-72
// Load the data pointer of each slice: CX = z, REG_P2 = x, REG_P1 = y.
MOVQ z+ 0(FP), CX
MOVQ x+24(FP), REG_P2
MOVQ y+48(FP), REG_P1
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
RET

TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
// Actual implementation
MOVQ z+ 0(FP), CX
@@ -1509,6 +1532,11 @@ TEXT ·redcWithMULXADX(SB), $0-16
MOVQ x+8(FP), SI
REDC(DI, SI, MULS_128x320_MULXADX)
RET
// fp503MontgomeryReduce is declared on the Go side as
//   func fp503MontgomeryReduce(z *FpElement, x *FpElementX2)
// It loads both pointers and expands the REDC macro with the
// MULS_128x320_MULXADX multiplier, the same expansion used by
// redcWithMULXADX.
//
// NOTE(review): the MULXADX variant is used unconditionally, with no CPU
// capability check. The TEXT directive also carries no NOSPLIT flag, unlike
// fp503Mul; confirm both are intended.
TEXT ·fp503MontgomeryReduce(SB), $0-16
// DI = z (result), SI = x (input).
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
REDC(DI, SI, MULS_128x320_MULXADX)
RET

TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24



+ 27
- 5
p503/arith_decl.go Просмотреть файл

@@ -41,7 +41,8 @@ func fp503StrongReduce(x *FpElement)
// Concrete implementation depends on capabilities of the CPU which
// are resolved at runtime. CPUs with ADCX, ADOX and MULX support
// run most optimized implementation
var fp503Mul func(z *FpElementX2, x, y *FpElement)
//go:noescape
// var fp503Mul func(z *FpElementX2, x, y *FpElement)

// Mul implementation for legacy CPUs
//go:noescape
@@ -53,12 +54,21 @@ func mulWithMULX(z *FpElementX2, x, y *FpElement)

// fp503Mul multiplies x by y and stores the double-width product in z.
// It is implemented in assembly, where it selects between the MULX-only and
// the MULX+ADCX/ADOX (two independent carry chains) code paths at run time
// based on the HasBMI2 variable.
//go:noescape
func fp503Mul(z *FpElementX2, x, y *FpElement)

// fp503MulXXX is an assembly multiplication routine that takes its result and
// operands as []uint64 slices instead of pointers to field element types.
// NOTE(review): the assembly reads only the slice data pointers, so the
// required slice lengths are not checked — confirm callers pass slices of
// the expected size.
//go:noescape
func fp503MulXXX(z, x, y []uint64)

// fp503Mul1 is a function variable with the same slice-based signature as
// fp503MulXXX; it is assigned to fp503MulXXX in an init function.
var fp503Mul1 func(z, x, y []uint64)

// mulWithMULXADX is the Mul implementation for CPUs supporting two
// independent carry chains (ADOX/ADCX) and the MULX multiplier.
//go:noescape
func mulWithMULXADX(z *FpElementX2, x, y *FpElement)

// Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return, the
// value of x may have been changed. z=x is not allowed.
var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2)
//go:noescape
func fp503MontgomeryReduce(z *FpElement, x *FpElementX2)

// redc is a body-less declaration of a reduction routine taking a result
// z and a double-width input x.
// NOTE(review): presumably the generic Montgomery reduction for CPUs without
// BMI2 (the disabled dispatch code selected it in that case) — confirm.
// Unlike the neighbouring declarations it carries no //go:noescape directive.
func redc(z *FpElement, x *FpElementX2)

@@ -73,17 +83,29 @@ func redcWithMULXADX(z *FpElement, x *FpElementX2)

// On initialization, point the fp503Mul1 function variable at the
// slice-based assembly routine fp503MulXXX. The assignment is unconditional;
// no CPU capability is consulted here.
func init() {
fp503Mul1 = fp503MulXXX
}

// HasBMI2 mirrors cpu.HasBMI2 as a package-level variable so that assembly
// code (fp503Mul) can read it when choosing a multiplication code path.
// It is initialized during package initialization, after the cpu package
// itself has been initialized.
var HasBMI2 = cpu.HasBMI2

/*
func init() {
if cpu.HasBMI2 {
if cpu.HasADX {
fp503Mul = mulWithMULXADX
//fp503Mul = mulWithMULXADX
fp503MontgomeryReduce = redcWithMULXADX
} else {
fp503Mul = mulWithMULX
//fp503Mul = mulWithMULX
fp503MontgomeryReduce = redcWithMULX
}
} else {
fp503Mul = mul
//fp503Mul = mul
fp503MontgomeryReduce = redc
}
}
*/

+ 9
- 14
p503/field_ops.go Просмотреть файл

@@ -21,13 +21,12 @@ func (fp503Ops) Sub(dest, lhs, rhs *Fp2Element) {
}

func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) {
// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b).
a := &lhs.A
b := &lhs.B
c := &rhs.A
d := &rhs.B
var b_minus_a, c_minus_d FpElement
var ad_plus_bc FpElementX2
var ac, bd FpElementX2
var ac_minus_bd FpElementX2

// We want to compute
// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b). We want to compute
//
// (a + bi)*(c + di) = (a*c - b*d) + (a*d + b*c)i
//
@@ -37,22 +36,18 @@ func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) {
//
// so (a*d + b*c) = (b-a)*(c-d) + a*c + b*d.

var ac, bd FpElementX2
fp503Mul(&ac, a, c) // = a*c*R*R
fp503Mul(&bd, b, d) // = b*d*R*R
fp503Mul(&ac, &lhs.A, &rhs.A) // = a*c*R*R
fp503Mul(&bd, &lhs.B, &rhs.B) // = b*d*R*R

var b_minus_a, c_minus_d FpElement
fp503SubReduced(&b_minus_a, b, a) // = (b-a)*R
fp503SubReduced(&c_minus_d, c, d) // = (c-d)*R
fp503SubReduced(&b_minus_a, &lhs.B, &lhs.A) // = (b-a)*R
fp503SubReduced(&c_minus_d, &rhs.A, &rhs.B) // = (c-d)*R

var ad_plus_bc FpElementX2
fp503Mul(&ad_plus_bc, &b_minus_a, &c_minus_d) // = (b-a)*(c-d)*R*R
fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &ac) // = ((b-a)*(c-d) + a*c)*R*R
fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &bd) // = ((b-a)*(c-d) + a*c + b*d)*R*R

fp503MontgomeryReduce(&dest.B, &ad_plus_bc) // = (a*d + b*c)*R mod p

var ac_minus_bd FpElementX2
fp503X2SubLazy(&ac_minus_bd, &ac, &bd) // = (a*c - b*d)*R*R
fp503MontgomeryReduce(&dest.A, &ac_minus_bd) // = (a*c - b*d)*R mod p
}


Загрузка…
Отмена
Сохранить