Browse Source

p751: add some more comments to clarify assembly code

trials/PERF
Ko- 6 years ago
parent
commit
ade27e3126
2 changed files with 24 additions and 3 deletions
  1. +18
    -0
      p751toolbox/field_amd64.s
  2. +6
    -3
      p751toolbox/field_decl.go

+ 18
- 0
p751toolbox/field_amd64.s View File

@@ -1417,6 +1417,11 @@ TEXT ·fp751Mul(SB), $96-24

RET

// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 and ADX instruction set extensions
// are available.
#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
@@ -1516,6 +1521,11 @@ TEXT ·fp751Mul(SB), $96-24
ADOXQ DX, T2 \
ADOXQ AX, T4

// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 instruction set extension is
// available.
#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
@@ -1738,6 +1748,9 @@ TEXT ·fp751Mul(SB), $96-24
MOVQ T5, 80+C \
MOVQ T6, 88+C

// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 and ADX instruction set extensions are available.
TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1
@@ -1759,6 +1772,9 @@ TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16

RET

// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 instruction set extension is available.
TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1
@@ -1780,6 +1796,8 @@ TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16

RET

// This implements the straightforward Montgomery reduction algorithm without
// using specific instruction set extensions.
TEXT ·fp751MontgomeryReduceFallback(SB), $0-16

MOVQ z+0(FP), REG_P2


+ 6
- 3
p751toolbox/field_decl.go View File

@@ -36,8 +36,10 @@ func fp751X2SubLazy(z, x, y *fp751X2)
//go:noescape
func fp751Mul(z *fp751X2, x, y *Fp751Element)

// fp751MontgomeryReduce is a function pointer that should point to one of the
// fp751MontgomeryReduce implementations below.
// When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p).
// It may destroy the input value.
var fp751MontgomeryReduce func(z *Fp751Element, x *fp751X2)

//go:noescape
@@ -53,7 +55,8 @@ func fp751MontgomeryReduceFallback(z *Fp751Element, x *fp751X2)
//go:noescape
func fp751StrongReduce(x *Fp751Element)

// On initialization, set the fp751MontgomeryReduce function pointer to the
// fastest implementation depending on CPU capabilities.
func init() {
if cpu.X86.HasBMI2 {
if cpu.X86.HasADX {


Loading…
Cancel
Save