作者 | SHA1 | 备注 | 提交日期 |
---|---|---|---|
Henry Case | 39ead4ce25 | WIP | 6 年前 |
Henry Case | c9349da3b6 |
fix: fixes vendoring issue
Reported: https://github.com/cloudflare/sidh/issues/21 See also: https://github.com/golang/go/issues/28230 |
6 年前 |
Henry Case | 137b47345f | makefile: when vendoring consider only .s and .go files | 6 年前 |
Henry Case | 7b83e32ecd | vendor: when vendoring also U+00B7 needs to be changed to _ | 6 年前 |
Henry Case | 5af598dcb3 | sidh: improve comment for ARM's redc implementation | 6 年前 |
Ko- | 67c90d155e |
Add field arithmetic in assembly for arm64 for p503 and p751 (#10)
sidh: ARM64 implementation * p503 and p751 implementation with two level Karatsuba Benchmarks: benchmark old ns/op new ns/op delta BenchmarkFp2ElementMul 5627 387 -93.12% BenchmarkFp2ElementInv 1272612 80818 -93.65% BenchmarkFp2ElementSquare 4168 314 -92.47% BenchmarkFp2ElementAdd 141 46.2 -67.23% BenchmarkFp2ElementSub 107 40.9 -61.78% BenchmarkPrimeFieldElementMul 2081 125 -93.99% BenchmarkFp503Multiply 1218 63.0 -94.83% BenchmarkFp503MontgomeryReduce 856 51.6 -93.97% BenchmarkFp503AddReduced 59.1 14.6 -75.30% BenchmarkFp503SubReduced 42.9 10.6 -75.29% BenchmarkFp503ConditionalSwap 37.3 18.1 -51.47% BenchmarkFp503StrongReduce 44.2 9.66 -78.14% BenchmarkFp503AddLazy 20.8 8.40 -59.62% BenchmarkFp503X2AddLazy 36.5 12.5 -65.75% BenchmarkFp503X2SubLazy 67.5 13.8 -79.56% BenchmarkThreePointLadder255BitScalar 16388474 1443781 -91.19% BenchmarkFp2ElementMul 12046 698 -94.21% BenchmarkFp2ElementInv 3927356 220699 -94.38% BenchmarkFp2ElementSquare 8992 559 -93.78% BenchmarkFp2ElementAdd 193 63.2 -67.25% BenchmarkFp2ElementSub 143 49.1 -65.66% BenchmarkPrimeFieldElementMul 4476 236 -94.73% BenchmarkFp751Multiply 2740 125 -95.44% BenchmarkFp751MontgomeryReduce 1738 101 -94.19% BenchmarkFp751AddReduced 85.2 19.9 -76.64% BenchmarkFp751SubReduced 62.0 17.2 -72.26% BenchmarkFp751ConditionalSwap 52.6 22.2 -57.79% BenchmarkFp751StrongReduce 61.9 13.3 -78.51% BenchmarkFp751AddLazy 28.7 10.9 -62.02% BenchmarkFp751X2AddLazy 52.2 15.8 -69.73% BenchmarkFp751X2SubLazy 97.6 20.6 -78.89% BenchmarkThreePointLadder379BitScalar 52570696 3571511 -93.21% BenchmarkR2L379BitScalar 52421015 3571497 -93.19% BenchmarkSidhKeyAgreementP751 482289441 32306099 -93.30% BenchmarkSidhKeyAgreementP503 144369772 12319167 -91.47% BenchmarkAliceKeyGenPrvP751 1428 1419 -0.63% BenchmarkAliceKeyGenPrvP503 1214 1200 -1.15% BenchmarkBobKeyGenPrvP751 1433 1428 -0.35% BenchmarkBobKeyGenPrvP503 1215 1204 -0.91% BenchmarkAliceKeyGenPubP751 270559528 17921608 -93.38% BenchmarkAliceKeyGenPubP503 82615895 6880817 -91.67% 
BenchmarkBobKeyGenPubP751 303556706 20901082 -93.11% BenchmarkBobKeyGenPubP503 91116505 8152519 -91.05% BenchmarkSharedSecretAliceP751 223935926 14567056 -93.49% BenchmarkSharedSecretAliceP503 67340445 5518566 -91.80% BenchmarkSharedSecretBobP751 259148479 17659648 -93.19% BenchmarkSharedSecretBobP503 76827887 6766748 -91.19% |
6 年前 |
@@ -43,11 +43,11 @@ copy-target-%: | |||
prep_targets: build_env $(addprefix copy-target-, $(TARGETS)) | |||
install-%: prep_targets | |||
GOPATH=$(GOPATH_LOCAL) $(GO) install $(OPTS) $(GOPATH_DIR)/$* | |||
GOPATH=$(GOPATH_LOCAL) GOARCH=$(GOARCH) $(GO) install $(OPTS) $(GOPATH_DIR)/$* | |||
test-%: prep_targets | |||
GOPATH=$(GOPATH_LOCAL) $(GO) vet $(GOPATH_DIR)/$* | |||
GOPATH=$(GOPATH_LOCAL) $(GO) test $(OPTS) $(GOPATH_DIR)/$* | |||
GOPATH=$(GOPATH_LOCAL) GOARCH=$(GOARCH) $(GO) test $(OPTS) $(GOPATH_DIR)/$* | |||
bench-%: prep_targets | |||
GOMAXPROCS=1 GOPATH=$(GOPATH_LOCAL) $(GO) test $(OPTS) $(GOPATH_DIR)/$* $(BENCH_OPTS) | |||
@@ -58,6 +58,7 @@ cover-%: prep_targets | |||
cat coverage_$*.txt >> coverage.txt | |||
rm coverage_$*.txt | |||
# This is a target used when vendoring to standard library | |||
vendor: clean | |||
mkdir -p $(VENDOR_DIR)/github_com/cloudflare/sidh/ | |||
rsync -a . $(VENDOR_DIR)/github_com/cloudflare/sidh/ \ | |||
@@ -67,8 +68,20 @@ vendor: clean | |||
--exclude=README.md \ | |||
--exclude=Makefile \ | |||
--exclude=build | |||
# This swaps all imports with github.com to github_com, so that standard library doesn't | |||
# try to access external libraries. | |||
find $(VENDOR_DIR) -type f -iname "*.go" -print0 | xargs -0 sed -i 's/github\.com/github_com/g' | |||
find $(VENDOR_DIR) -type f -name "*.go" -print0 | xargs -0 sed -i 's/github\.com/github_com/g' | |||
# Target used when vendoring a package | |||
vendor-package: clean | |||
mkdir -p $(VENDOR_DIR)/github.com/cloudflare/sidh/ | |||
rsync -a . $(VENDOR_DIR)/github.com/cloudflare/sidh/ \ | |||
--exclude=$(VENDOR_DIR) \ | |||
--exclude=.git \ | |||
--exclude=.travis.yml \ | |||
--exclude=README.md \ | |||
--exclude=Makefile \ | |||
--exclude=build | |||
bench: $(addprefix bench-, $(TARGETS)) | |||
cover: $(addprefix cover-, $(TARGETS)) | |||
@@ -1,4 +1,4 @@ | |||
// +build amd64, !noasm | |||
// +build amd64,!noasm | |||
// Sets capabilities flags for x86 according to information received from | |||
// CPUID. It was written in accordance with | |||
@@ -10,8 +10,8 @@ package utils | |||
// Signals support for MULX which is in BMI2 | |||
var HasBMI2 bool | |||
// Signals support for ADX and BMI2 | |||
var HasADXandBMI2 bool | |||
// Signals support for ADX | |||
var HasADX bool | |||
// Performs CPUID and returns values of registers | |||
// go:nosplit | |||
@@ -31,7 +31,7 @@ func RecognizeCpu() { | |||
_, ebx, _, _ := cpuid(7, 0) | |||
HasBMI2 = bitn(ebx, 19) | |||
HasADXandBMI2 = bitn(ebx, 7) && HasBMI2 | |||
HasADX = bitn(ebx, 7) | |||
} | |||
func init() { | |||
@@ -0,0 +1,16 @@ | |||
// +build amd64,!noasm | |||
package p503 | |||
import cpu "github.com/cloudflare/sidh/internal/utils" | |||
// Signals support for MULX which is in BMI2 | |||
var HasBMI2 bool | |||
// Signals support for ADX and BMI2 | |||
var HasADXandBMI2 bool | |||
// init copies the CPU capability flags exposed by the cpu (internal/utils) | |||
// package into this package's own HasBMI2 and HasADXandBMI2 variables. | |||
// HasADXandBMI2 is set only when both BMI2 and ADX are reported. | |||
func init() { | |||
HasBMI2 = cpu.HasBMI2 | |||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||
} |
@@ -697,9 +697,9 @@ TEXT ·fp503Mul(SB), NOSPLIT, $104-24 | |||
MOVQ y+16(FP), REG_P2 | |||
// Check whether to use optimized implementation | |||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | |||
CMPB ·HasADXandBMI2(SB), $1 | |||
JE mul_with_mulx_adcx_adox | |||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | |||
CMPB ·HasBMI2(SB), $1 | |||
JE mul_with_mulx | |||
// Generic x86 implementation (below) uses variant of Karatsuba method. | |||
@@ -1194,9 +1194,9 @@ TEXT ·fp503MontgomeryReduce(SB), $0-16 | |||
MOVQ x+8(FP), REG_P1 | |||
// Check whether to use optimized implementation | |||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | |||
CMPB ·HasADXandBMI2(SB), $1 | |||
JE redc_with_mulx_adcx_adox | |||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | |||
CMPB ·HasBMI2(SB), $1 | |||
JE redc_with_mulx | |||
MOVQ (REG_P1), R11 | |||
@@ -21,21 +21,26 @@ const ( | |||
kUse_MULXandADxX = 1 << 2 | |||
) | |||
// resetCpuFeatures restores the package-level HasBMI2 and HasADXandBMI2 flags | |||
// from the values held by the cpu package. The tests in this file defer it | |||
// after overriding those flags to force a particular implementation. | |||
func resetCpuFeatures() { | |||
HasBMI2 = cpu.HasBMI2 | |||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||
} | |||
// Utility function used for testing Mul implementations. Tests caller provided | |||
// mulFunc against mul() | |||
func testMul(t *testing.T, f1, f2 OptimFlag) { | |||
doMulTest := func(multiplier, multiplicant FpElement) bool { | |||
defer cpu.RecognizeCpu() | |||
defer resetCpuFeatures() | |||
var resMulRef, resMulOptim FpElementX2 | |||
// Compute multiplier*multiplicant with first implementation | |||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||
HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||
HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||
fp503Mul(&resMulOptim, &multiplier, &multiplicant) | |||
// Compute multiplier*multiplicant with second implementation | |||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||
HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||
HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||
fp503Mul(&resMulRef, &multiplier, &multiplicant) | |||
// Compare results | |||
@@ -51,18 +56,18 @@ func testMul(t *testing.T, f1, f2 OptimFlag) { | |||
// redcFunc against redc() | |||
func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||
doRedcTest := func(aRR FpElementX2) bool { | |||
defer cpu.RecognizeCpu() | |||
defer resetCpuFeatures() | |||
var resRedcF1, resRedcF2 FpElement | |||
var aRRcpy = aRR | |||
// Compute redc with first implementation | |||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||
HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||
HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||
fp503MontgomeryReduce(&resRedcF1, &aRR) | |||
// Compute redc with second implementation | |||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||
HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||
HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||
fp503MontgomeryReduce(&resRedcF2, &aRRcpy) | |||
// Compare results | |||
@@ -76,8 +81,8 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||
// Ensures correctness of implementation of mul operation which uses MULX | |||
func TestMulWithMULX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasBMI2 { | |||
t.Skip("MULX not supported by the platform") | |||
} | |||
testMul(t, kUse_MULX, kUse_MUL) | |||
@@ -85,8 +90,8 @@ func TestMulWithMULX(t *testing.T) { | |||
// Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | |||
func TestMulWithMULXADxX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasADXandBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasADXandBMI2 { | |||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | |||
} | |||
testMul(t, kUse_MULXandADxX, kUse_MUL) | |||
@@ -94,8 +99,8 @@ func TestMulWithMULXADxX(t *testing.T) { | |||
// Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | |||
func TestMulWithMULXADxXAgainstMULX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasADXandBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasADXandBMI2 { | |||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | |||
} | |||
testMul(t, kUse_MULX, kUse_MULXandADxX) | |||
@@ -103,8 +108,8 @@ func TestMulWithMULXADxXAgainstMULX(t *testing.T) { | |||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||
func TestRedcWithMULX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasBMI2 { | |||
t.Skip("MULX not supported by the platform") | |||
} | |||
testRedc(t, kUse_MULX, kUse_MUL) | |||
@@ -113,8 +118,8 @@ func TestRedcWithMULX(t *testing.T) { | |||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||
// and ADCX/ADOX. | |||
func TestRedcWithMULXADxX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasADXandBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasADXandBMI2 { | |||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | |||
} | |||
testRedc(t, kUse_MULXandADxX, kUse_MUL) | |||
@@ -123,8 +128,8 @@ func TestRedcWithMULXADxX(t *testing.T) { | |||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||
// and ADCX/ADOX. | |||
func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasADXandBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasADXandBMI2 { | |||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | |||
} | |||
testRedc(t, kUse_MULXandADxX, kUse_MULX) | |||
@@ -0,0 +1,802 @@ | |||
// +build arm64,!noasm | |||
#include "textflag.h" | |||
// fp503ConditionalSwap(x, y, choice) | |||
// Exchanges the eight 64-bit limbs stored at x with those stored at y when | |||
// the choice byte is non-zero; leaves both unchanged when choice is zero. | |||
// Every limb is selected with CSEL, so there is no branch on choice. | |||
TEXT ·fp503ConditionalSwap(SB), NOSPLIT, $0-17 | |||
MOVD x+0(FP), R0 | |||
MOVD y+8(FP), R1 | |||
MOVB choice+16(FP), R2 | |||
// Set flags | |||
// If choice is not 0 or 1, this implementation will swap completely | |||
CMP $0, R2 | |||
// Limbs 0-1: EQ (choice == 0) keeps x's limbs in x, otherwise y's are taken; | |||
// the NE selection below is the mirror image for y. | |||
LDP 0(R0), (R3, R4) | |||
LDP 0(R1), (R5, R6) | |||
CSEL EQ, R3, R5, R7 | |||
CSEL EQ, R4, R6, R8 | |||
STP (R7, R8), 0(R0) | |||
CSEL NE, R3, R5, R9 | |||
CSEL NE, R4, R6, R10 | |||
STP (R9, R10), 0(R1) | |||
// Limbs 2-3 | |||
LDP 16(R0), (R3, R4) | |||
LDP 16(R1), (R5, R6) | |||
CSEL EQ, R3, R5, R7 | |||
CSEL EQ, R4, R6, R8 | |||
STP (R7, R8), 16(R0) | |||
CSEL NE, R3, R5, R9 | |||
CSEL NE, R4, R6, R10 | |||
STP (R9, R10), 16(R1) | |||
// Limbs 4-5 | |||
LDP 32(R0), (R3, R4) | |||
LDP 32(R1), (R5, R6) | |||
CSEL EQ, R3, R5, R7 | |||
CSEL EQ, R4, R6, R8 | |||
STP (R7, R8), 32(R0) | |||
CSEL NE, R3, R5, R9 | |||
CSEL NE, R4, R6, R10 | |||
STP (R9, R10), 32(R1) | |||
// Limbs 6-7 | |||
LDP 48(R0), (R3, R4) | |||
LDP 48(R1), (R5, R6) | |||
CSEL EQ, R3, R5, R7 | |||
CSEL EQ, R4, R6, R8 | |||
STP (R7, R8), 48(R0) | |||
CSEL NE, R3, R5, R9 | |||
CSEL NE, R4, R6, R10 | |||
STP (R9, R10), 48(R1) | |||
RET | |||
// fp503AddReduced(z, x, y) | |||
// Adds the eight-limb values at x and y, subtracts 2 * p503, and adds | |||
// 2 * p503 back (through a mask, without branching) when that subtraction | |||
// borrowed. The eight result limbs are stored at z. | |||
TEXT ·fp503AddReduced(SB), NOSPLIT, $0-24 | |||
MOVD z+0(FP), R2 | |||
MOVD x+8(FP), R0 | |||
MOVD y+16(FP), R1 | |||
// Load first summand into R3-R10 | |||
// Add first summand and second summand and store result in R3-R10 | |||
LDP 0(R0), (R3, R4) | |||
LDP 0(R1), (R11, R12) | |||
LDP 16(R0), (R5, R6) | |||
LDP 16(R1), (R13, R14) | |||
ADDS R11, R3 | |||
ADCS R12, R4 | |||
ADCS R13, R5 | |||
ADCS R14, R6 | |||
LDP 32(R0), (R7, R8) | |||
LDP 32(R1), (R11, R12) | |||
LDP 48(R0), (R9, R10) | |||
LDP 48(R1), (R13, R14) | |||
ADCS R11, R7 | |||
ADCS R12, R8 | |||
ADCS R13, R9 | |||
ADC R14, R10 | |||
// Subtract 2 * p503 in R11-R17 from the result in R3-R10 | |||
// R12 (limb 1) is reused for limb 2; presumably limbs 1 and 2 of p503x2 | |||
// are equal - confirm against the definition of p503x2. | |||
LDP ·p503x2+0(SB), (R11, R12) | |||
LDP ·p503x2+24(SB), (R13, R14) | |||
SUBS R11, R3 | |||
SBCS R12, R4 | |||
LDP ·p503x2+40(SB), (R15, R16) | |||
SBCS R12, R5 | |||
SBCS R13, R6 | |||
MOVD ·p503x2+56(SB), R17 | |||
SBCS R14, R7 | |||
SBCS R15, R8 | |||
SBCS R16, R9 | |||
SBCS R17, R10 | |||
SBC ZR, ZR, R19 | |||
// If x + y - 2 * p503 < 0, R19 is an all-ones mask (zero otherwise) and | |||
// 2 * p503 should be added | |||
AND R19, R11 | |||
AND R19, R12 | |||
AND R19, R13 | |||
AND R19, R14 | |||
AND R19, R15 | |||
AND R19, R16 | |||
AND R19, R17 | |||
ADDS R11, R3 | |||
ADCS R12, R4 | |||
STP (R3, R4), 0(R2) | |||
ADCS R12, R5 | |||
ADCS R13, R6 | |||
STP (R5, R6), 16(R2) | |||
ADCS R14, R7 | |||
ADCS R15, R8 | |||
STP (R7, R8), 32(R2) | |||
ADCS R16, R9 | |||
ADC R17, R10 | |||
STP (R9, R10), 48(R2) | |||
RET | |||
// fp503SubReduced(z, x, y) | |||
// Subtracts the eight-limb value at y from the one at x and adds 2 * p503 | |||
// (through a mask, without branching) when the subtraction borrowed. | |||
// The eight result limbs are stored at z. | |||
TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24 | |||
MOVD z+0(FP), R2 | |||
MOVD x+8(FP), R0 | |||
MOVD y+16(FP), R1 | |||
// Load x into R3-R10 | |||
// Subtract y from x and store result in R3-R10 | |||
LDP 0(R0), (R3, R4) | |||
LDP 0(R1), (R11, R12) | |||
LDP 16(R0), (R5, R6) | |||
LDP 16(R1), (R13, R14) | |||
SUBS R11, R3 | |||
SBCS R12, R4 | |||
SBCS R13, R5 | |||
SBCS R14, R6 | |||
LDP 32(R0), (R7, R8) | |||
LDP 32(R1), (R11, R12) | |||
LDP 48(R0), (R9, R10) | |||
LDP 48(R1), (R13, R14) | |||
SBCS R11, R7 | |||
SBCS R12, R8 | |||
SBCS R13, R9 | |||
SBCS R14, R10 | |||
SBC ZR, ZR, R19 | |||
// If x - y < 0, R19 is an all-ones mask (zero otherwise) and 2 * p503 | |||
// should be added. | |||
// R12 (limb 1) is reused for limb 2; presumably limbs 1 and 2 of p503x2 | |||
// are equal - confirm against the definition of p503x2. | |||
LDP ·p503x2+0(SB), (R11, R12) | |||
LDP ·p503x2+24(SB), (R13, R14) | |||
AND R19, R11 | |||
AND R19, R12 | |||
LDP ·p503x2+40(SB), (R15, R16) | |||
AND R19, R13 | |||
AND R19, R14 | |||
MOVD ·p503x2+56(SB), R17 | |||
AND R19, R15 | |||
AND R19, R16 | |||
AND R19, R17 | |||
ADDS R11, R3 | |||
ADCS R12, R4 | |||
STP (R3, R4), 0(R2) | |||
ADCS R12, R5 | |||
ADCS R13, R6 | |||
STP (R5, R6), 16(R2) | |||
ADCS R14, R7 | |||
ADCS R15, R8 | |||
STP (R7, R8), 32(R2) | |||
ADCS R16, R9 | |||
ADC R17, R10 | |||
STP (R9, R10), 48(R2) | |||
RET | |||
// fp503AddLazy(z, x, y) | |||
// Stores at z the eight-limb sum of the values at x and y. No modular | |||
// reduction is performed and the carry out of the top limb is not kept. | |||
TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24 | |||
MOVD z+0(FP), R2 | |||
MOVD x+8(FP), R0 | |||
MOVD y+16(FP), R1 | |||
// Load first summand into R3-R10 | |||
// Add first summand and second summand and store result in R3-R10 | |||
LDP 0(R0), (R3, R4) | |||
LDP 0(R1), (R11, R12) | |||
LDP 16(R0), (R5, R6) | |||
LDP 16(R1), (R13, R14) | |||
ADDS R11, R3 | |||
ADCS R12, R4 | |||
STP (R3, R4), 0(R2) | |||
ADCS R13, R5 | |||
ADCS R14, R6 | |||
STP (R5, R6), 16(R2) | |||
LDP 32(R0), (R7, R8) | |||
LDP 32(R1), (R11, R12) | |||
LDP 48(R0), (R9, R10) | |||
LDP 48(R1), (R13, R14) | |||
ADCS R11, R7 | |||
ADCS R12, R8 | |||
STP (R7, R8), 32(R2) | |||
ADCS R13, R9 | |||
ADC R14, R10 | |||
STP (R9, R10), 48(R2) | |||
RET | |||
// fp503X2AddLazy(z, x, y) | |||
// Stores at z the sixteen-limb sum of the double-width values at x and y. | |||
// The carry chain runs through all sixteen limbs; no modular reduction is | |||
// performed and the carry out of the top limb is not kept. | |||
TEXT ·fp503X2AddLazy(SB), NOSPLIT, $0-24 | |||
MOVD z+0(FP), R2 | |||
MOVD x+8(FP), R0 | |||
MOVD y+16(FP), R1 | |||
// Limbs 0-3 | |||
LDP 0(R0), (R3, R4) | |||
LDP 0(R1), (R11, R12) | |||
LDP 16(R0), (R5, R6) | |||
LDP 16(R1), (R13, R14) | |||
ADDS R11, R3 | |||
ADCS R12, R4 | |||
STP (R3, R4), 0(R2) | |||
ADCS R13, R5 | |||
ADCS R14, R6 | |||
STP (R5, R6), 16(R2) | |||
// Limbs 4-7 | |||
LDP 32(R0), (R7, R8) | |||
LDP 32(R1), (R11, R12) | |||
LDP 48(R0), (R9, R10) | |||
LDP 48(R1), (R13, R14) | |||
ADCS R11, R7 | |||
ADCS R12, R8 | |||
STP (R7, R8), 32(R2) | |||
ADCS R13, R9 | |||
ADCS R14, R10 | |||
STP (R9, R10), 48(R2) | |||
// Limbs 8-11 | |||
LDP 64(R0), (R3, R4) | |||
LDP 64(R1), (R11, R12) | |||
LDP 80(R0), (R5, R6) | |||
LDP 80(R1), (R13, R14) | |||
ADCS R11, R3 | |||
ADCS R12, R4 | |||
STP (R3, R4), 64(R2) | |||
ADCS R13, R5 | |||
ADCS R14, R6 | |||
STP (R5, R6), 80(R2) | |||
// Limbs 12-15 | |||
LDP 96(R0), (R7, R8) | |||
LDP 96(R1), (R11, R12) | |||
LDP 112(R0), (R9, R10) | |||
LDP 112(R1), (R13, R14) | |||
ADCS R11, R7 | |||
ADCS R12, R8 | |||
STP (R7, R8), 96(R2) | |||
ADCS R13, R9 | |||
ADC R14, R10 | |||
STP (R9, R10), 112(R2) | |||
RET | |||
// fp503X2SubLazy(z, x, y) | |||
// Subtracts the sixteen-limb value at y from the one at x. The low eight | |||
// limbs of the difference are stored at z directly. If the full subtraction | |||
// borrowed, p503 is added (through a mask, without branching) to the high | |||
// eight limbs before they are stored at z+64..z+127. | |||
TEXT ·fp503X2SubLazy(SB), NOSPLIT, $0-24 | |||
MOVD z+0(FP), R2 | |||
MOVD x+8(FP), R0 | |||
MOVD y+16(FP), R1 | |||
// Limbs 0-3 | |||
LDP 0(R0), (R3, R4) | |||
LDP 0(R1), (R11, R12) | |||
LDP 16(R0), (R5, R6) | |||
LDP 16(R1), (R13, R14) | |||
SUBS R11, R3 | |||
SBCS R12, R4 | |||
STP (R3, R4), 0(R2) | |||
SBCS R13, R5 | |||
SBCS R14, R6 | |||
STP (R5, R6), 16(R2) | |||
// Limbs 4-7 | |||
LDP 32(R0), (R7, R8) | |||
LDP 32(R1), (R11, R12) | |||
LDP 48(R0), (R9, R10) | |||
LDP 48(R1), (R13, R14) | |||
SBCS R11, R7 | |||
SBCS R12, R8 | |||
STP (R7, R8), 32(R2) | |||
SBCS R13, R9 | |||
SBCS R14, R10 | |||
STP (R9, R10), 48(R2) | |||
// Limbs 8-15 stay in R3-R10 until the correction below has been applied | |||
LDP 64(R0), (R3, R4) | |||
LDP 64(R1), (R11, R12) | |||
LDP 80(R0), (R5, R6) | |||
LDP 80(R1), (R13, R14) | |||
SBCS R11, R3 | |||
SBCS R12, R4 | |||
SBCS R13, R5 | |||
SBCS R14, R6 | |||
LDP 96(R0), (R7, R8) | |||
LDP 96(R1), (R11, R12) | |||
LDP 112(R0), (R9, R10) | |||
LDP 112(R1), (R13, R14) | |||
SBCS R11, R7 | |||
SBCS R12, R8 | |||
SBCS R13, R9 | |||
SBCS R14, R10 | |||
SBC ZR, ZR, R15 | |||
// If x - y < 0, R15 is an all-ones mask (zero otherwise) and p503 should | |||
// be added. | |||
// R16 (limb 2) is reused for limbs 0 and 1; presumably the three lowest | |||
// limbs of p503 are equal - confirm against the definition of p503. | |||
LDP ·p503+16(SB), (R16, R17) | |||
LDP ·p503+32(SB), (R19, R20) | |||
AND R15, R16 | |||
AND R15, R17 | |||
LDP ·p503+48(SB), (R21, R22) | |||
AND R15, R19 | |||
AND R15, R20 | |||
AND R15, R21 | |||
AND R15, R22 | |||
ADDS R16, R3 | |||
ADCS R16, R4 | |||
STP (R3, R4), 64(R2) | |||
ADCS R16, R5 | |||
ADCS R17, R6 | |||
STP (R5, R6), 80(R2) | |||
ADCS R19, R7 | |||
ADCS R20, R8 | |||
STP (R7, R8), 96(R2) | |||
ADCS R21, R9 | |||
ADC R22, R10 | |||
STP (R9, R10), 112(R2) | |||
RET | |||
// mul128x128comba: 128x128 -> 256-bit multiplication of the two-limb values | |||
// (X0,X1) and (Y0,Y1). In the comments below "A-B" denotes a register range, | |||
// not a subtraction. | |||
// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high) | |||
// Z0 is not actually touched | |||
// Result of (X0-X1) * (Y0-Y1) will be in Z0-Z3 | |||
// Inputs get overwritten, except for X1 | |||
// T0 is a scratch register. | |||
#define mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, Z2, Z3, T0) \ | |||
MUL X1, Y0, X0 \ | |||
UMULH X1, Y0, Y0 \ | |||
ADDS Z3, Z1 \ | |||
ADC ZR, Z2 \ | |||
\ | |||
MUL Y1, X1, T0 \ | |||
UMULH Y1, X1, Y1 \ | |||
ADDS X0, Z1 \ | |||
ADCS Y0, Z2 \ | |||
ADC ZR, ZR, Z3 \ | |||
\ | |||
ADDS T0, Z2 \ | |||
ADC Y1, Z3 | |||
// mul256x256karatsuba: 256x256 -> 512-bit multiplication built from three | |||
// mul128x128comba invocations ((xH + xL) * (yH + yL), xL * yL and xH * yH). | |||
// X is a memory operand from which X0,X1 are reloaded after being destroyed. | |||
// In the comments below "A-B" denotes a register range, not a subtraction. | |||
// Expects that X points to (X0-X1) | |||
// Result of (X0-X3) * (Y0-Y3) will be in Z0-Z7 | |||
// Inputs get overwritten, except X2-X3 and Y2-Y3 | |||
// NOTE(review): the last mul128x128comba invocation receives X2, Y2 and Y3 in | |||
// the parameter slots that macro overwrites, so only X3 appears to survive - | |||
// confirm before relying on X2/Y2/Y3 after this macro. | |||
// T0 and T1 are scratch registers. | |||
#define mul256x256karatsuba(X, X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1)\ | |||
ADDS X2, X0 \ // xH + xL, destroys xL | |||
ADCS X3, X1 \ | |||
ADCS ZR, ZR, T0 \ | |||
\ | |||
ADDS Y2, Y0, Z6 \ // yH + yL | |||
ADCS Y3, Y1, T1 \ | |||
ADC ZR, ZR, Z7 \ | |||
\ | |||
SUB T0, ZR, Z2 \ | |||
SUB Z7, ZR, Z3 \ | |||
AND Z7, T0 \ // combined carry | |||
\ | |||
AND Z2, Z6, Z0 \ // masked(yH + yL) | |||
AND Z2, T1, Z1 \ | |||
\ | |||
AND Z3, X0, Z4 \ // masked(xH + xL) | |||
AND Z3, X1, Z5 \ | |||
\ | |||
MUL Z6, X0, Z2 \ | |||
MUL T1, X0, Z3 \ | |||
\ | |||
ADDS Z4, Z0 \ | |||
UMULH T1, X0, Z4 \ | |||
ADCS Z5, Z1 \ | |||
UMULH Z6, X0, Z5 \ | |||
ADC ZR, T0 \ | |||
\ // (xH + xL) * (yH + yL) | |||
mul128x128comba(X0, X1, Z6, T1, Z2, Z3, Z4, Z5, Z7)\ | |||
\ | |||
LDP 0+X, (X0, X1) \ | |||
\ | |||
ADDS Z0, Z4 \ | |||
UMULH Y0, X0, Z7 \ | |||
UMULH Y1, X0, T1 \ | |||
ADCS Z1, Z5 \ | |||
MUL Y0, X0, Z0 \ | |||
MUL Y1, X0, Z1 \ | |||
ADC ZR, T0 \ | |||
\ // xL * yL | |||
mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, T1, Z7, Z6)\ | |||
\ | |||
MUL Y2, X2, X0 \ | |||
UMULH Y2, X2, Y0 \ | |||
SUBS Z0, Z2 \ // (xH + xL) * (yH + yL) - xL * yL | |||
SBCS Z1, Z3 \ | |||
SBCS T1, Z4 \ | |||
MUL Y3, X2, X1 \ | |||
UMULH Y3, X2, Z6 \ | |||
SBCS Z7, Z5 \ | |||
SBCS ZR, T0 \ | |||
\ // xH * yH | |||
mul128x128comba(X2, X3, Y2, Y3, X0, X1, Z6, Y0, Y1)\ | |||
\ | |||
SUBS X0, Z2 \ // (xH + xL) * (yH + yL) - xL * yL - xH * yH | |||
SBCS X1, Z3 \ | |||
SBCS Z6, Z4 \ | |||
SBCS Y0, Z5 \ | |||
SBCS ZR, T0 \ | |||
\ | |||
ADDS T1, Z2 \ // (xH * yH) * 2^256 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^128 + xL * yL | |||
ADCS Z7, Z3 \ | |||
ADCS X0, Z4 \ | |||
ADCS X1, Z5 \ | |||
ADCS T0, Z6 \ | |||
ADC Y0, ZR, Z7 | |||
// fp503Mul(z, x, y) | |||
// Multiplies the eight-limb values at x and y and stores the sixteen-limb | |||
// product at z. | |||
// This implements two-level Karatsuba with a 128x128 Comba multiplier | |||
// at the bottom | |||
// z is also used as scratch space while the product is being computed, so | |||
// presumably z must not overlap x or y - confirm against the callers. | |||
TEXT ·fp503Mul(SB), NOSPLIT, $0-24 | |||
MOVD z+0(FP), R2 | |||
MOVD x+8(FP), R0 | |||
MOVD y+16(FP), R1 | |||
// Load xL in R3-R6, xH in R7-R10 | |||
// (xH + xL) in R25-R27,R29; its carry replaces xH's low limb in R7 | |||
LDP 0(R0), (R3, R4) | |||
LDP 32(R0), (R7, R8) | |||
ADDS R3, R7, R25 | |||
ADCS R4, R8, R26 | |||
LDP 16(R0), (R5, R6) | |||
LDP 48(R0), (R9, R10) | |||
ADCS R5, R9, R27 | |||
ADCS R6, R10, R29 | |||
ADC ZR, ZR, R7 | |||
// Load yL in R11-R14, yH in R15-R17,R19 | |||
// (yH + yL) in R11-R14, destroys yL; its carry goes to R8 | |||
LDP 0(R1), (R11, R12) | |||
LDP 32(R1), (R15, R16) | |||
ADDS R15, R11 | |||
ADCS R16, R12 | |||
LDP 16(R1), (R13, R14) | |||
LDP 48(R1), (R17, R19) | |||
ADCS R17, R13 | |||
ADCS R19, R14 | |||
ADC ZR, ZR, R8 | |||
// Compute masks and combined carry | |||
SUB R7, ZR, R9 | |||
SUB R8, ZR, R10 | |||
AND R8, R7 | |||
// masked(yH + yL) | |||
AND R9, R11, R15 | |||
AND R9, R12, R16 | |||
AND R9, R13, R17 | |||
AND R9, R14, R19 | |||
// masked(xH + xL) | |||
AND R10, R25, R20 | |||
AND R10, R26, R21 | |||
AND R10, R27, R22 | |||
AND R10, R29, R23 | |||
// masked(xH + xL) + masked(yH + yL) in R15-R17,R19 | |||
ADDS R20, R15 | |||
ADCS R21, R16 | |||
ADCS R22, R17 | |||
ADCS R23, R19 | |||
ADC ZR, R7 | |||
// Use z as temporary storage | |||
STP (R25, R26), 0(R2) | |||
// (xH + xL) * (yH + yL) | |||
mul256x256karatsuba(0(R2), R25, R26, R27, R29, R11, R12, R13, R14, R8, R9, R10, R20, R21, R22, R23, R24, R0, R1) | |||
// R0 and R1 served as scratch registers in the macro; reload the arguments | |||
MOVD x+8(FP), R0 | |||
MOVD y+16(FP), R1 | |||
ADDS R21, R15 | |||
ADCS R22, R16 | |||
ADCS R23, R17 | |||
ADCS R24, R19 | |||
ADC ZR, R7 | |||
// Load yL in R11-R14 | |||
LDP 0(R1), (R11, R12) | |||
LDP 16(R1), (R13, R14) | |||
// xL * yL | |||
mul256x256karatsuba(0(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R1, R2) | |||
// R1 and R2 served as scratch registers in the macro; reload the arguments | |||
MOVD z+0(FP), R2 | |||
MOVD y+16(FP), R1 | |||
// (xH + xL) * (yH + yL) - xL * yL | |||
SUBS R21, R8 | |||
SBCS R22, R9 | |||
STP (R21, R22), 0(R2) | |||
SBCS R23, R10 | |||
SBCS R24, R20 | |||
STP (R23, R24), 16(R2) | |||
SBCS R25, R15 | |||
SBCS R26, R16 | |||
SBCS R27, R17 | |||
SBCS R29, R19 | |||
SBC ZR, R7 | |||
// Load xH in R3-R6, yH in R11-R14 | |||
LDP 32(R0), (R3, R4) | |||
LDP 48(R0), (R5, R6) | |||
LDP 32(R1), (R11, R12) | |||
LDP 48(R1), (R13, R14) | |||
ADDS R25, R8 | |||
ADCS R26, R9 | |||
ADCS R27, R10 | |||
ADCS R29, R20 | |||
ADC ZR, ZR, R1 | |||
// Spill R20 to z because the macro below uses R20 as a scratch register | |||
MOVD R20, 32(R2) | |||
// xH * yH | |||
mul256x256karatsuba(32(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R2, R20) | |||
NEG R1, R1 | |||
MOVD z+0(FP), R2 | |||
MOVD 32(R2), R20 | |||
// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R8-R10,R20,R15-R17,R19 | |||
// Store lower half in z, that's done | |||
SUBS R21, R8 | |||
SBCS R22, R9 | |||
STP (R8, R9), 32(R2) | |||
SBCS R23, R10 | |||
SBCS R24, R20 | |||
STP (R10, R20), 48(R2) | |||
SBCS R25, R15 | |||
SBCS R26, R16 | |||
SBCS R27, R17 | |||
SBCS R29, R19 | |||
SBC ZR, R7 | |||
// (xH * yH) * 2^512 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^256 + xL * yL | |||
// Store remaining limbs in z | |||
// R1 was negated above; adding 1 sets the carry flag exactly when R1 is all ones | |||
ADDS $1, R1 | |||
ADCS R21, R15 | |||
ADCS R22, R16 | |||
STP (R15, R16), 64(R2) | |||
ADCS R23, R17 | |||
ADCS R24, R19 | |||
STP (R17, R19), 80(R2) | |||
ADCS R7, R25 | |||
ADCS ZR, R26 | |||
STP (R25, R26), 96(R2) | |||
ADCS ZR, R27 | |||
ADC ZR, R29 | |||
STP (R27, R29), 112(R2) | |||
RET | |||
// mul128x256comba: 128x256 -> 384-bit multiplication of the two-limb value | |||
// (X0,X1) by the four-limb value (Y0,Y1,Y2,Y3). In the comments below "A-B" | |||
// denotes a register range, not a subtraction. | |||
// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high) | |||
// Z0 is not actually touched | |||
// Result of (X0-X1) * (Y0-Y3) will be in Z0-Z5 | |||
// Inputs remain intact | |||
// T0-T3 are scratch registers. | |||
#define mul128x256comba(X0, X1, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3)\ | |||
MUL X1, Y0, T0 \ | |||
UMULH X1, Y0, T1 \ | |||
ADDS Z3, Z1 \ | |||
ADC ZR, Z2 \ | |||
\ | |||
MUL X0, Y2, T2 \ | |||
UMULH X0, Y2, T3 \ | |||
ADDS T0, Z1 \ | |||
ADCS T1, Z2 \ | |||
ADC ZR, ZR, Z3 \ | |||
\ | |||
MUL X1, Y1, T0 \ | |||
UMULH X1, Y1, T1 \ | |||
ADDS T2, Z2 \ | |||
ADCS T3, Z3 \ | |||
ADC ZR, ZR, Z4 \ | |||
\ | |||
MUL X0, Y3, T2 \ | |||
UMULH X0, Y3, T3 \ | |||
ADDS T0, Z2 \ | |||
ADCS T1, Z3 \ | |||
ADC ZR, Z4 \ | |||
\ | |||
MUL X1, Y2, T0 \ | |||
UMULH X1, Y2, T1 \ | |||
ADDS T2, Z3 \ | |||
ADCS T3, Z4 \ | |||
ADC ZR, ZR, Z5 \ | |||
\ | |||
MUL X1, Y3, T2 \ | |||
UMULH X1, Y3, T3 \ | |||
ADDS T0, Z3 \ | |||
ADCS T1, Z4 \ | |||
ADC ZR, Z5 \ | |||
ADDS T2, Z4 \ | |||
ADC T3, Z5 | |||
// fp503MontgomeryReduce(z, x) | |||
// Reads the sixteen-limb value at x and stores an eight-limb result at z. | |||
// This implements the shifted 2^(B*w) Montgomery reduction from | |||
// https://eprint.iacr.org/2016/986.pdf, Section 3.2, with | |||
// B = 4, w = 64. Performance results were reported in | |||
// https://eprint.iacr.org/2018/700.pdf Section 6. | |||
// The reduction runs in four rounds. Each round multiplies two limbs of the | |||
// running value by the four upper limbs of p503p1s8, shifts the 384-bit | |||
// product left by 56 bits and adds it into the running value. | |||
TEXT ·fp503MontgomeryReduce(SB), NOSPLIT, $0-16 | |||
MOVD x+8(FP), R0 | |||
// Load x0-x1 | |||
LDP 0(R0), (R2, R3) | |||
// Load the prime constant in R25-R27,R29 | |||
LDP ·p503p1s8+32(SB), (R25, R26) | |||
LDP ·p503p1s8+48(SB), (R27, R29) | |||
// [x0,x1] * p503p1s8 to R4-R9 | |||
MUL R2, R25, R4 // x0 * p503p1s8[0] | |||
UMULH R2, R25, R7 | |||
MUL R2, R26, R5 // x0 * p503p1s8[1] | |||
UMULH R2, R26, R6 | |||
mul128x256comba(R2, R3, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13) | |||
LDP 16(R0), (R3, R11) // x2 | |||
LDP 32(R0), (R12, R13) | |||
LDP 48(R0), (R14, R15) | |||
// Left-shift result in R4-R9 by 56 to R4-R10 | |||
ORR R9>>8, ZR, R10 | |||
LSL $56, R9 | |||
ORR R8>>8, R9 | |||
LSL $56, R8 | |||
ORR R7>>8, R8 | |||
LSL $56, R7 | |||
ORR R6>>8, R7 | |||
LSL $56, R6 | |||
ORR R5>>8, R6 | |||
LSL $56, R5 | |||
ORR R4>>8, R5 | |||
LSL $56, R4 | |||
ADDS R4, R11 // x3 | |||
ADCS R5, R12 // x4 | |||
ADCS R6, R13 | |||
ADCS R7, R14 | |||
ADCS R8, R15 | |||
LDP 64(R0), (R16, R17) | |||
LDP 80(R0), (R19, R20) | |||
MUL R3, R25, R4 // x2 * p503p1s8[0] | |||
UMULH R3, R25, R7 | |||
ADCS R9, R16 | |||
ADCS R10, R17 | |||
ADCS ZR, R19 | |||
ADCS ZR, R20 | |||
LDP 96(R0), (R21, R22) | |||
LDP 112(R0), (R23, R24) | |||
MUL R3, R26, R5 // x2 * p503p1s8[1] | |||
UMULH R3, R26, R6 | |||
ADCS ZR, R21 | |||
ADCS ZR, R22 | |||
ADCS ZR, R23 | |||
ADC ZR, R24 | |||
// [x2,x3] * p503p1s8 to R4-R9 | |||
// All of x is in registers by now, so R0-R2 are free to act as scratch | |||
mul128x256comba(R3, R11, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2) | |||
// Left-shift result in R4-R9 by 56 to R4-R10 | |||
ORR R9>>8, ZR, R10 | |||
LSL $56, R9 | |||
ORR R8>>8, R9 | |||
LSL $56, R8 | |||
ORR R7>>8, R8 | |||
LSL $56, R7 | |||
ORR R6>>8, R7 | |||
LSL $56, R6 | |||
ORR R5>>8, R6 | |||
LSL $56, R5 | |||
ORR R4>>8, R5 | |||
LSL $56, R4 | |||
ADDS R4, R13 // x5 | |||
ADCS R5, R14 // x6 | |||
ADCS R6, R15 | |||
ADCS R7, R16 | |||
MUL R12, R25, R4 // x4 * p503p1s8[0] | |||
UMULH R12, R25, R7 | |||
ADCS R8, R17 | |||
ADCS R9, R19 | |||
ADCS R10, R20 | |||
ADCS ZR, R21 | |||
MUL R12, R26, R5 // x4 * p503p1s8[1] | |||
UMULH R12, R26, R6 | |||
ADCS ZR, R22 | |||
ADCS ZR, R23 | |||
ADC ZR, R24 | |||
// [x4,x5] * p503p1s8 to R4-R9 | |||
mul128x256comba(R12, R13, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2) | |||
// Left-shift result in R4-R9 by 56 to R4-R10 | |||
ORR R9>>8, ZR, R10 | |||
LSL $56, R9 | |||
ORR R8>>8, R9 | |||
LSL $56, R8 | |||
ORR R7>>8, R8 | |||
LSL $56, R7 | |||
ORR R6>>8, R7 | |||
LSL $56, R6 | |||
ORR R5>>8, R6 | |||
LSL $56, R5 | |||
ORR R4>>8, R5 | |||
LSL $56, R4 | |||
ADDS R4, R15 // x7 | |||
ADCS R5, R16 // x8 | |||
ADCS R6, R17 | |||
ADCS R7, R19 | |||
MUL R14, R25, R4 // x6 * p503p1s8[0] | |||
UMULH R14, R25, R7 | |||
ADCS R8, R20 | |||
ADCS R9, R21 | |||
ADCS R10, R22 | |||
MUL R14, R26, R5 // x6 * p503p1s8[1] | |||
UMULH R14, R26, R6 | |||
ADCS ZR, R23 | |||
ADC ZR, R24 | |||
// [x6,x7] * p503p1s8 to R4-R9 | |||
mul128x256comba(R14, R15, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2) | |||
// Left-shift result in R4-R9 by 56 to R4-R10 | |||
ORR R9>>8, ZR, R10 | |||
LSL $56, R9 | |||
ORR R8>>8, R9 | |||
LSL $56, R8 | |||
ORR R7>>8, R8 | |||
LSL $56, R7 | |||
ORR R6>>8, R7 | |||
LSL $56, R6 | |||
ORR R5>>8, R6 | |||
LSL $56, R5 | |||
ORR R4>>8, R5 | |||
LSL $56, R4 | |||
MOVD z+0(FP), R0 | |||
ADDS R4, R17 | |||
ADCS R5, R19 | |||
STP (R16, R17), 0(R0) // Store final result to z | |||
ADCS R6, R20 | |||
ADCS R7, R21 | |||
STP (R19, R20), 16(R0) | |||
ADCS R8, R22 | |||
ADCS R9, R23 | |||
STP (R21, R22), 32(R0) | |||
ADC R10, R24 | |||
STP (R23, R24), 48(R0) | |||
RET | |||
// fp503StrongReduce(x) | |||
// Subtracts p503 from the eight-limb value at x in place and adds p503 back | |||
// (through a mask, without branching) when the subtraction borrowed. | |||
TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8 | |||
MOVD x+0(FP), R0 | |||
// Keep x in R1-R8, p503 in R9-R14, subtract to R1-R8 | |||
// R9 (limb 2) is reused for limbs 0 and 1; presumably the three lowest | |||
// limbs of p503 are equal - confirm against the definition of p503. | |||
LDP ·p503+16(SB), (R9, R10) | |||
LDP 0(R0), (R1, R2) | |||
LDP 16(R0), (R3, R4) | |||
SUBS R9, R1 | |||
SBCS R9, R2 | |||
LDP 32(R0), (R5, R6) | |||
LDP ·p503+32(SB), (R11, R12) | |||
SBCS R9, R3 | |||
SBCS R10, R4 | |||
LDP 48(R0), (R7, R8) | |||
LDP ·p503+48(SB), (R13, R14) | |||
SBCS R11, R5 | |||
SBCS R12, R6 | |||
SBCS R13, R7 | |||
SBCS R14, R8 | |||
// R15 becomes an all-ones mask if the subtraction borrowed, zero otherwise | |||
SBC ZR, ZR, R15 | |||
// Mask with the borrow and add p503 | |||
AND R15, R9 | |||
AND R15, R10 | |||
AND R15, R11 | |||
AND R15, R12 | |||
AND R15, R13 | |||
AND R15, R14 | |||
ADDS R9, R1 | |||
ADCS R9, R2 | |||
STP (R1, R2), 0(R0) | |||
ADCS R9, R3 | |||
ADCS R10, R4 | |||
STP (R3, R4), 16(R0) | |||
ADCS R11, R5 | |||
ADCS R12, R6 | |||
STP (R5, R6), 32(R0) | |||
ADCS R13, R7 | |||
ADCS R14, R8 | |||
STP (R7, R8), 48(R0) | |||
RET |
@@ -1,12 +1,9 @@ | |||
// +build amd64,!noasm | |||
// +build amd64,!noasm arm64,!noasm | |||
package p503 | |||
import ( | |||
. "github.com/cloudflare/sidh/internal/isogeny" | |||
// This is imported only because arith_amd64.s needs | |||
// some symbols from cpuid.go | |||
_ "github.com/cloudflare/sidh/internal/utils" | |||
) | |||
// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. | |||
@@ -1,4 +1,4 @@ | |||
// +build noasm !amd64 | |||
// +build noasm !amd64,!arm64 | |||
package p503 | |||
@@ -145,6 +145,12 @@ var p503R2 = FpElement{ | |||
0x9E51998BD84D4423, 0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771, | |||
} | |||
// p503p1s8 is (p503 + 1) left-shifted by 8 bits, stored as eight 64-bit limbs | |||
// with the least significant limb first. The four low limbs are zero. | |||
var p503p1s8 = FpElement{ | |||
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, | |||
0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13, 0x45C6BDDA77A4D01B, 0x4066F541811E1E60, | |||
} | |||
// 1*R mod p | |||
var P503_OneFp2 = Fp2Element{ | |||
A: FpElement{ | |||
@@ -0,0 +1,16 @@ | |||
// +build amd64,!noasm | |||
package p751 | |||
import cpu "github.com/cloudflare/sidh/internal/utils" | |||
// Signals support for MULX which is in BMI2 | |||
var HasBMI2 bool | |||
// Signals support for ADX and BMI2 | |||
var HasADXandBMI2 bool | |||
// init copies the CPU capability flags exposed by the cpu (internal/utils) | |||
// package into this package's own HasBMI2 and HasADXandBMI2 variables. | |||
// HasADXandBMI2 is set only when both BMI2 and ADX are reported. | |||
func init() { | |||
HasBMI2 = cpu.HasBMI2 | |||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||
} |
@@ -1740,9 +1740,9 @@ TEXT ·fp751MontgomeryReduce(SB), $0-16 | |||
MOVQ x+8(FP), REG_P1 | |||
// Check whether to use optimized implementation | |||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | |||
CMPB ·HasADXandBMI2(SB), $1 | |||
JE redc_with_mulx_adcx_adox | |||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | |||
CMPB ·HasBMI2(SB), $1 | |||
JE redc_with_mulx | |||
MOVQ (REG_P1), R11 | |||
@@ -22,22 +22,27 @@ const ( | |||
kUse_MULXandADxX = 1 << 2 | |||
) | |||
// resetCpuFeatures restores the package-level HasBMI2 and HasADXandBMI2 flags | |||
// from the values held by the cpu package. The tests in this file defer it | |||
// after overriding those flags to force a particular implementation. | |||
func resetCpuFeatures() { | |||
HasBMI2 = cpu.HasBMI2 | |||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||
} | |||
// Utility function used for testing REDC implementations. Tests caller provided | |||
// redcFunc against redc() | |||
func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||
doRedcTest := func(aRR FpElementX2) bool { | |||
defer cpu.RecognizeCpu() | |||
defer resetCpuFeatures() | |||
var resRedcF1, resRedcF2 FpElement | |||
var aRRcpy = aRR | |||
// Compute redc with first implementation | |||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||
HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||
HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||
fp751MontgomeryReduce(&resRedcF1, &aRR) | |||
// Compute redc with second implementation | |||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||
HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||
HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||
fp751MontgomeryReduce(&resRedcF2, &aRRcpy) | |||
// Compare results | |||
@@ -51,8 +56,8 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||
func TestRedcWithMULX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasBMI2 { | |||
t.Skip("MULX not supported by the platform") | |||
} | |||
testRedc(t, kUse_MULX, kUse_MUL) | |||
@@ -61,8 +66,8 @@ func TestRedcWithMULX(t *testing.T) { | |||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||
// and ADCX/ADOX. | |||
func TestRedcWithMULXADxX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasADXandBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasADXandBMI2 { | |||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | |||
} | |||
testRedc(t, kUse_MULXandADxX, kUse_MUL) | |||
@@ -71,8 +76,8 @@ func TestRedcWithMULXADxX(t *testing.T) { | |||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||
// and ADCX/ADOX. | |||
func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | |||
defer cpu.RecognizeCpu() | |||
if !cpu.HasADXandBMI2 { | |||
defer resetCpuFeatures() | |||
if !HasADXandBMI2 { | |||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | |||
} | |||
testRedc(t, kUse_MULXandADxX, kUse_MULX) | |||
@@ -1,4 +1,4 @@ | |||
// +build amd64,!noasm | |||
// +build amd64,!noasm arm64,!noasm | |||
package p751 | |||
@@ -39,9 +39,7 @@ func fp751X2SubLazy(z, x, y *FpElementX2) | |||
//go:noescape | |||
func fp751Mul(z *FpElementX2, x, y *FpElement) | |||
// Function pointer that should point to one of the | |||
// fp751MontgomeryReduce implementations below. | |||
// When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p). | |||
// Compute Montgomery reduction: set z = x * R^{-1} (mod 2*p). | |||
// It may destroy the input value. | |||
//go:noescape | |||
func fp751MontgomeryReduce(z *FpElement, x *FpElementX2) | |||
@@ -1,4 +1,4 @@ | |||
// +build noasm !amd64 | |||
// +build noasm !amd64,!arm64 | |||
package p751 | |||