Author | SHA1 | Message | Date |
---|---|---|---|
Henry Case | a6e152eaec |
fix: fixes vendoring issue
Reported: https://github.com/cloudflare/sidh/issues/21 See also: https://github.com/golang/go/issues/28230 |
6 years ago |
Henry Case | 137b47345f | makefile: when vendoring consider only .s and .go files | 6 years ago |
Henry Case | 7b83e32ecd | vendor: when vendoring also U+00B7 needs to be changed to _ | 6 years ago |
Henry Case | 5af598dcb3 | sidh: improve comment for ARM's redc implementation | 6 years ago |
Ko- | 67c90d155e |
Add field arithmetic in assembly for arm64 for p503 and p751 (#10)
sidh: ARM64 implementation * p503 and p751 implementation with two level Karatsuba Benchmarks: benchmark old ns/op new ns/op delta BenchmarkFp2ElementMul 5627 387 -93.12% BenchmarkFp2ElementInv 1272612 80818 -93.65% BenchmarkFp2ElementSquare 4168 314 -92.47% BenchmarkFp2ElementAdd 141 46.2 -67.23% BenchmarkFp2ElementSub 107 40.9 -61.78% BenchmarkPrimeFieldElementMul 2081 125 -93.99% BenchmarkFp503Multiply 1218 63.0 -94.83% BenchmarkFp503MontgomeryReduce 856 51.6 -93.97% BenchmarkFp503AddReduced 59.1 14.6 -75.30% BenchmarkFp503SubReduced 42.9 10.6 -75.29% BenchmarkFp503ConditionalSwap 37.3 18.1 -51.47% BenchmarkFp503StrongReduce 44.2 9.66 -78.14% BenchmarkFp503AddLazy 20.8 8.40 -59.62% BenchmarkFp503X2AddLazy 36.5 12.5 -65.75% BenchmarkFp503X2SubLazy 67.5 13.8 -79.56% BenchmarkThreePointLadder255BitScalar 16388474 1443781 -91.19% BenchmarkFp2ElementMul 12046 698 -94.21% BenchmarkFp2ElementInv 3927356 220699 -94.38% BenchmarkFp2ElementSquare 8992 559 -93.78% BenchmarkFp2ElementAdd 193 63.2 -67.25% BenchmarkFp2ElementSub 143 49.1 -65.66% BenchmarkPrimeFieldElementMul 4476 236 -94.73% BenchmarkFp751Multiply 2740 125 -95.44% BenchmarkFp751MontgomeryReduce 1738 101 -94.19% BenchmarkFp751AddReduced 85.2 19.9 -76.64% BenchmarkFp751SubReduced 62.0 17.2 -72.26% BenchmarkFp751ConditionalSwap 52.6 22.2 -57.79% BenchmarkFp751StrongReduce 61.9 13.3 -78.51% BenchmarkFp751AddLazy 28.7 10.9 -62.02% BenchmarkFp751X2AddLazy 52.2 15.8 -69.73% BenchmarkFp751X2SubLazy 97.6 20.6 -78.89% BenchmarkThreePointLadder379BitScalar 52570696 3571511 -93.21% BenchmarkR2L379BitScalar 52421015 3571497 -93.19% BenchmarkSidhKeyAgreementP751 482289441 32306099 -93.30% BenchmarkSidhKeyAgreementP503 144369772 12319167 -91.47% BenchmarkAliceKeyGenPrvP751 1428 1419 -0.63% BenchmarkAliceKeyGenPrvP503 1214 1200 -1.15% BenchmarkBobKeyGenPrvP751 1433 1428 -0.35% BenchmarkBobKeyGenPrvP503 1215 1204 -0.91% BenchmarkAliceKeyGenPubP751 270559528 17921608 -93.38% BenchmarkAliceKeyGenPubP503 82615895 6880817 -91.67% BenchmarkBobKeyGenPubP751 303556706 20901082 -93.11% BenchmarkBobKeyGenPubP503 91116505 8152519 -91.05% BenchmarkSharedSecretAliceP751 223935926 14567056 -93.49% BenchmarkSharedSecretAliceP503 67340445 5518566 -91.80% BenchmarkSharedSecretBobP751 259148479 17659648 -93.19% BenchmarkSharedSecretBobP503 76827887 6766748 -91.19% |
6 years ago |
@@ -43,11 +43,11 @@ copy-target-%: | |||||
prep_targets: build_env $(addprefix copy-target-, $(TARGETS)) | prep_targets: build_env $(addprefix copy-target-, $(TARGETS)) | ||||
install-%: prep_targets | install-%: prep_targets | ||||
GOPATH=$(GOPATH_LOCAL) $(GO) install $(OPTS) $(GOPATH_DIR)/$* | |||||
GOPATH=$(GOPATH_LOCAL) GOARCH=$(GOARCH) $(GO) install $(OPTS) $(GOPATH_DIR)/$* | |||||
test-%: prep_targets | test-%: prep_targets | ||||
GOPATH=$(GOPATH_LOCAL) $(GO) vet $(GOPATH_DIR)/$* | GOPATH=$(GOPATH_LOCAL) $(GO) vet $(GOPATH_DIR)/$* | ||||
GOPATH=$(GOPATH_LOCAL) $(GO) test $(OPTS) $(GOPATH_DIR)/$* | |||||
GOPATH=$(GOPATH_LOCAL) GOARCH=$(GOARCH) $(GO) test $(OPTS) $(GOPATH_DIR)/$* | |||||
bench-%: prep_targets | bench-%: prep_targets | ||||
GOMAXPROCS=1 GOPATH=$(GOPATH_LOCAL) $(GO) test $(OPTS) $(GOPATH_DIR)/$* $(BENCH_OPTS) | GOMAXPROCS=1 GOPATH=$(GOPATH_LOCAL) $(GO) test $(OPTS) $(GOPATH_DIR)/$* $(BENCH_OPTS) | ||||
@@ -67,8 +67,9 @@ vendor: clean | |||||
--exclude=README.md \ | --exclude=README.md \ | ||||
--exclude=Makefile \ | --exclude=Makefile \ | ||||
--exclude=build | --exclude=build | ||||
find $(VENDOR_DIR) -type f -print0 -name "*.go" | xargs -0 sed -i 's/github\.com/github_com/g' | |||||
# This swaps all imports with github.com to github_com, so that standard library doesn't | |||||
# try to access external libraries. | |||||
find $(VENDOR_DIR) -type f -iname "*.go" -print0 | xargs -0 sed -i 's/github\.com/github_com/g' | |||||
bench: $(addprefix bench-, $(TARGETS)) | bench: $(addprefix bench-, $(TARGETS)) | ||||
cover: $(addprefix cover-, $(TARGETS)) | cover: $(addprefix cover-, $(TARGETS)) | ||||
@@ -1,4 +1,4 @@ | |||||
// +build amd64, !noasm | |||||
// +build amd64,!noasm | |||||
// Sets capabilities flags for x86 according to information received from | // Sets capabilities flags for x86 according to information received from | ||||
// CPUID. It was written in accordance with | // CPUID. It was written in accordance with | ||||
@@ -10,8 +10,8 @@ package utils | |||||
// Signals support for MULX which is in BMI2 | // Signals support for MULX which is in BMI2 | ||||
var HasBMI2 bool | var HasBMI2 bool | ||||
// Signals support for ADX and BMI2 | |||||
var HasADXandBMI2 bool | |||||
// Signals support for ADX | |||||
var HasADX bool | |||||
// Performs CPUID and returns values of registers | // Performs CPUID and returns values of registers | ||||
// go:nosplit | // go:nosplit | ||||
@@ -31,7 +31,7 @@ func RecognizeCpu() { | |||||
_, ebx, _, _ := cpuid(7, 0) | _, ebx, _, _ := cpuid(7, 0) | ||||
HasBMI2 = bitn(ebx, 19) | HasBMI2 = bitn(ebx, 19) | ||||
HasADXandBMI2 = bitn(ebx, 7) && HasBMI2 | |||||
HasADX = bitn(ebx, 7) | |||||
} | } | ||||
func init() { | func init() { | ||||
@@ -0,0 +1,16 @@ | |||||
// +build amd64 | |||||
package p503 | |||||
import cpu "github.com/cloudflare/sidh/internal/utils" | |||||
// Signals support for MULX which is in BMI2 | |||||
var HasBMI2 bool | |||||
// Signals support for ADX and BMI2 | |||||
var HasADXandBMI2 bool | |||||
func init() { | |||||
HasBMI2 = cpu.HasBMI2 | |||||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||||
} |
@@ -697,9 +697,9 @@ TEXT ·fp503Mul(SB), NOSPLIT, $104-24 | |||||
MOVQ y+16(FP), REG_P2 | MOVQ y+16(FP), REG_P2 | ||||
// Check wether to use optimized implementation | // Check wether to use optimized implementation | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | |||||
CMPB ·HasADXandBMI2(SB), $1 | |||||
JE mul_with_mulx_adcx_adox | JE mul_with_mulx_adcx_adox | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | |||||
CMPB ·HasBMI2(SB), $1 | |||||
JE mul_with_mulx | JE mul_with_mulx | ||||
// Generic x86 implementation (below) uses variant of Karatsuba method. | // Generic x86 implementation (below) uses variant of Karatsuba method. | ||||
@@ -1194,9 +1194,9 @@ TEXT ·fp503MontgomeryReduce(SB), $0-16 | |||||
MOVQ x+8(FP), REG_P1 | MOVQ x+8(FP), REG_P1 | ||||
// Check wether to use optimized implementation | // Check wether to use optimized implementation | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | |||||
CMPB ·HasADXandBMI2(SB), $1 | |||||
JE redc_with_mulx_adcx_adox | JE redc_with_mulx_adcx_adox | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | |||||
CMPB ·HasBMI2(SB), $1 | |||||
JE redc_with_mulx | JE redc_with_mulx | ||||
MOVQ (REG_P1), R11 | MOVQ (REG_P1), R11 | ||||
@@ -21,21 +21,26 @@ const ( | |||||
kUse_MULXandADxX = 1 << 2 | kUse_MULXandADxX = 1 << 2 | ||||
) | ) | ||||
func resetCpuFeatures() { | |||||
HasBMI2 = cpu.HasBMI2 | |||||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||||
} | |||||
// Utility function used for testing Mul implementations. Tests caller provided | // Utility function used for testing Mul implementations. Tests caller provided | ||||
// mulFunc against mul() | // mulFunc against mul() | ||||
func testMul(t *testing.T, f1, f2 OptimFlag) { | func testMul(t *testing.T, f1, f2 OptimFlag) { | ||||
doMulTest := func(multiplier, multiplicant FpElement) bool { | doMulTest := func(multiplier, multiplicant FpElement) bool { | ||||
defer cpu.RecognizeCpu() | |||||
defer resetCpuFeatures() | |||||
var resMulRef, resMulOptim FpElementX2 | var resMulRef, resMulOptim FpElementX2 | ||||
// Compute multiplier*multiplicant with first implementation | // Compute multiplier*multiplicant with first implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||||
HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
fp503Mul(&resMulOptim, &multiplier, &multiplicant) | fp503Mul(&resMulOptim, &multiplier, &multiplicant) | ||||
// Compute multiplier*multiplicant with second implementation | // Compute multiplier*multiplicant with second implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||||
HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
fp503Mul(&resMulRef, &multiplier, &multiplicant) | fp503Mul(&resMulRef, &multiplier, &multiplicant) | ||||
// Compare results | // Compare results | ||||
@@ -51,18 +56,18 @@ func testMul(t *testing.T, f1, f2 OptimFlag) { | |||||
// redcFunc against redc() | // redcFunc against redc() | ||||
func testRedc(t *testing.T, f1, f2 OptimFlag) { | func testRedc(t *testing.T, f1, f2 OptimFlag) { | ||||
doRedcTest := func(aRR FpElementX2) bool { | doRedcTest := func(aRR FpElementX2) bool { | ||||
defer cpu.RecognizeCpu() | |||||
defer resetCpuFeatures() | |||||
var resRedcF1, resRedcF2 FpElement | var resRedcF1, resRedcF2 FpElement | ||||
var aRRcpy = aRR | var aRRcpy = aRR | ||||
// Compute redc with first implementation | // Compute redc with first implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||||
HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
fp503MontgomeryReduce(&resRedcF1, &aRR) | fp503MontgomeryReduce(&resRedcF1, &aRR) | ||||
// Compute redc with second implementation | // Compute redc with second implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||||
HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
fp503MontgomeryReduce(&resRedcF2, &aRRcpy) | fp503MontgomeryReduce(&resRedcF2, &aRRcpy) | ||||
// Compare results | // Compare results | ||||
@@ -76,8 +81,8 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||||
// Ensures correctness of implementation of mul operation which uses MULX | // Ensures correctness of implementation of mul operation which uses MULX | ||||
func TestMulWithMULX(t *testing.T) { | func TestMulWithMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasBMI2 { | |||||
t.Skip("MULX not supported by the platform") | t.Skip("MULX not supported by the platform") | ||||
} | } | ||||
testMul(t, kUse_MULX, kUse_MUL) | testMul(t, kUse_MULX, kUse_MUL) | ||||
@@ -85,8 +90,8 @@ func TestMulWithMULX(t *testing.T) { | |||||
// Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | // Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | ||||
func TestMulWithMULXADxX(t *testing.T) { | func TestMulWithMULXADxX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasADXandBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasADXandBMI2 { | |||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testMul(t, kUse_MULXandADxX, kUse_MUL) | testMul(t, kUse_MULXandADxX, kUse_MUL) | ||||
@@ -94,8 +99,8 @@ func TestMulWithMULXADxX(t *testing.T) { | |||||
// Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | // Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | ||||
func TestMulWithMULXADxXAgainstMULX(t *testing.T) { | func TestMulWithMULXADxXAgainstMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasADXandBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasADXandBMI2 { | |||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testMul(t, kUse_MULX, kUse_MULXandADxX) | testMul(t, kUse_MULX, kUse_MULXandADxX) | ||||
@@ -103,8 +108,8 @@ func TestMulWithMULXADxXAgainstMULX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | // Ensures correctness of Montgomery reduction implementation which uses MULX | ||||
func TestRedcWithMULX(t *testing.T) { | func TestRedcWithMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasBMI2 { | |||||
t.Skip("MULX not supported by the platform") | t.Skip("MULX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_MULX, kUse_MUL) | testRedc(t, kUse_MULX, kUse_MUL) | ||||
@@ -113,8 +118,8 @@ func TestRedcWithMULX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | // Ensures correctness of Montgomery reduction implementation which uses MULX | ||||
// and ADCX/ADOX. | // and ADCX/ADOX. | ||||
func TestRedcWithMULXADxX(t *testing.T) { | func TestRedcWithMULXADxX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasADXandBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasADXandBMI2 { | |||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_MULXandADxX, kUse_MUL) | testRedc(t, kUse_MULXandADxX, kUse_MUL) | ||||
@@ -123,8 +128,8 @@ func TestRedcWithMULXADxX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | // Ensures correctness of Montgomery reduction implementation which uses MULX | ||||
// and ADCX/ADOX. | // and ADCX/ADOX. | ||||
func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasADXandBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasADXandBMI2 { | |||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_MULXandADxX, kUse_MULX) | testRedc(t, kUse_MULXandADxX, kUse_MULX) | ||||
@@ -0,0 +1,802 @@ | |||||
// +build arm64,!noasm | |||||
#include "textflag.h" | |||||
TEXT ·fp503ConditionalSwap(SB), NOSPLIT, $0-17 | |||||
MOVD x+0(FP), R0 | |||||
MOVD y+8(FP), R1 | |||||
MOVB choice+16(FP), R2 | |||||
// Set flags | |||||
// If choice is not 0 or 1, this implementation will swap completely | |||||
CMP $0, R2 | |||||
LDP 0(R0), (R3, R4) | |||||
LDP 0(R1), (R5, R6) | |||||
CSEL EQ, R3, R5, R7 | |||||
CSEL EQ, R4, R6, R8 | |||||
STP (R7, R8), 0(R0) | |||||
CSEL NE, R3, R5, R9 | |||||
CSEL NE, R4, R6, R10 | |||||
STP (R9, R10), 0(R1) | |||||
LDP 16(R0), (R3, R4) | |||||
LDP 16(R1), (R5, R6) | |||||
CSEL EQ, R3, R5, R7 | |||||
CSEL EQ, R4, R6, R8 | |||||
STP (R7, R8), 16(R0) | |||||
CSEL NE, R3, R5, R9 | |||||
CSEL NE, R4, R6, R10 | |||||
STP (R9, R10), 16(R1) | |||||
LDP 32(R0), (R3, R4) | |||||
LDP 32(R1), (R5, R6) | |||||
CSEL EQ, R3, R5, R7 | |||||
CSEL EQ, R4, R6, R8 | |||||
STP (R7, R8), 32(R0) | |||||
CSEL NE, R3, R5, R9 | |||||
CSEL NE, R4, R6, R10 | |||||
STP (R9, R10), 32(R1) | |||||
LDP 48(R0), (R3, R4) | |||||
LDP 48(R1), (R5, R6) | |||||
CSEL EQ, R3, R5, R7 | |||||
CSEL EQ, R4, R6, R8 | |||||
STP (R7, R8), 48(R0) | |||||
CSEL NE, R3, R5, R9 | |||||
CSEL NE, R4, R6, R10 | |||||
STP (R9, R10), 48(R1) | |||||
RET | |||||
TEXT ·fp503AddReduced(SB), NOSPLIT, $0-24 | |||||
MOVD z+0(FP), R2 | |||||
MOVD x+8(FP), R0 | |||||
MOVD y+16(FP), R1 | |||||
// Load first summand into R3-R10 | |||||
// Add first summand and second summand and store result in R3-R10 | |||||
LDP 0(R0), (R3, R4) | |||||
LDP 0(R1), (R11, R12) | |||||
LDP 16(R0), (R5, R6) | |||||
LDP 16(R1), (R13, R14) | |||||
ADDS R11, R3 | |||||
ADCS R12, R4 | |||||
ADCS R13, R5 | |||||
ADCS R14, R6 | |||||
LDP 32(R0), (R7, R8) | |||||
LDP 32(R1), (R11, R12) | |||||
LDP 48(R0), (R9, R10) | |||||
LDP 48(R1), (R13, R14) | |||||
ADCS R11, R7 | |||||
ADCS R12, R8 | |||||
ADCS R13, R9 | |||||
ADC R14, R10 | |||||
// Subtract 2 * p503 in R11-R17 from the result in R3-R10 | |||||
LDP ·p503x2+0(SB), (R11, R12) | |||||
LDP ·p503x2+24(SB), (R13, R14) | |||||
SUBS R11, R3 | |||||
SBCS R12, R4 | |||||
LDP ·p503x2+40(SB), (R15, R16) | |||||
SBCS R12, R5 | |||||
SBCS R13, R6 | |||||
MOVD ·p503x2+56(SB), R17 | |||||
SBCS R14, R7 | |||||
SBCS R15, R8 | |||||
SBCS R16, R9 | |||||
SBCS R17, R10 | |||||
SBC ZR, ZR, R19 | |||||
// If x + y - 2 * p503 < 0, R19 is 1 and 2 * p503 should be added | |||||
AND R19, R11 | |||||
AND R19, R12 | |||||
AND R19, R13 | |||||
AND R19, R14 | |||||
AND R19, R15 | |||||
AND R19, R16 | |||||
AND R19, R17 | |||||
ADDS R11, R3 | |||||
ADCS R12, R4 | |||||
STP (R3, R4), 0(R2) | |||||
ADCS R12, R5 | |||||
ADCS R13, R6 | |||||
STP (R5, R6), 16(R2) | |||||
ADCS R14, R7 | |||||
ADCS R15, R8 | |||||
STP (R7, R8), 32(R2) | |||||
ADCS R16, R9 | |||||
ADC R17, R10 | |||||
STP (R9, R10), 48(R2) | |||||
RET | |||||
TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24 | |||||
MOVD z+0(FP), R2 | |||||
MOVD x+8(FP), R0 | |||||
MOVD y+16(FP), R1 | |||||
// Load x into R3-R10 | |||||
// Subtract y from x and store result in R3-R10 | |||||
LDP 0(R0), (R3, R4) | |||||
LDP 0(R1), (R11, R12) | |||||
LDP 16(R0), (R5, R6) | |||||
LDP 16(R1), (R13, R14) | |||||
SUBS R11, R3 | |||||
SBCS R12, R4 | |||||
SBCS R13, R5 | |||||
SBCS R14, R6 | |||||
LDP 32(R0), (R7, R8) | |||||
LDP 32(R1), (R11, R12) | |||||
LDP 48(R0), (R9, R10) | |||||
LDP 48(R1), (R13, R14) | |||||
SBCS R11, R7 | |||||
SBCS R12, R8 | |||||
SBCS R13, R9 | |||||
SBCS R14, R10 | |||||
SBC ZR, ZR, R19 | |||||
// If x - y < 0, R19 is 1 and 2 * p503 should be added | |||||
LDP ·p503x2+0(SB), (R11, R12) | |||||
LDP ·p503x2+24(SB), (R13, R14) | |||||
AND R19, R11 | |||||
AND R19, R12 | |||||
LDP ·p503x2+40(SB), (R15, R16) | |||||
AND R19, R13 | |||||
AND R19, R14 | |||||
MOVD ·p503x2+56(SB), R17 | |||||
AND R19, R15 | |||||
AND R19, R16 | |||||
AND R19, R17 | |||||
ADDS R11, R3 | |||||
ADCS R12, R4 | |||||
STP (R3, R4), 0(R2) | |||||
ADCS R12, R5 | |||||
ADCS R13, R6 | |||||
STP (R5, R6), 16(R2) | |||||
ADCS R14, R7 | |||||
ADCS R15, R8 | |||||
STP (R7, R8), 32(R2) | |||||
ADCS R16, R9 | |||||
ADC R17, R10 | |||||
STP (R9, R10), 48(R2) | |||||
RET | |||||
TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24 | |||||
MOVD z+0(FP), R2 | |||||
MOVD x+8(FP), R0 | |||||
MOVD y+16(FP), R1 | |||||
// Load first summand into R3-R10 | |||||
// Add first summand and second summand and store result in R3-R10 | |||||
LDP 0(R0), (R3, R4) | |||||
LDP 0(R1), (R11, R12) | |||||
LDP 16(R0), (R5, R6) | |||||
LDP 16(R1), (R13, R14) | |||||
ADDS R11, R3 | |||||
ADCS R12, R4 | |||||
STP (R3, R4), 0(R2) | |||||
ADCS R13, R5 | |||||
ADCS R14, R6 | |||||
STP (R5, R6), 16(R2) | |||||
LDP 32(R0), (R7, R8) | |||||
LDP 32(R1), (R11, R12) | |||||
LDP 48(R0), (R9, R10) | |||||
LDP 48(R1), (R13, R14) | |||||
ADCS R11, R7 | |||||
ADCS R12, R8 | |||||
STP (R7, R8), 32(R2) | |||||
ADCS R13, R9 | |||||
ADC R14, R10 | |||||
STP (R9, R10), 48(R2) | |||||
RET | |||||
TEXT ·fp503X2AddLazy(SB), NOSPLIT, $0-24 | |||||
MOVD z+0(FP), R2 | |||||
MOVD x+8(FP), R0 | |||||
MOVD y+16(FP), R1 | |||||
LDP 0(R0), (R3, R4) | |||||
LDP 0(R1), (R11, R12) | |||||
LDP 16(R0), (R5, R6) | |||||
LDP 16(R1), (R13, R14) | |||||
ADDS R11, R3 | |||||
ADCS R12, R4 | |||||
STP (R3, R4), 0(R2) | |||||
ADCS R13, R5 | |||||
ADCS R14, R6 | |||||
STP (R5, R6), 16(R2) | |||||
LDP 32(R0), (R7, R8) | |||||
LDP 32(R1), (R11, R12) | |||||
LDP 48(R0), (R9, R10) | |||||
LDP 48(R1), (R13, R14) | |||||
ADCS R11, R7 | |||||
ADCS R12, R8 | |||||
STP (R7, R8), 32(R2) | |||||
ADCS R13, R9 | |||||
ADCS R14, R10 | |||||
STP (R9, R10), 48(R2) | |||||
LDP 64(R0), (R3, R4) | |||||
LDP 64(R1), (R11, R12) | |||||
LDP 80(R0), (R5, R6) | |||||
LDP 80(R1), (R13, R14) | |||||
ADCS R11, R3 | |||||
ADCS R12, R4 | |||||
STP (R3, R4), 64(R2) | |||||
ADCS R13, R5 | |||||
ADCS R14, R6 | |||||
STP (R5, R6), 80(R2) | |||||
LDP 96(R0), (R7, R8) | |||||
LDP 96(R1), (R11, R12) | |||||
LDP 112(R0), (R9, R10) | |||||
LDP 112(R1), (R13, R14) | |||||
ADCS R11, R7 | |||||
ADCS R12, R8 | |||||
STP (R7, R8), 96(R2) | |||||
ADCS R13, R9 | |||||
ADC R14, R10 | |||||
STP (R9, R10), 112(R2) | |||||
RET | |||||
TEXT ·fp503X2SubLazy(SB), NOSPLIT, $0-24 | |||||
MOVD z+0(FP), R2 | |||||
MOVD x+8(FP), R0 | |||||
MOVD y+16(FP), R1 | |||||
LDP 0(R0), (R3, R4) | |||||
LDP 0(R1), (R11, R12) | |||||
LDP 16(R0), (R5, R6) | |||||
LDP 16(R1), (R13, R14) | |||||
SUBS R11, R3 | |||||
SBCS R12, R4 | |||||
STP (R3, R4), 0(R2) | |||||
SBCS R13, R5 | |||||
SBCS R14, R6 | |||||
STP (R5, R6), 16(R2) | |||||
LDP 32(R0), (R7, R8) | |||||
LDP 32(R1), (R11, R12) | |||||
LDP 48(R0), (R9, R10) | |||||
LDP 48(R1), (R13, R14) | |||||
SBCS R11, R7 | |||||
SBCS R12, R8 | |||||
STP (R7, R8), 32(R2) | |||||
SBCS R13, R9 | |||||
SBCS R14, R10 | |||||
STP (R9, R10), 48(R2) | |||||
LDP 64(R0), (R3, R4) | |||||
LDP 64(R1), (R11, R12) | |||||
LDP 80(R0), (R5, R6) | |||||
LDP 80(R1), (R13, R14) | |||||
SBCS R11, R3 | |||||
SBCS R12, R4 | |||||
SBCS R13, R5 | |||||
SBCS R14, R6 | |||||
LDP 96(R0), (R7, R8) | |||||
LDP 96(R1), (R11, R12) | |||||
LDP 112(R0), (R9, R10) | |||||
LDP 112(R1), (R13, R14) | |||||
SBCS R11, R7 | |||||
SBCS R12, R8 | |||||
SBCS R13, R9 | |||||
SBCS R14, R10 | |||||
SBC ZR, ZR, R15 | |||||
// If x - y < 0, R15 is 1 and p503 should be added | |||||
LDP ·p503+16(SB), (R16, R17) | |||||
LDP ·p503+32(SB), (R19, R20) | |||||
AND R15, R16 | |||||
AND R15, R17 | |||||
LDP ·p503+48(SB), (R21, R22) | |||||
AND R15, R19 | |||||
AND R15, R20 | |||||
AND R15, R21 | |||||
AND R15, R22 | |||||
ADDS R16, R3 | |||||
ADCS R16, R4 | |||||
STP (R3, R4), 64(R2) | |||||
ADCS R16, R5 | |||||
ADCS R17, R6 | |||||
STP (R5, R6), 80(R2) | |||||
ADCS R19, R7 | |||||
ADCS R20, R8 | |||||
STP (R7, R8), 96(R2) | |||||
ADCS R21, R9 | |||||
ADC R22, R10 | |||||
STP (R9, R10), 112(R2) | |||||
RET | |||||
// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high) | |||||
// Z0 is not actually touched | |||||
// Result of (X0-X1) * (Y0-Y1) will be in Z0-Z3 | |||||
// Inputs get overwritten, except for X1 | |||||
#define mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, Z2, Z3, T0) \ | |||||
MUL X1, Y0, X0 \ | |||||
UMULH X1, Y0, Y0 \ | |||||
ADDS Z3, Z1 \ | |||||
ADC ZR, Z2 \ | |||||
\ | |||||
MUL Y1, X1, T0 \ | |||||
UMULH Y1, X1, Y1 \ | |||||
ADDS X0, Z1 \ | |||||
ADCS Y0, Z2 \ | |||||
ADC ZR, ZR, Z3 \ | |||||
\ | |||||
ADDS T0, Z2 \ | |||||
ADC Y1, Z3 | |||||
// Expects that X points to (X0-X1) | |||||
// Result of (X0-X3) * (Y0-Y3) will be in Z0-Z7 | |||||
// Inputs get overwritten, except X2-X3 and Y2-Y3 | |||||
#define mul256x256karatsuba(X, X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1)\ | |||||
ADDS X2, X0 \ // xH + xL, destroys xL | |||||
ADCS X3, X1 \ | |||||
ADCS ZR, ZR, T0 \ | |||||
\ | |||||
ADDS Y2, Y0, Z6 \ // yH + yL | |||||
ADCS Y3, Y1, T1 \ | |||||
ADC ZR, ZR, Z7 \ | |||||
\ | |||||
SUB T0, ZR, Z2 \ | |||||
SUB Z7, ZR, Z3 \ | |||||
AND Z7, T0 \ // combined carry | |||||
\ | |||||
AND Z2, Z6, Z0 \ // masked(yH + yL) | |||||
AND Z2, T1, Z1 \ | |||||
\ | |||||
AND Z3, X0, Z4 \ // masked(xH + xL) | |||||
AND Z3, X1, Z5 \ | |||||
\ | |||||
MUL Z6, X0, Z2 \ | |||||
MUL T1, X0, Z3 \ | |||||
\ | |||||
ADDS Z4, Z0 \ | |||||
UMULH T1, X0, Z4 \ | |||||
ADCS Z5, Z1 \ | |||||
UMULH Z6, X0, Z5 \ | |||||
ADC ZR, T0 \ | |||||
\ // (xH + xL) * (yH + yL) | |||||
mul128x128comba(X0, X1, Z6, T1, Z2, Z3, Z4, Z5, Z7)\ | |||||
\ | |||||
LDP 0+X, (X0, X1) \ | |||||
\ | |||||
ADDS Z0, Z4 \ | |||||
UMULH Y0, X0, Z7 \ | |||||
UMULH Y1, X0, T1 \ | |||||
ADCS Z1, Z5 \ | |||||
MUL Y0, X0, Z0 \ | |||||
MUL Y1, X0, Z1 \ | |||||
ADC ZR, T0 \ | |||||
\ // xL * yL | |||||
mul128x128comba(X0, X1, Y0, Y1, Z0, Z1, T1, Z7, Z6)\ | |||||
\ | |||||
MUL Y2, X2, X0 \ | |||||
UMULH Y2, X2, Y0 \ | |||||
SUBS Z0, Z2 \ // (xH + xL) * (yH + yL) - xL * yL | |||||
SBCS Z1, Z3 \ | |||||
SBCS T1, Z4 \ | |||||
MUL Y3, X2, X1 \ | |||||
UMULH Y3, X2, Z6 \ | |||||
SBCS Z7, Z5 \ | |||||
SBCS ZR, T0 \ | |||||
\ // xH * yH | |||||
mul128x128comba(X2, X3, Y2, Y3, X0, X1, Z6, Y0, Y1)\ | |||||
\ | |||||
SUBS X0, Z2 \ // (xH + xL) * (yH + yL) - xL * yL - xH * yH | |||||
SBCS X1, Z3 \ | |||||
SBCS Z6, Z4 \ | |||||
SBCS Y0, Z5 \ | |||||
SBCS ZR, T0 \ | |||||
\ | |||||
ADDS T1, Z2 \ // (xH * yH) * 2^256 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^128 + xL * yL | |||||
ADCS Z7, Z3 \ | |||||
ADCS X0, Z4 \ | |||||
ADCS X1, Z5 \ | |||||
ADCS T0, Z6 \ | |||||
ADC Y0, ZR, Z7 | |||||
// This implements two-level Karatsuba with a 128x128 Comba multiplier | |||||
// at the bottom | |||||
TEXT ·fp503Mul(SB), NOSPLIT, $0-24 | |||||
MOVD z+0(FP), R2 | |||||
MOVD x+8(FP), R0 | |||||
MOVD y+16(FP), R1 | |||||
// Load xL in R3-R6, xH in R7-R10 | |||||
// (xH + xL) in R25-R29 | |||||
LDP 0(R0), (R3, R4) | |||||
LDP 32(R0), (R7, R8) | |||||
ADDS R3, R7, R25 | |||||
ADCS R4, R8, R26 | |||||
LDP 16(R0), (R5, R6) | |||||
LDP 48(R0), (R9, R10) | |||||
ADCS R5, R9, R27 | |||||
ADCS R6, R10, R29 | |||||
ADC ZR, ZR, R7 | |||||
// Load yL in R11-R14, yH in R15-19 | |||||
// (yH + yL) in R11-R14, destroys yL | |||||
LDP 0(R1), (R11, R12) | |||||
LDP 32(R1), (R15, R16) | |||||
ADDS R15, R11 | |||||
ADCS R16, R12 | |||||
LDP 16(R1), (R13, R14) | |||||
LDP 48(R1), (R17, R19) | |||||
ADCS R17, R13 | |||||
ADCS R19, R14 | |||||
ADC ZR, ZR, R8 | |||||
// Compute maskes and combined carry | |||||
SUB R7, ZR, R9 | |||||
SUB R8, ZR, R10 | |||||
AND R8, R7 | |||||
// masked(yH + yL) | |||||
AND R9, R11, R15 | |||||
AND R9, R12, R16 | |||||
AND R9, R13, R17 | |||||
AND R9, R14, R19 | |||||
// masked(xH + xL) | |||||
AND R10, R25, R20 | |||||
AND R10, R26, R21 | |||||
AND R10, R27, R22 | |||||
AND R10, R29, R23 | |||||
// masked(xH + xL) + masked(yH + yL) in R15-R19 | |||||
ADDS R20, R15 | |||||
ADCS R21, R16 | |||||
ADCS R22, R17 | |||||
ADCS R23, R19 | |||||
ADC ZR, R7 | |||||
// Use z as temporary storage | |||||
STP (R25, R26), 0(R2) | |||||
// (xH + xL) * (yH + yL) | |||||
mul256x256karatsuba(0(R2), R25, R26, R27, R29, R11, R12, R13, R14, R8, R9, R10, R20, R21, R22, R23, R24, R0, R1) | |||||
MOVD x+8(FP), R0 | |||||
MOVD y+16(FP), R1 | |||||
ADDS R21, R15 | |||||
ADCS R22, R16 | |||||
ADCS R23, R17 | |||||
ADCS R24, R19 | |||||
ADC ZR, R7 | |||||
// Load yL in R11-R14 | |||||
LDP 0(R1), (R11, R12) | |||||
LDP 16(R1), (R13, R14) | |||||
// xL * yL | |||||
mul256x256karatsuba(0(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R1, R2) | |||||
MOVD z+0(FP), R2 | |||||
MOVD y+16(FP), R1 | |||||
// (xH + xL) * (yH + yL) - xL * yL | |||||
SUBS R21, R8 | |||||
SBCS R22, R9 | |||||
STP (R21, R22), 0(R2) | |||||
SBCS R23, R10 | |||||
SBCS R24, R20 | |||||
STP (R23, R24), 16(R2) | |||||
SBCS R25, R15 | |||||
SBCS R26, R16 | |||||
SBCS R27, R17 | |||||
SBCS R29, R19 | |||||
SBC ZR, R7 | |||||
// Load xH in R3-R6, yH in R11-R14 | |||||
LDP 32(R0), (R3, R4) | |||||
LDP 48(R0), (R5, R6) | |||||
LDP 32(R1), (R11, R12) | |||||
LDP 48(R1), (R13, R14) | |||||
ADDS R25, R8 | |||||
ADCS R26, R9 | |||||
ADCS R27, R10 | |||||
ADCS R29, R20 | |||||
ADC ZR, ZR, R1 | |||||
MOVD R20, 32(R2) | |||||
// xH * yH | |||||
mul256x256karatsuba(32(R0), R3, R4, R5, R6, R11, R12, R13, R14, R21, R22, R23, R24, R25, R26, R27, R29, R2, R20) | |||||
NEG R1, R1 | |||||
MOVD z+0(FP), R2 | |||||
MOVD 32(R2), R20 | |||||
// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R8-R10,R20,R15-R19 | |||||
// Store lower half in z, that's done | |||||
SUBS R21, R8 | |||||
SBCS R22, R9 | |||||
STP (R8, R9), 32(R2) | |||||
SBCS R23, R10 | |||||
SBCS R24, R20 | |||||
STP (R10, R20), 48(R2) | |||||
SBCS R25, R15 | |||||
SBCS R26, R16 | |||||
SBCS R27, R17 | |||||
SBCS R29, R19 | |||||
SBC ZR, R7 | |||||
// (xH * yH) * 2^512 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^256 + xL * yL | |||||
// Store remaining limbs in z | |||||
ADDS $1, R1 | |||||
ADCS R21, R15 | |||||
ADCS R22, R16 | |||||
STP (R15, R16), 64(R2) | |||||
ADCS R23, R17 | |||||
ADCS R24, R19 | |||||
STP (R17, R19), 80(R2) | |||||
ADCS R7, R25 | |||||
ADCS ZR, R26 | |||||
STP (R25, R26), 96(R2) | |||||
ADCS ZR, R27 | |||||
ADC ZR, R29 | |||||
STP (R27, R29), 112(R2) | |||||
RET | |||||
// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high) | |||||
// Z0 is not actually touched | |||||
// Result of (X0-X1) * (Y0-Y3) will be in Z0-Z5 | |||||
// Inputs remain intact | |||||
#define mul128x256comba(X0, X1, Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3)\ | |||||
MUL X1, Y0, T0 \ | |||||
UMULH X1, Y0, T1 \ | |||||
ADDS Z3, Z1 \ | |||||
ADC ZR, Z2 \ | |||||
\ | |||||
MUL X0, Y2, T2 \ | |||||
UMULH X0, Y2, T3 \ | |||||
ADDS T0, Z1 \ | |||||
ADCS T1, Z2 \ | |||||
ADC ZR, ZR, Z3 \ | |||||
\ | |||||
MUL X1, Y1, T0 \ | |||||
UMULH X1, Y1, T1 \ | |||||
ADDS T2, Z2 \ | |||||
ADCS T3, Z3 \ | |||||
ADC ZR, ZR, Z4 \ | |||||
\ | |||||
MUL X0, Y3, T2 \ | |||||
UMULH X0, Y3, T3 \ | |||||
ADDS T0, Z2 \ | |||||
ADCS T1, Z3 \ | |||||
ADC ZR, Z4 \ | |||||
\ | |||||
MUL X1, Y2, T0 \ | |||||
UMULH X1, Y2, T1 \ | |||||
ADDS T2, Z3 \ | |||||
ADCS T3, Z4 \ | |||||
ADC ZR, ZR, Z5 \ | |||||
\ | |||||
MUL X1, Y3, T2 \ | |||||
UMULH X1, Y3, T3 \ | |||||
ADDS T0, Z3 \ | |||||
ADCS T1, Z4 \ | |||||
ADC ZR, Z5 \ | |||||
ADDS T2, Z4 \ | |||||
ADC T3, Z5 | |||||
// This implements the shifted 2^(B*w) Montgomery reduction from | |||||
// https://eprint.iacr.org/2016/986.pdf, section Section 3.2, with | |||||
// B = 4, w = 64. Performance results were reported in | |||||
// https://eprint.iacr.org/2018/700.pdf Section 6. | |||||
TEXT ·fp503MontgomeryReduce(SB), NOSPLIT, $0-16 | |||||
MOVD x+8(FP), R0 | |||||
// Load x0-x1 | |||||
LDP 0(R0), (R2, R3) | |||||
// Load the prime constant in R25-R29 | |||||
LDP ·p503p1s8+32(SB), (R25, R26) | |||||
LDP ·p503p1s8+48(SB), (R27, R29) | |||||
// [x0,x1] * p503p1s8 to R4-R9 | |||||
MUL R2, R25, R4 // x0 * p503p1s8[0] | |||||
UMULH R2, R25, R7 | |||||
MUL R2, R26, R5 // x0 * p503p1s8[1] | |||||
UMULH R2, R26, R6 | |||||
mul128x256comba(R2, R3, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13) | |||||
LDP 16(R0), (R3, R11) // x2 | |||||
LDP 32(R0), (R12, R13) | |||||
LDP 48(R0), (R14, R15) | |||||
// Left-shift result in R4-R9 by 56 to R4-R10 | |||||
ORR R9>>8, ZR, R10 | |||||
LSL $56, R9 | |||||
ORR R8>>8, R9 | |||||
LSL $56, R8 | |||||
ORR R7>>8, R8 | |||||
LSL $56, R7 | |||||
ORR R6>>8, R7 | |||||
LSL $56, R6 | |||||
ORR R5>>8, R6 | |||||
LSL $56, R5 | |||||
ORR R4>>8, R5 | |||||
LSL $56, R4 | |||||
ADDS R4, R11 // x3 | |||||
ADCS R5, R12 // x4 | |||||
ADCS R6, R13 | |||||
ADCS R7, R14 | |||||
ADCS R8, R15 | |||||
LDP 64(R0), (R16, R17) | |||||
LDP 80(R0), (R19, R20) | |||||
MUL R3, R25, R4 // x2 * p503p1s8[0] | |||||
UMULH R3, R25, R7 | |||||
ADCS R9, R16 | |||||
ADCS R10, R17 | |||||
ADCS ZR, R19 | |||||
ADCS ZR, R20 | |||||
LDP 96(R0), (R21, R22) | |||||
LDP 112(R0), (R23, R24) | |||||
MUL R3, R26, R5 // x2 * p503p1s8[1] | |||||
UMULH R3, R26, R6 | |||||
ADCS ZR, R21 | |||||
ADCS ZR, R22 | |||||
ADCS ZR, R23 | |||||
ADC ZR, R24 | |||||
// [x2,x3] * p503p1s8 to R4-R9 | |||||
mul128x256comba(R3, R11, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2) | |||||
ORR R9>>8, ZR, R10 | |||||
LSL $56, R9 | |||||
ORR R8>>8, R9 | |||||
LSL $56, R8 | |||||
ORR R7>>8, R8 | |||||
LSL $56, R7 | |||||
ORR R6>>8, R7 | |||||
LSL $56, R6 | |||||
ORR R5>>8, R6 | |||||
LSL $56, R5 | |||||
ORR R4>>8, R5 | |||||
LSL $56, R4 | |||||
ADDS R4, R13 // x5 | |||||
ADCS R5, R14 // x6 | |||||
ADCS R6, R15 | |||||
ADCS R7, R16 | |||||
MUL R12, R25, R4 // x4 * p503p1s8[0] | |||||
UMULH R12, R25, R7 | |||||
ADCS R8, R17 | |||||
ADCS R9, R19 | |||||
ADCS R10, R20 | |||||
ADCS ZR, R21 | |||||
MUL R12, R26, R5 // x4 * p503p1s8[1] | |||||
UMULH R12, R26, R6 | |||||
ADCS ZR, R22 | |||||
ADCS ZR, R23 | |||||
ADC ZR, R24 | |||||
// [x4,x5] * p503p1s8 to R4-R9 | |||||
mul128x256comba(R12, R13, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2) | |||||
ORR R9>>8, ZR, R10 | |||||
LSL $56, R9 | |||||
ORR R8>>8, R9 | |||||
LSL $56, R8 | |||||
ORR R7>>8, R8 | |||||
LSL $56, R7 | |||||
ORR R6>>8, R7 | |||||
LSL $56, R6 | |||||
ORR R5>>8, R6 | |||||
LSL $56, R5 | |||||
ORR R4>>8, R5 | |||||
LSL $56, R4 | |||||
ADDS R4, R15 // x7 | |||||
ADCS R5, R16 // x8 | |||||
ADCS R6, R17 | |||||
ADCS R7, R19 | |||||
MUL R14, R25, R4 // x6 * p503p1s8[0] | |||||
UMULH R14, R25, R7 | |||||
ADCS R8, R20 | |||||
ADCS R9, R21 | |||||
ADCS R10, R22 | |||||
MUL R14, R26, R5 // x6 * p503p1s8[1] | |||||
UMULH R14, R26, R6 | |||||
ADCS ZR, R23 | |||||
ADC ZR, R24 | |||||
// [x6,x7] * p503p1s8 to R4-R9 | |||||
mul128x256comba(R14, R15, R25, R26, R27, R29, R4, R5, R6, R7, R8, R9, R10, R0, R1, R2) | |||||
ORR R9>>8, ZR, R10 | |||||
LSL $56, R9 | |||||
ORR R8>>8, R9 | |||||
LSL $56, R8 | |||||
ORR R7>>8, R8 | |||||
LSL $56, R7 | |||||
ORR R6>>8, R7 | |||||
LSL $56, R6 | |||||
ORR R5>>8, R6 | |||||
LSL $56, R5 | |||||
ORR R4>>8, R5 | |||||
LSL $56, R4 | |||||
MOVD z+0(FP), R0 | |||||
ADDS R4, R17 | |||||
ADCS R5, R19 | |||||
STP (R16, R17), 0(R0) // Store final result to z | |||||
ADCS R6, R20 | |||||
ADCS R7, R21 | |||||
STP (R19, R20), 16(R0) | |||||
ADCS R8, R22 | |||||
ADCS R9, R23 | |||||
STP (R21, R22), 32(R0) | |||||
ADC R10, R24 | |||||
STP (R23, R24), 48(R0) | |||||
RET | |||||
TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8 | |||||
MOVD x+0(FP), R0 | |||||
// Keep x in R1-R8, p503 in R9-R14, subtract to R1-R8 | |||||
LDP ·p503+16(SB), (R9, R10) | |||||
LDP 0(R0), (R1, R2) | |||||
LDP 16(R0), (R3, R4) | |||||
SUBS R9, R1 | |||||
SBCS R9, R2 | |||||
LDP 32(R0), (R5, R6) | |||||
LDP ·p503+32(SB), (R11, R12) | |||||
SBCS R9, R3 | |||||
SBCS R10, R4 | |||||
LDP 48(R0), (R7, R8) | |||||
LDP ·p503+48(SB), (R13, R14) | |||||
SBCS R11, R5 | |||||
SBCS R12, R6 | |||||
SBCS R13, R7 | |||||
SBCS R14, R8 | |||||
SBC ZR, ZR, R15 | |||||
// Mask with the borrow and add p503 | |||||
AND R15, R9 | |||||
AND R15, R10 | |||||
AND R15, R11 | |||||
AND R15, R12 | |||||
AND R15, R13 | |||||
AND R15, R14 | |||||
ADDS R9, R1 | |||||
ADCS R9, R2 | |||||
STP (R1, R2), 0(R0) | |||||
ADCS R9, R3 | |||||
ADCS R10, R4 | |||||
STP (R3, R4), 16(R0) | |||||
ADCS R11, R5 | |||||
ADCS R12, R6 | |||||
STP (R5, R6), 32(R0) | |||||
ADCS R13, R7 | |||||
ADCS R14, R8 | |||||
STP (R7, R8), 48(R0) | |||||
RET |
@@ -1,12 +1,9 @@ | |||||
// +build amd64,!noasm | |||||
// +build amd64,!noasm arm64,!noasm | |||||
package p503 | package p503 | ||||
import ( | import ( | ||||
. "github.com/cloudflare/sidh/internal/isogeny" | . "github.com/cloudflare/sidh/internal/isogeny" | ||||
// This is imported only because arith_amd64.s needs | |||||
// some symbols from cpuid.go | |||||
_ "github.com/cloudflare/sidh/internal/utils" | |||||
) | ) | ||||
// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. | // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. | ||||
@@ -1,4 +1,4 @@ | |||||
// +build noasm !amd64 | |||||
// +build noasm !amd64,!arm64 | |||||
package p503 | package p503 | ||||
@@ -145,6 +145,12 @@ var p503R2 = FpElement{ | |||||
0x9E51998BD84D4423, 0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771, | 0x9E51998BD84D4423, 0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771, | ||||
} | } | ||||
// p503 + 1 left-shifted by 8, assuming little endianness | |||||
var p503p1s8 = FpElement{ | |||||
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, | |||||
0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13, 0x45C6BDDA77A4D01B, 0x4066F541811E1E60, | |||||
} | |||||
// 1*R mod p | // 1*R mod p | ||||
var P503_OneFp2 = Fp2Element{ | var P503_OneFp2 = Fp2Element{ | ||||
A: FpElement{ | A: FpElement{ | ||||
@@ -0,0 +1,16 @@ | |||||
// +build amd64 | |||||
package p751 | |||||
import cpu "github.com/cloudflare/sidh/internal/utils" | |||||
// Signals support for MULX which is in BMI2 | |||||
var HasBMI2 bool | |||||
// Signals support for ADX and BMI2 | |||||
var HasADXandBMI2 bool | |||||
func init() { | |||||
HasBMI2 = cpu.HasBMI2 | |||||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||||
} |
@@ -1740,9 +1740,9 @@ TEXT ·fp751MontgomeryReduce(SB), $0-16 | |||||
MOVQ x+8(FP), REG_P1 | MOVQ x+8(FP), REG_P1 | ||||
// Check wether to use optimized implementation | // Check wether to use optimized implementation | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | |||||
CMPB ·HasADXandBMI2(SB), $1 | |||||
JE redc_with_mulx_adcx_adox | JE redc_with_mulx_adcx_adox | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | |||||
CMPB ·HasBMI2(SB), $1 | |||||
JE redc_with_mulx | JE redc_with_mulx | ||||
MOVQ (REG_P1), R11 | MOVQ (REG_P1), R11 | ||||
@@ -22,22 +22,27 @@ const ( | |||||
kUse_MULXandADxX = 1 << 2 | kUse_MULXandADxX = 1 << 2 | ||||
) | ) | ||||
func resetCpuFeatures() { | |||||
HasBMI2 = cpu.HasBMI2 | |||||
HasADXandBMI2 = cpu.HasBMI2 && cpu.HasADX | |||||
} | |||||
// Utility function used for testing REDC implementations. Tests caller provided | // Utility function used for testing REDC implementations. Tests caller provided | ||||
// redcFunc against redc() | // redcFunc against redc() | ||||
func testRedc(t *testing.T, f1, f2 OptimFlag) { | func testRedc(t *testing.T, f1, f2 OptimFlag) { | ||||
doRedcTest := func(aRR FpElementX2) bool { | doRedcTest := func(aRR FpElementX2) bool { | ||||
defer cpu.RecognizeCpu() | |||||
defer resetCpuFeatures() | |||||
var resRedcF1, resRedcF2 FpElement | var resRedcF1, resRedcF2 FpElement | ||||
var aRRcpy = aRR | var aRRcpy = aRR | ||||
// Compute redc with first implementation | // Compute redc with first implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
HasBMI2 = (kUse_MULX & f1) == kUse_MULX | |||||
HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
fp751MontgomeryReduce(&resRedcF1, &aRR) | fp751MontgomeryReduce(&resRedcF1, &aRR) | ||||
// Compute redc with second implementation | // Compute redc with second implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
HasBMI2 = (kUse_MULX & f2) == kUse_MULX | |||||
HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
fp751MontgomeryReduce(&resRedcF2, &aRRcpy) | fp751MontgomeryReduce(&resRedcF2, &aRRcpy) | ||||
// Compare results | // Compare results | ||||
@@ -51,8 +56,8 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | // Ensures correctness of Montgomery reduction implementation which uses MULX | ||||
func TestRedcWithMULX(t *testing.T) { | func TestRedcWithMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasBMI2 { | |||||
t.Skip("MULX not supported by the platform") | t.Skip("MULX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_MULX, kUse_MUL) | testRedc(t, kUse_MULX, kUse_MUL) | ||||
@@ -61,8 +66,8 @@ func TestRedcWithMULX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | // Ensures correctness of Montgomery reduction implementation which uses MULX | ||||
// and ADCX/ADOX. | // and ADCX/ADOX. | ||||
func TestRedcWithMULXADxX(t *testing.T) { | func TestRedcWithMULXADxX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasADXandBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasADXandBMI2 { | |||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_MULXandADxX, kUse_MUL) | testRedc(t, kUse_MULXandADxX, kUse_MUL) | ||||
@@ -71,8 +76,8 @@ func TestRedcWithMULXADxX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | // Ensures correctness of Montgomery reduction implementation which uses MULX | ||||
// and ADCX/ADOX. | // and ADCX/ADOX. | ||||
func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | |||||
if !cpu.HasADXandBMI2 { | |||||
defer resetCpuFeatures() | |||||
if !HasADXandBMI2 { | |||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_MULXandADxX, kUse_MULX) | testRedc(t, kUse_MULXandADxX, kUse_MULX) | ||||
@@ -1,4 +1,4 @@ | |||||
// +build amd64,!noasm | |||||
// +build amd64,!noasm arm64,!noasm | |||||
package p751 | package p751 | ||||
@@ -39,9 +39,7 @@ func fp751X2SubLazy(z, x, y *FpElementX2) | |||||
//go:noescape | //go:noescape | ||||
func fp751Mul(z *FpElementX2, x, y *FpElement) | func fp751Mul(z *FpElementX2, x, y *FpElement) | ||||
// Function pointer that should point to one of the | |||||
// fp751MontgomeryReduce implementations below. | |||||
// When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p). | |||||
// Compute Montgomery reduction: set z = x * R^{-1} (mod 2*p). | |||||
// It may destroy the input value. | // It may destroy the input value. | ||||
//go:noescape | //go:noescape | ||||
func fp751MontgomeryReduce(z *FpElement, x *FpElementX2) | func fp751MontgomeryReduce(z *FpElement, x *FpElementX2) | ||||
@@ -1,4 +1,4 @@ | |||||
// +build noasm !amd64 | |||||
// +build noasm !amd64,!arm64 | |||||
package p751 | package p751 | ||||