diff --git a/Makefile b/Makefile index 1bc145a..1f1ed6c 100644 --- a/Makefile +++ b/Makefile @@ -10,12 +10,16 @@ NOASM ?= TEST_PATH ?= ./... GOCACHE ?= off BENCH_OPTS ?= -v -bench=. -run="NonExistingTest" - +TEST_PATH ?= ./... ifeq ($(NOASM),1) OPTS+=$(OPTS_TAGS) endif +ifeq ($(PPROF),1) + BENCH_OPTS+= -cpuprofile=cpu.out -memprofile=mem0.out +endif + TARGETS= \ dh \ drbg \ @@ -32,16 +36,16 @@ make_dirs: cp -rf etc $(GOPATH_LOCAL)/$(GOPATH_DIR) test: clean make_dirs $(addprefix prep-,$(TARGETS)) - cd $(GOPATH_LOCAL); GOPATH=$(GOPATH_LOCAL) go test $(OPTS) -v ./... + cd $(GOPATH_LOCAL); GOPATH=$(GOPATH_LOCAL) go test $(OPTS) -v $(TEST_PATH) cover: cd $(GOPATH_LOCAL); GOPATH=$(GOPATH_LOCAL) go test \ - -race -coverprofile=coverage_$(NOASM).txt -covermode=atomic $(OPTS) -v ./... + -race -coverprofile=coverage_$(NOASM).txt -covermode=atomic $(OPTS) -v $(TEST_PATH) cat $(GOPATH_LOCAL)/coverage_$(NOASM).txt >> coverage.txt bench: clean $(addprefix prep-,$(TARGETS)) cd $(GOPATH_LOCAL); GOCACHE=$(GOCACHE) GOPATH=$(GOPATH_LOCAL) $(GO) test \ - $(BENCH_OPTS) ./... + $(BENCH_OPTS) $(TEST_PATH) clean: rm -rf $(GOPATH_LOCAL) @@ -51,3 +55,9 @@ vendor-sidh-for-tls: clean mkdir -p $(VENDOR_DIR)/github_com/henrydcase/nobs/ rsync -a . 
$(VENDOR_DIR)/github_com/henrydcase/nobs/ --exclude=$(VENDOR_DIR) --exclude=.git --exclude=.travis.yml --exclude=README.md find $(VENDOR_DIR) -type f -print0 -name "*.go" | xargs -0 sed -i 's/github\.com/github_com/g' + +pprof-cpu: + $(GO) tool pprof $(GOPATH_LOCAL)/cpu.out + +pprof-mem: + $(GO) tool pprof $(GOPATH_LOCAL)/mem0.out diff --git a/dh/sidh/p503/arith_amd64.s b/dh/sidh/p503/arith_amd64.s index 0808e88..5968754 100644 --- a/dh/sidh/p503/arith_amd64.s +++ b/dh/sidh/p503/arith_amd64.s @@ -90,31 +90,42 @@ TEXT ·fp503ConditionalSwap(SB),NOSPLIT,$0-17 MOVQ x+0(FP), REG_P1 MOVQ y+8(FP), REG_P2 - MOVB choice+16(FP), AL // AL = 0 or 1 - MOVBLZX AL, AX // AX = 0 or 1 - NEGQ AX // RAX = 0x00..00 or 0xff..ff + MOVBLZX choice+16(FP), AX // AX = 0 or 1 + + // Turn AX into a mask with either all bits set or none: + // AX = 0x00..00 or 0xff..ff + NEGQ AX + + // Fill XMM15. After this step the upper half of XMM15 is + // all zeros and the lower half holds AX. + MOVQ AX, X15 + + // Copy the lowest double word into every other lane, so that + // XMM15=EAX|EAX|EAX|EAX. As AX has either all bits set + // or none, the result is that XMM15 also has either + // all bits set or none of them. 
+ PSHUFD $0, X15, X15 #ifndef CSWAP_BLOCK #define CSWAP_BLOCK(idx) \ - MOVQ (idx*8)(REG_P1), BX \ // BX = x[idx] - MOVQ (idx*8)(REG_P2), CX \ // CX = y[idx] - MOVQ CX, DX \ // DX = y[idx] - XORQ BX, DX \ // DX = y[idx] ^ x[idx] - ANDQ AX, DX \ // DX = (y[idx] ^ x[idx]) & mask - XORQ DX, BX \ // BX = (y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx] - XORQ DX, CX \ // CX = (y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx] - MOVQ BX, (idx*8)(REG_P1) \ - MOVQ CX, (idx*8)(REG_P2) + MOVOU (idx*16)(REG_P1), X0 \ + MOVOU (idx*16)(REG_P2), X1 \ + \ // X2 = mask & (X0 ^ X1) + MOVO X1, X2 \ + PXOR X0, X2 \ + PAND X15, X2 \ + \ + PXOR X2, X0 \ + PXOR X2, X1 \ + \ + MOVOU X0, (idx*16)(REG_P1) \ + MOVOU X1, (idx*16)(REG_P2) #endif CSWAP_BLOCK(0) CSWAP_BLOCK(1) CSWAP_BLOCK(2) CSWAP_BLOCK(3) - CSWAP_BLOCK(4) - CSWAP_BLOCK(5) - CSWAP_BLOCK(6) - CSWAP_BLOCK(7) #ifdef CSWAP_BLOCK #undef CSWAP_BLOCK diff --git a/dh/sidh/p751/arith_amd64.s b/dh/sidh/p751/arith_amd64.s index 4596d8f..aefb448 100644 --- a/dh/sidh/p751/arith_amd64.s +++ b/dh/sidh/p751/arith_amd64.s @@ -126,130 +126,48 @@ TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17 MOVQ x+0(FP), REG_P1 MOVQ y+8(FP), REG_P2 - MOVB choice+16(FP), AL // AL = 0 or 1 - MOVBLZX AL, AX // AX = 0 or 1 - NEGQ AX // RAX = 0x00..00 or 0xff..ff + MOVBLZX choice+16(FP), AX // AX = 0 or 1 - MOVQ (0*8)(REG_P1), BX // BX = x[0] - MOVQ (0*8)(REG_P2), CX // CX = y[0] - MOVQ CX, DX // DX = y[0] - XORQ BX, DX // DX = y[0] ^ x[0] - ANDQ AX, DX // DX = (y[0] ^ x[0]) & mask - XORQ DX, BX // BX = (y[0] ^ x[0]) & mask) ^ x[0] = x[0] or y[0] - XORQ DX, CX // CX = (y[0] ^ x[0]) & mask) ^ y[0] = y[0] or x[0] - MOVQ BX, (0*8)(REG_P1) - MOVQ CX, (0*8)(REG_P2) + // Turn AX into a mask with either all bits set or none: + // AX = 0x00..00 or 0xff..ff + NEGQ AX - MOVQ (1*8)(REG_P1), BX - MOVQ (1*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (1*8)(REG_P1) - MOVQ CX, (1*8)(REG_P2) + // Fill XMM15. 
After this step the upper half of XMM15 is + // all zeros and the lower half holds AX. + MOVQ AX, X15 - MOVQ (2*8)(REG_P1), BX - MOVQ (2*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (2*8)(REG_P1) - MOVQ CX, (2*8)(REG_P2) + // Copy the lowest double word into every other lane, so that + // XMM15=EAX|EAX|EAX|EAX. As AX has either all bits set + // or none, the result is that XMM15 also has either + // all bits set or none of them. + PSHUFD $0, X15, X15 - MOVQ (3*8)(REG_P1), BX - MOVQ (3*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (3*8)(REG_P1) - MOVQ CX, (3*8)(REG_P2) +#ifndef CSWAP_BLOCK +#define CSWAP_BLOCK(idx) \ + MOVOU (idx*16)(REG_P1), X0 \ + MOVOU (idx*16)(REG_P2), X1 \ + \ // X2 = mask & (X0 ^ X1) + MOVO X1, X2 \ + PXOR X0, X2 \ + PAND X15, X2 \ + \ + PXOR X2, X0 \ + PXOR X2, X1 \ + \ + MOVOU X0, (idx*16)(REG_P1) \ + MOVOU X1, (idx*16)(REG_P2) +#endif - MOVQ (4*8)(REG_P1), BX - MOVQ (4*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (4*8)(REG_P1) - MOVQ CX, (4*8)(REG_P2) - - MOVQ (5*8)(REG_P1), BX - MOVQ (5*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (5*8)(REG_P1) - MOVQ CX, (5*8)(REG_P2) - - MOVQ (6*8)(REG_P1), BX - MOVQ (6*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (6*8)(REG_P1) - MOVQ CX, (6*8)(REG_P2) - - MOVQ (7*8)(REG_P1), BX - MOVQ (7*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (7*8)(REG_P1) - MOVQ CX, (7*8)(REG_P2) - - MOVQ (8*8)(REG_P1), BX - MOVQ (8*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (8*8)(REG_P1) - MOVQ CX, (8*8)(REG_P2) - - MOVQ (9*8)(REG_P1), BX - MOVQ (9*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (9*8)(REG_P1) - MOVQ CX, (9*8)(REG_P2) - - MOVQ 
(10*8)(REG_P1), BX - MOVQ (10*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (10*8)(REG_P1) - MOVQ CX, (10*8)(REG_P2) - - MOVQ (11*8)(REG_P1), BX - MOVQ (11*8)(REG_P2), CX - MOVQ CX, DX - XORQ BX, DX - ANDQ AX, DX - XORQ DX, BX - XORQ DX, CX - MOVQ BX, (11*8)(REG_P1) - MOVQ CX, (11*8)(REG_P2) + CSWAP_BLOCK(0) + CSWAP_BLOCK(1) + CSWAP_BLOCK(2) + CSWAP_BLOCK(3) + CSWAP_BLOCK(4) + CSWAP_BLOCK(5) +#ifdef CSWAP_BLOCK +#undef CSWAP_BLOCK +#endif RET TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24