mirror of
https://github.com/henrydcase/nobs.git
synced 2024-11-26 00:51:22 +00:00
sidh/csidh: use SSE for performing CSWAP (#6)
* Makefile * makefile: tools for profiling * sidh: use SIMD for performing CSWAP Loads data into 128-bit XMM registers and performs a conditional swap. This is probably less useful for SIDH, but will be useful for cSIDH.
This commit is contained in:
parent
a456dc4dd9
commit
e9ddb6fb45
18
Makefile
18
Makefile
@ -10,12 +10,16 @@ NOASM ?=
|
|||||||
TEST_PATH ?= ./...
|
TEST_PATH ?= ./...
|
||||||
GOCACHE ?= off
|
GOCACHE ?= off
|
||||||
BENCH_OPTS ?= -v -bench=. -run="NonExistingTest"
|
BENCH_OPTS ?= -v -bench=. -run="NonExistingTest"
|
||||||
|
TEST_PATH ?= ./...
|
||||||
|
|
||||||
ifeq ($(NOASM),1)
|
ifeq ($(NOASM),1)
|
||||||
OPTS+=$(OPTS_TAGS)
|
OPTS+=$(OPTS_TAGS)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(PPROF),1)
|
||||||
|
BENCH_OPTS+= -cpuprofile=cpu.out -memprofile=mem0.out
|
||||||
|
endif
|
||||||
|
|
||||||
TARGETS= \
|
TARGETS= \
|
||||||
dh \
|
dh \
|
||||||
drbg \
|
drbg \
|
||||||
@ -32,16 +36,16 @@ make_dirs:
|
|||||||
cp -rf etc $(GOPATH_LOCAL)/$(GOPATH_DIR)
|
cp -rf etc $(GOPATH_LOCAL)/$(GOPATH_DIR)
|
||||||
|
|
||||||
test: clean make_dirs $(addprefix prep-,$(TARGETS))
|
test: clean make_dirs $(addprefix prep-,$(TARGETS))
|
||||||
cd $(GOPATH_LOCAL); GOPATH=$(GOPATH_LOCAL) go test $(OPTS) -v ./...
|
cd $(GOPATH_LOCAL); GOPATH=$(GOPATH_LOCAL) go test $(OPTS) -v $(TEST_PATH)
|
||||||
|
|
||||||
cover:
|
cover:
|
||||||
cd $(GOPATH_LOCAL); GOPATH=$(GOPATH_LOCAL) go test \
|
cd $(GOPATH_LOCAL); GOPATH=$(GOPATH_LOCAL) go test \
|
||||||
-race -coverprofile=coverage_$(NOASM).txt -covermode=atomic $(OPTS) -v ./...
|
-race -coverprofile=coverage_$(NOASM).txt -covermode=atomic $(OPTS) -v $(TEST_PATH)
|
||||||
cat $(GOPATH_LOCAL)/coverage_$(NOASM).txt >> coverage.txt
|
cat $(GOPATH_LOCAL)/coverage_$(NOASM).txt >> coverage.txt
|
||||||
|
|
||||||
bench: clean $(addprefix prep-,$(TARGETS))
|
bench: clean $(addprefix prep-,$(TARGETS))
|
||||||
cd $(GOPATH_LOCAL); GOCACHE=$(GOCACHE) GOPATH=$(GOPATH_LOCAL) $(GO) test \
|
cd $(GOPATH_LOCAL); GOCACHE=$(GOCACHE) GOPATH=$(GOPATH_LOCAL) $(GO) test \
|
||||||
$(BENCH_OPTS) ./...
|
$(BENCH_OPTS) $(TEST_PATH)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -rf $(GOPATH_LOCAL)
|
rm -rf $(GOPATH_LOCAL)
|
||||||
@ -51,3 +55,9 @@ vendor-sidh-for-tls: clean
|
|||||||
mkdir -p $(VENDOR_DIR)/github_com/henrydcase/nobs/
|
mkdir -p $(VENDOR_DIR)/github_com/henrydcase/nobs/
|
||||||
rsync -a . $(VENDOR_DIR)/github_com/henrydcase/nobs/ --exclude=$(VENDOR_DIR) --exclude=.git --exclude=.travis.yml --exclude=README.md
|
rsync -a . $(VENDOR_DIR)/github_com/henrydcase/nobs/ --exclude=$(VENDOR_DIR) --exclude=.git --exclude=.travis.yml --exclude=README.md
|
||||||
find $(VENDOR_DIR) -type f -print0 -name "*.go" | xargs -0 sed -i 's/github\.com/github_com/g'
|
find $(VENDOR_DIR) -type f -print0 -name "*.go" | xargs -0 sed -i 's/github\.com/github_com/g'
|
||||||
|
|
||||||
|
pprof-cpu:
|
||||||
|
$(GO) tool pprof $(GOPATH_LOCAL)/cpu.out
|
||||||
|
|
||||||
|
pprof-mem:
|
||||||
|
$(GO) tool pprof $(GOPATH_LOCAL)/mem0.out
|
||||||
|
@ -90,31 +90,42 @@ TEXT ·fp503ConditionalSwap(SB),NOSPLIT,$0-17
|
|||||||
|
|
||||||
MOVQ x+0(FP), REG_P1
|
MOVQ x+0(FP), REG_P1
|
||||||
MOVQ y+8(FP), REG_P2
|
MOVQ y+8(FP), REG_P2
|
||||||
MOVB choice+16(FP), AL // AL = 0 or 1
|
MOVBLZX choice+16(FP), AX // AL = 0 or 1
|
||||||
MOVBLZX AL, AX // AX = 0 or 1
|
|
||||||
NEGQ AX // RAX = 0x00..00 or 0xff..ff
|
// Turn AX into a mask, so that either all bits are set or none
|
||||||
|
// AX = 0 or 1
|
||||||
|
NEGQ AX
|
||||||
|
|
||||||
|
// Fill xmm15. After this step first half of XMM15 is
|
||||||
|
// just zeros and second (low) half is whatever is in AX
|
||||||
|
MOVQ AX, X15
|
||||||
|
|
||||||
|
// Copy lower double word everywhere else. So that
|
||||||
|
// XMM15=EAX|EAX|EAX|EAX. As AX has either all bits set
|
||||||
|
// or none, the result is that XMM15 also has either
|
||||||
|
// all bits set or none of them.
|
||||||
|
PSHUFD $0, X15, X15
|
||||||
|
|
||||||
#ifndef CSWAP_BLOCK
|
#ifndef CSWAP_BLOCK
|
||||||
#define CSWAP_BLOCK(idx) \
|
#define CSWAP_BLOCK(idx) \
|
||||||
MOVQ (idx*8)(REG_P1), BX \ // BX = x[idx]
|
MOVOU (idx*16)(REG_P1), X0 \
|
||||||
MOVQ (idx*8)(REG_P2), CX \ // CX = y[idx]
|
MOVOU (idx*16)(REG_P2), X1 \
|
||||||
MOVQ CX, DX \ // DX = y[idx]
|
\ // X2 = mask & (X0 ^ X1)
|
||||||
XORQ BX, DX \ // DX = y[idx] ^ x[idx]
|
MOVO X1, X2 \
|
||||||
ANDQ AX, DX \ // DX = (y[idx] ^ x[idx]) & mask
|
PXOR X0, X2 \
|
||||||
XORQ DX, BX \ // BX = (y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
|
PAND X15, X2 \
|
||||||
XORQ DX, CX \ // CX = (y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx]
|
\
|
||||||
MOVQ BX, (idx*8)(REG_P1) \
|
PXOR X2, X0 \
|
||||||
MOVQ CX, (idx*8)(REG_P2)
|
PXOR X2, X1 \
|
||||||
|
\
|
||||||
|
MOVOU X0, (idx*16)(REG_P1) \
|
||||||
|
MOVOU X1, (idx*16)(REG_P2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
CSWAP_BLOCK(0)
|
CSWAP_BLOCK(0)
|
||||||
CSWAP_BLOCK(1)
|
CSWAP_BLOCK(1)
|
||||||
CSWAP_BLOCK(2)
|
CSWAP_BLOCK(2)
|
||||||
CSWAP_BLOCK(3)
|
CSWAP_BLOCK(3)
|
||||||
CSWAP_BLOCK(4)
|
|
||||||
CSWAP_BLOCK(5)
|
|
||||||
CSWAP_BLOCK(6)
|
|
||||||
CSWAP_BLOCK(7)
|
|
||||||
|
|
||||||
#ifdef CSWAP_BLOCK
|
#ifdef CSWAP_BLOCK
|
||||||
#undef CSWAP_BLOCK
|
#undef CSWAP_BLOCK
|
||||||
|
@ -126,130 +126,48 @@ TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17
|
|||||||
|
|
||||||
MOVQ x+0(FP), REG_P1
|
MOVQ x+0(FP), REG_P1
|
||||||
MOVQ y+8(FP), REG_P2
|
MOVQ y+8(FP), REG_P2
|
||||||
MOVB choice+16(FP), AL // AL = 0 or 1
|
MOVBLZX choice+16(FP), AX // AL = 0 or 1
|
||||||
MOVBLZX AL, AX // AX = 0 or 1
|
|
||||||
NEGQ AX // RAX = 0x00..00 or 0xff..ff
|
|
||||||
|
|
||||||
MOVQ (0*8)(REG_P1), BX // BX = x[0]
|
// Turn AX into a mask, so that either all bits are set or none
|
||||||
MOVQ (0*8)(REG_P2), CX // CX = y[0]
|
// AX = 0 or 1
|
||||||
MOVQ CX, DX // DX = y[0]
|
NEGQ AX
|
||||||
XORQ BX, DX // DX = y[0] ^ x[0]
|
|
||||||
ANDQ AX, DX // DX = (y[0] ^ x[0]) & mask
|
|
||||||
XORQ DX, BX // BX = (y[0] ^ x[0]) & mask) ^ x[0] = x[0] or y[0]
|
|
||||||
XORQ DX, CX // CX = (y[0] ^ x[0]) & mask) ^ y[0] = y[0] or x[0]
|
|
||||||
MOVQ BX, (0*8)(REG_P1)
|
|
||||||
MOVQ CX, (0*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (1*8)(REG_P1), BX
|
// Fill xmm15. After this step first half of XMM15 is
|
||||||
MOVQ (1*8)(REG_P2), CX
|
// just zeros and second (low) half is whatever is in AX
|
||||||
MOVQ CX, DX
|
MOVQ AX, X15
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (1*8)(REG_P1)
|
|
||||||
MOVQ CX, (1*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (2*8)(REG_P1), BX
|
// Copy lower double word everywhere else. So that
|
||||||
MOVQ (2*8)(REG_P2), CX
|
// XMM15=EAX|EAX|EAX|EAX. As AX has either all bits set
|
||||||
MOVQ CX, DX
|
// or none, the result is that XMM15 also has either
|
||||||
XORQ BX, DX
|
// all bits set or none of them.
|
||||||
ANDQ AX, DX
|
PSHUFD $0, X15, X15
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (2*8)(REG_P1)
|
|
||||||
MOVQ CX, (2*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (3*8)(REG_P1), BX
|
#ifndef CSWAP_BLOCK
|
||||||
MOVQ (3*8)(REG_P2), CX
|
#define CSWAP_BLOCK(idx) \
|
||||||
MOVQ CX, DX
|
MOVOU (idx*16)(REG_P1), X0 \
|
||||||
XORQ BX, DX
|
MOVOU (idx*16)(REG_P2), X1 \
|
||||||
ANDQ AX, DX
|
\ // X2 = mask & (X0 ^ X1)
|
||||||
XORQ DX, BX
|
MOVO X1, X2 \
|
||||||
XORQ DX, CX
|
PXOR X0, X2 \
|
||||||
MOVQ BX, (3*8)(REG_P1)
|
PAND X15, X2 \
|
||||||
MOVQ CX, (3*8)(REG_P2)
|
\
|
||||||
|
PXOR X2, X0 \
|
||||||
|
PXOR X2, X1 \
|
||||||
|
\
|
||||||
|
MOVOU X0, (idx*16)(REG_P1) \
|
||||||
|
MOVOU X1, (idx*16)(REG_P2)
|
||||||
|
#endif
|
||||||
|
|
||||||
MOVQ (4*8)(REG_P1), BX
|
CSWAP_BLOCK(0)
|
||||||
MOVQ (4*8)(REG_P2), CX
|
CSWAP_BLOCK(1)
|
||||||
MOVQ CX, DX
|
CSWAP_BLOCK(2)
|
||||||
XORQ BX, DX
|
CSWAP_BLOCK(3)
|
||||||
ANDQ AX, DX
|
CSWAP_BLOCK(4)
|
||||||
XORQ DX, BX
|
CSWAP_BLOCK(5)
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (4*8)(REG_P1)
|
|
||||||
MOVQ CX, (4*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (5*8)(REG_P1), BX
|
|
||||||
MOVQ (5*8)(REG_P2), CX
|
|
||||||
MOVQ CX, DX
|
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (5*8)(REG_P1)
|
|
||||||
MOVQ CX, (5*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (6*8)(REG_P1), BX
|
|
||||||
MOVQ (6*8)(REG_P2), CX
|
|
||||||
MOVQ CX, DX
|
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (6*8)(REG_P1)
|
|
||||||
MOVQ CX, (6*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (7*8)(REG_P1), BX
|
|
||||||
MOVQ (7*8)(REG_P2), CX
|
|
||||||
MOVQ CX, DX
|
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (7*8)(REG_P1)
|
|
||||||
MOVQ CX, (7*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (8*8)(REG_P1), BX
|
|
||||||
MOVQ (8*8)(REG_P2), CX
|
|
||||||
MOVQ CX, DX
|
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (8*8)(REG_P1)
|
|
||||||
MOVQ CX, (8*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (9*8)(REG_P1), BX
|
|
||||||
MOVQ (9*8)(REG_P2), CX
|
|
||||||
MOVQ CX, DX
|
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (9*8)(REG_P1)
|
|
||||||
MOVQ CX, (9*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (10*8)(REG_P1), BX
|
|
||||||
MOVQ (10*8)(REG_P2), CX
|
|
||||||
MOVQ CX, DX
|
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (10*8)(REG_P1)
|
|
||||||
MOVQ CX, (10*8)(REG_P2)
|
|
||||||
|
|
||||||
MOVQ (11*8)(REG_P1), BX
|
|
||||||
MOVQ (11*8)(REG_P2), CX
|
|
||||||
MOVQ CX, DX
|
|
||||||
XORQ BX, DX
|
|
||||||
ANDQ AX, DX
|
|
||||||
XORQ DX, BX
|
|
||||||
XORQ DX, CX
|
|
||||||
MOVQ BX, (11*8)(REG_P1)
|
|
||||||
MOVQ CX, (11*8)(REG_P2)
|
|
||||||
|
|
||||||
|
#ifdef CSWAP_BLOCK
|
||||||
|
#undef CSWAP_BLOCK
|
||||||
|
#endif
|
||||||
RET
|
RET
|
||||||
|
|
||||||
TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24
|
TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24
|
||||||
|
Loading…
Reference in New Issue
Block a user