ca4971cbae
We no longer need to fork them. This is in preparation for pulling it via Go modules, but probably need to figure out the network issue first. Slightly bad manners for CI to do that. Change-Id: Ic258264f3c3559817d5e4921e4ad3282e94d05fe Reviewed-on: https://boringssl-review.googlesource.com/31904 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
401 lines
8.8 KiB
ArmAsm
401 lines
8.8 KiB
ArmAsm
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x,go1.11,!gccgo,!appengine

#include "textflag.h"
|
|
|
|
// Implementation of Poly1305 using the vector facility (vx).

// Register aliases used throughout this file. A suffix _0.._4 names one of
// the five limbs of a value. The vector instructions used here operate on two
// 64-bit lanes per register, so two values are processed side by side.

// constants (loaded from ·constants<> at the start of poly1305vx)
#define MOD26 V0
#define EX0   V1
#define EX1   V2
#define EX2   V3

// temporaries (overwritten by the MULTIPLY and REDUCE macros)
#define T_0 V4
#define T_1 V5
#define T_2 V6
#define T_3 V7
#define T_4 V8

// key (r): limbs of the current multiplier, and limbs 1..4 multiplied by 5
#define R_0  V9
#define R_1  V10
#define R_2  V11
#define R_3  V12
#define R_4  V13
#define R5_1 V14
#define R5_2 V15
#define R5_3 V16
#define R5_4 V17

// saved copy of r for the final block(s): the limbs of r live in
// general-purpose registers, the limbs of 5*r in vector registers
#define RSAVE_0  R5
#define RSAVE_1  R6
#define RSAVE_2  R7
#define RSAVE_3  R8
#define RSAVE_4  R9
#define R5SAVE_1 V28
#define R5SAVE_2 V29
#define R5SAVE_3 V30
#define R5SAVE_4 V31

// message block
#define F_0 V18
#define F_1 V19
#define F_2 V20
#define F_3 V21
#define F_4 V22

// accumulator
#define H_0 V23
#define H_1 V24
#define H_2 V25
#define H_3 V26
#define H_4 V27

// keyMask is ANDed with the first 16 bytes of the key to produce r
// (see "setup r" in poly1305vx).
GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f

// bswapMask is a VPERM selector that reverses the 16 bytes of a vector.
GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100

// constants holds four 16-byte vectors that poly1305vx loads, in order, into
// MOD26, EX0, EX1 and EX2 with a single VLM.
GLOBL ·constants<>(SB), RODATA, $64
// MOD26: 2**26-1 in each 64-bit lane (limb mask)
DATA ·constants<>+0(SB)/8, $0x3ffffff
DATA ·constants<>+8(SB)/8, $0x3ffffff
// EX0: VPERM selector used by EXPAND
DATA ·constants<>+16(SB)/8, $0x0006050403020100
DATA ·constants<>+24(SB)/8, $0x1016151413121110
// EX1: VPERM selector used by EXPAND
DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
// EX2: VPERM selector used by EXPAND
DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d

// MULTIPLY computes h = (f*g) % (2**130-5) [partial reduction].
//
// f, g and h are five-limb values. g51..g54 must hold 5*g1..5*g4; they are
// used for the product terms that wrap around the modulus:
//   h0 = f0*g0 + f1*g54 + f2*g53 + f3*g52 + f4*g51
//   h1 = f0*g1 + f1*g0  + f2*g54 + f3*g53 + f4*g52
//   h2 = f0*g2 + f1*g1  + f2*g0  + f3*g54 + f4*g53
//   h3 = f0*g3 + f1*g2  + f2*g1  + f3*g0  + f4*g54
//   h4 = f0*g4 + f1*g3  + f2*g2  + f3*g1  + f4*g0
// No carries are propagated between the limbs of h; follow with REDUCE.
//
// Clobbers T_0..T_4. h0..h4 are written before all of f and g have been
// read, so they must not alias any of the input registers.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
	VMLOF  f0, g0, h0        \
	VMLOF  f0, g1, h1        \
	VMLOF  f0, g2, h2        \
	VMLOF  f0, g3, h3        \
	VMLOF  f0, g4, h4        \
	VMLOF  f1, g54, T_0      \
	VMLOF  f1, g0, T_1       \
	VMLOF  f1, g1, T_2       \
	VMLOF  f1, g2, T_3       \
	VMLOF  f1, g3, T_4       \
	VMALOF f2, g53, h0, h0   \
	VMALOF f2, g54, h1, h1   \
	VMALOF f2, g0, h2, h2    \
	VMALOF f2, g1, h3, h3    \
	VMALOF f2, g2, h4, h4    \
	VMALOF f3, g52, T_0, T_0 \
	VMALOF f3, g53, T_1, T_1 \
	VMALOF f3, g54, T_2, T_2 \
	VMALOF f3, g0, T_3, T_3  \
	VMALOF f3, g1, T_4, T_4  \
	VMALOF f4, g51, h0, h0   \
	VMALOF f4, g52, h1, h1   \
	VMALOF f4, g53, h2, h2   \
	VMALOF f4, g54, h3, h3   \
	VMALOF f4, g0, h4, h4    \
	VAG    T_0, h0, h0       \
	VAG    T_1, h1, h1       \
	VAG    T_2, h2, h2       \
	VAG    T_3, h3, h3       \
	VAG    T_4, h4, h4

// REDUCE propagates carries between the 26-bit limbs of h, in this order:
// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
//
// Each carry is the part of a limb above bit 25 (limb >> 26); the limb itself
// is then masked with MOD26. The carry out of h4 is multiplied by 5
// ((c << 2) + c) before being added to h0.
//
// Requires MOD26 to be loaded. Clobbers T_0..T_4.
#define REDUCE(h0, h1, h2, h3, h4) \
	VESRLG $26, h0, T_0  \
	VESRLG $26, h3, T_1  \
	VN     MOD26, h0, h0 \
	VN     MOD26, h3, h3 \
	VAG    T_0, h1, h1   \
	VAG    T_1, h4, h4   \
	VESRLG $26, h1, T_2  \
	VESRLG $26, h4, T_3  \
	VN     MOD26, h1, h1 \
	VN     MOD26, h4, h4 \
	VESLG  $2, T_3, T_4  \
	VAG    T_3, T_4, T_4 \
	VAG    T_2, h2, h2   \
	VAG    T_4, h0, h0   \
	VESRLG $26, h2, T_0  \
	VESRLG $26, h0, T_1  \
	VN     MOD26, h2, h2 \
	VN     MOD26, h0, h0 \
	VAG    T_0, h3, h3   \
	VAG    T_1, h1, h1   \
	VESRLG $26, h3, T_2  \
	VN     MOD26, h3, h3 \
	VAG    T_2, h4, h4

// EXPAND splits two 16-byte inputs into limbs d0..d4:
// expand in0 into d[0] and in1 into d[1]
//
// d0..d3 are masked with MOD26 (26 bits each). d4 is masked with the byte
// mask generated by VGBM $0x0707, which is held in d1 until d1 receives its
// own value. Callers set any additional high bit in d4 themselves.
//
// Requires MOD26, EX0, EX1 and EX2 to be loaded.
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
	VGBM   $0x0707, d1       \ // d1=tmp
	VPERM  in0, in1, EX2, d4 \
	VPERM  in0, in1, EX0, d0 \
	VPERM  in0, in1, EX1, d2 \
	VN     d1, d4, d4        \
	VESRLG $26, d0, d1       \
	VESRLG $30, d2, d3       \
	VESRLG $4, d2, d2        \
	VN     MOD26, d0, d0     \
	VN     MOD26, d1, d1     \
	VN     MOD26, d2, d2     \
	VN     MOD26, d3, d3

// PACK combines the five limbs h4:h0 into two registers:
// pack h4:h0 into h1:h0 (no carry)
//
// The limbs are shifted into place and ORed together, so no carries are
// propagated; the caller must have reduced every limb first.
// On exit h0 holds the low part and h1 the high part (h4 shifted right by 24
// bits). h2 and h3 are used as scratch and are overwritten; h1 also serves as
// the shift-count operand for VSLB/VSRLB before it receives its final value.
#define PACK(h0, h1, h2, h3, h4) \
	VESLG $26, h1, h1  \
	VESLG $26, h3, h3  \
	VO    h0, h1, h0   \
	VO    h2, h3, h2   \
	VESLG $4, h2, h2   \
	VLEIB $7, $48, h1  \
	VSLB  h1, h2, h2   \
	VO    h0, h2, h0   \
	VLEIB $7, $104, h1 \
	VSLB  h1, h4, h3   \
	VO    h3, h0, h0   \
	VLEIB $7, $24, h1  \
	VSRLB h1, h4, h1

// MOD performs the final conditional subtraction:
// if h >= 2**130-5 then h -= 2**130-5
//
// h is given as h1 (high part) and h0 (low 128 bits). The macro computes
// h0+5 and the carry that addition sends into h1, then tests whether
// h1 plus that carry reaches 4 (i.e. h+5 >= 2**130). The outcome is turned
// into an all-ones/all-zeros mask in t1 that selects either the original h0
// or h0+5. Only the low 128 bits are produced, in h0.
//
// Clobbers t0, t1 and t2; h1 is read but not modified.
#define MOD(h0, h1, t0, t1, t2) \
	VZERO t0          \
	VLEIG $1, $5, t0  \
	VACCQ h0, t0, t1  \
	VAQ   h0, t0, t0  \
	VONE  t2          \
	VLEIG $1, $-4, t2 \
	VAQ   t2, t1, t1  \
	VACCQ h1, t1, t1  \
	VONE  t2          \
	VAQ   t2, t1, t1  \
	VN    h0, t1, t2  \
	VNC   t0, t1, t1  \
	VO    t1, t2, h0

// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
//
// Computes a Poly1305 authenticator over the mlen bytes at m and stores the
// 16-byte result at out. The first 16 bytes of key, ANDed with keyMask, form
// r; the second 16 bytes (s) are added to the result before it is stored.
//
// Register use:
//   R1 = out, R2 = m (advanced as input is consumed), R3 = bytes remaining,
//   R4 = key
//   R5, R6      address temporaries during setup, then part of RSAVE_0..4
//   R5..R9      RSAVE_0..4, the limbs of r kept for the final block(s)
//   R0          holds the byte value 1 appended to a partial block
//   V0..V31     see the register aliases at the top of the file
TEXT ·poly1305vx(SB), $0-32
	// This code processes up to 2 blocks (32 bytes) per iteration
	// using the algorithm described in:
	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key

	// load MOD26, EX0, EX1 and EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), MOD26, EX2

	// setup r: mask the first half of the key and split it into limbs
	// (the same input is used for both lanes)
	VL   (R4), T_0
	MOVD $·keyMask<>(SB), R6
	VL   (R6), T_1
	VN   T_0, T_1, T_0
	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)

	// setup r*5: T_0 = [5, 5]
	VLEIG $0, $5, T_0
	VLEIG $1, $5, T_0

	// store r*5 and r (for final block)
	VMLOF T_0, R_1, R5SAVE_1
	VMLOF T_0, R_2, R5SAVE_2
	VMLOF T_0, R_3, R5SAVE_3
	VMLOF T_0, R_4, R5SAVE_4
	VLGVG $0, R_0, RSAVE_0
	VLGVG $0, R_1, RSAVE_1
	VLGVG $0, R_2, RSAVE_2
	VLGVG $0, R_3, RSAVE_3
	VLGVG $0, R_4, RSAVE_4

	// skip r**2 calculation (at most one block of input)
	CMPBLE R3, $16, skip

	// calculate r**2
	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
	REDUCE(H_0, H_1, H_2, H_3, H_4)

	// R5_* = 5 * r**2, R_* = r**2
	VLEIG $0, $5, T_0
	VLEIG $1, $5, T_0
	VMLOF T_0, H_1, R5_1
	VMLOF T_0, H_2, R5_2
	VMLOF T_0, H_3, R5_3
	VMLOF T_0, H_4, R5_4
	VLR   H_0, R_0
	VLR   H_1, R_1
	VLR   H_2, R_2
	VLR   H_3, R_3
	VLR   H_4, R_4

	// initialize h
	VZERO H_0
	VZERO H_1
	VZERO H_2
	VZERO H_3
	VZERO H_4

loop:
	// 32 bytes or fewer remaining: handle them as the final block(s)
	CMPBLE R3, $32, b2

	// load two full blocks, one per lane
	VLM    (R2), T_0, T_1
	SUB    $32, R3
	MOVD   $32(R2), R2
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)

	// set the bit above each full 16-byte block (in both lanes)
	VLEIB  $4, $1, F_4
	VLEIB  $12, $1, F_4

multiply:
	// h = (h + f) * r, using whatever multipliers are currently in R_*/R5_*
	VAG H_0, F_0, F_0
	VAG H_1, F_1, F_1
	VAG H_2, F_2, F_2
	VAG H_3, F_3, F_3
	VAG H_4, F_4, F_4
	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
	REDUCE(H_0, H_1, H_2, H_3, H_4)
	CMPBNE R3, $0, loop

finish:
	// sum vectors: add the two lanes of each limb together
	VZERO  T_0
	VSUMQG H_0, T_0, H_0
	VSUMQG H_1, T_0, H_1
	VSUMQG H_2, T_0, H_2
	VSUMQG H_3, T_0, H_3
	VSUMQG H_4, T_0, H_4

	// h may be >= 2*(2**130-5) so we need to reduce it again
	REDUCE(H_0, H_1, H_2, H_3, H_4)

	// carry h1->h4
	VESRLG $26, H_1, T_1
	VN     MOD26, H_1, H_1
	VAQ    T_1, H_2, H_2
	VESRLG $26, H_2, T_2
	VN     MOD26, H_2, H_2
	VAQ    T_2, H_3, H_3
	VESRLG $26, H_3, T_3
	VN     MOD26, H_3, H_3
	VAQ    T_3, H_4, H_4

	// h is now < 2*(2**130-5)
	// pack h into h1 (hi) and h0 (lo)
	PACK(H_0, H_1, H_2, H_3, H_4)

	// if h >= 2**130-5 then h -= 2**130-5
	MOD(H_0, H_1, T_0, T_1, T_2)

	// h += s (s is the second half of the key)
	MOVD  $·bswapMask<>(SB), R5
	VL    (R5), T_1
	VL    16(R4), T_0
	VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
	VAQ   T_0, H_0, H_0
	VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
	VST   H_0, (R1)

	RET

b2:
	CMPBLE R3, $16, b1

	// 2 blocks remaining: a full first block and 1..16 bytes of second block
	// VLL is given R3 = (length of second block) - 1
	SUB    $17, R3
	VL     (R2), T_0
	VLL    R3, 16(R2), T_1
	ADD    $1, R3
	MOVBZ  $1, R0

	// partial second block: insert the byte 1 directly after the data
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, T_1
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)

	// full second block: set the bit above it in lane 1 instead
	CMPBNE R3, $16, 2(PC)
	VLEIB  $12, $1, F_4
	VLEIB  $4, $1, F_4

	// setup [r²,r]
	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, RSAVE_3, R_3
	VLVGG $1, RSAVE_4, R_4
	VPDI  $0, R5_1, R5SAVE_1, R5_1
	VPDI  $0, R5_2, R5SAVE_2, R5_2
	VPDI  $0, R5_3, R5SAVE_3, R5_3
	VPDI  $0, R5_4, R5SAVE_4, R5_4

	// no input left: multiply falls through to finish
	MOVD $0, R3
	BR   multiply

skip:
	// at most one block of input: h starts at zero, r**2 is not needed
	VZERO H_0
	VZERO H_1
	VZERO H_2
	VZERO H_3
	VZERO H_4

	CMPBEQ R3, $0, finish

b1:
	// 1 block remaining (1..16 bytes)
	// VLL is given R3 = (length of block) - 1
	SUB    $1, R3
	VLL    R3, (R2), T_0
	ADD    $1, R3
	MOVBZ  $1, R0

	// partial block: insert the byte 1 directly after the data
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, T_0
	VZERO  T_1
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)

	// full block: set the bit above it in lane 0 instead
	CMPBNE R3, $16, 2(PC)
	VLEIB  $4, $1, F_4

	// lane 1 of the multiplier is the constant 1 (limbs 1, 0, 0, 0, 0)
	VLEIG  $1, $1, R_0
	VZERO  R_1
	VZERO  R_2
	VZERO  R_3
	VZERO  R_4
	VZERO  R5_1
	VZERO  R5_2
	VZERO  R5_3
	VZERO  R5_4

	// setup [r, 1]
	VLVGG $0, RSAVE_0, R_0
	VLVGG $0, RSAVE_1, R_1
	VLVGG $0, RSAVE_2, R_2
	VLVGG $0, RSAVE_3, R_3
	VLVGG $0, RSAVE_4, R_4
	VPDI  $0, R5SAVE_1, R5_1, R5_1
	VPDI  $0, R5SAVE_2, R5_2, R5_2
	VPDI  $0, R5SAVE_3, R5_3, R5_3
	VPDI  $0, R5SAVE_4, R5_4, R5_4

	// no input left: multiply falls through to finish
	MOVD $0, R3
	BR   multiply

// hasVectorFacility stores a one-byte result at ret+0(FP): 1 if the vector
// facility is reported and a vector instruction produced the expected value,
// 0 otherwise.
//
// A 24-byte stack buffer (three doublewords) is cleared and filled by STFLE.
// The byte tested is the first byte of the third doubleword (offset 16 in the
// buffer); its 0x40 bit corresponds to facility bit 129.
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD  $x-24(SP), R1
	XC    $24, 0(R1), 0(R1) // clear the storage
	MOVD  $2, R0            // R0 is the number of double words stored -1
	WORD  $0xB2B01000       // STFLE 0(R1)
	XOR   R0, R0            // reset the value of R0
	MOVBZ z-8(SP), R1
	AND   $0x40, R1
	BEQ   novector

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, novector
	MOVB   $1, ret+0(FP) // have vx
	RET

novector:
	MOVB $0, ret+0(FP) // no vx
	RET