sha3: optimizations and cleanup

* complate reset of the SHA-3 code. Affects mostly the code in sha3.go * fixes a bug which causes SHAKE implementation to crash * implementation of Read()/Write() avoid unnecessary buffering as much as possible * NOTE: at some point I've done separated implementation for SumXXX, functions, but after optimizing implementation of Read/Write/Sum, the gain wasn't that big Current speed on Initial speed on i7-8665U@1.90 BenchmarkPermutationFunction 1592787 736 ns/op 271.90 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x01/SHA-3/224 98752 11630 ns/op 176.02 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x01/SHA-3/256 92508 12447 ns/op 164.46 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x01/SHA-3/384 76765 15206 ns/op 134.62 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x01/SHA-3/512 54333 21932 ns/op 93.33 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x16/SHA-3/224 10000 102161 ns/op 160.37 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x16/SHA-3/256 10000 106531 ns/op 153.80 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x16/SHA-3/384 8641 137272 ns/op 119.35 MB/s 0 B/op 0 allocs/op BenchmarkSha3Chunk_x16/SHA-3/512 6340 189124 ns/op 86.63 MB/s 0 B/op 0 allocs/op BenchmarkShake_x01/SHAKE-128 167062 7149 ns/op 188.83 MB/s 0 B/op 0 allocs/op BenchmarkShake_x01/SHAKE-256 151982 7748 ns/op 174.24 MB/s 0 B/op 0 allocs/op BenchmarkShake_x16/SHAKE-128 12963 87770 ns/op 186.67 MB/s 0 B/op 0 allocs/op BenchmarkShake_x16/SHAKE-256 10000 105554 ns/op 155.22 MB/s 0 B/op 0 allocs/op BenchmarkCShake/cSHAKE-128 109148 10940 ns/op 187.11 MB/s 0 B/op 0 allocs/op BenchmarkCShake/cSHAKE-256 90324 13211 ns/op 154.94 MB/s 0 B/op 0 allocs/op PASS
2024-11-22 23:28:57 +00:00 · 2020-08-26 11:19:52 +01:00 · 2020-08-26 11:19:52 +01:00 · 45139582f3
commit 45139582f3
parent 7dcb72bf74
7 changed files with 343 additions and 310 deletions
--- a/57
+++ b/57
@ -1,13 +1,52 @@
-           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+Copyright (c) 2020 Kris Kwiatkowski, All rights reserved.
                   Version 2, December 2004
-Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
-Everyone is permitted to copy and distribute verbatim or modified
+1. Redistributions of source code must retain the above copyright notice, this
-copies of this license document, and changing it is allowed as long
+   list of conditions and the following disclaimer.
 as the name is changed.
-           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+2. Redistributions in binary form must reproduce the above copyright notice,
-  TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
- 0. You just DO WHAT THE FUCK YOU WANT TO.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ====
 Copyright (c) 2009 The Go Authors. All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
   * Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
   * Neither the name of Google Inc. nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/hash/sha3/doc.go
+++ b/hash/sha3/doc.go
@ -1,66 +1,20 @@
 // Copyright 2020 Kris Kwiatkowski. All rights reserved.
 // Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// Package sha3 implements the SHA-3 fixed-output-length hash functions and
+// Package sha3 implements the Keccak-p[1600, 24] permutation.
-// the SHAKE variable-output-length hash functions defined by FIPS-202.
+// The 1600 stands for width of the permutation - number of
 // bits that are permuted at a time, and 24 stands for number
 // of rounds (iterations) of the permutation.
 // Package implements derivatives of the Keccak permutation,
 // like SHA-3 fixed-output-length hash, SHAKE which is an
 // extendable-output-functions (XOF) and cSHAKE - a XOF with
 // domain separation.
 //
-// Both types of hash function use the "sponge" construction and the Keccak
+// The SHA-3 and SHAKE are documented in FIPS-PUB-202 [1] and
-// permutation. For a detailed specification see http://keccak.noekeon.org/
+// cSHAKE specification can be found in NIST-SP-800-185 [2].
 //
-//
+// Implementation was initially based on
-// Guidance
+// https://godoc.org/golang.org/x/crypto/sha3
 //
 // If you aren't sure what function you need, use SHAKE256 with at least 64
 // bytes of output. The SHAKE instances are faster than the SHA3 instances;
 // the latter have to allocate memory to conform to the hash.Hash interface.
 //
 // If you need a secret-key MAC (message authentication code), prepend the
 // secret key to the input, hash with SHAKE256 and read at least 32 bytes of
 // output.
 //
 //
 // Security strengths
 //
 // The SHA3-x (x equals 224, 256, 384, or 512) functions have a security
 // strength against preimage attacks of x bits. Since they only produce "x"
 // bits of output, their collision-resistance is only "x/2" bits.
 //
 // The SHAKE-256 and -128 functions have a generic security strength of 256 and
 // 128 bits against all attacks, provided that at least 2x bits of their output
 // is used.  Requesting more than 64 or 32 bytes of output, respectively, does
 // not increase the collision-resistance of the SHAKE functions.
 //
 //
 // The sponge construction
 //
 // A sponge builds a pseudo-random function from a public pseudo-random
 // permutation, by applying the permutation to a state of "rate + capacity"
 // bytes, but hiding "capacity" of the bytes.
 //
 // A sponge starts out with a zero state. To hash an input using a sponge, up
 // to "rate" bytes of the input are XORed into the sponge's state. The sponge
 // is then "full" and the permutation is applied to "empty" it. This process is
 // repeated until all the input has been "absorbed". The input is then padded.
 // The digest is "squeezed" from the sponge in the same way, except that output
 // output is copied out instead of input being XORed in.
 //
 // A sponge is parameterized by its generic security strength, which is equal
 // to half its capacity; capacity + rate is equal to the permutation's width.
 // Since the KeccakF-1600 permutation is 1600 bits (200 bytes) wide, this means
 // that the security strength of a sponge instance is equal to (1600 - bitrate) / 2.
 //
 //
 // Recommendations
 //
 // The SHAKE functions are recommended for most new uses. They can produce
 // output of arbitrary length. SHAKE256, with an output length of at least
 // 64 bytes, provides 256-bit security against all attacks.  The Keccak team
 // recommends it for most applications upgrading from SHA2-512. (NIST chose a
 // much stronger, but much slower, sponge instance for SHA3-512.)
 //
 // The SHA-3 functions are "drop-in" replacements for the SHA-2 functions.
 // They produce output of the same length, with the same security strengths
 // against all attacks. This means, in particular, that SHA3-256 only has
 // 128-bit collision resistance, because its output length is 32 bytes.
 package sha3 // import "github.com/henrydcase/nobs/hash/sha3"
--- a/hash/sha3/sha3.go
+++ b/hash/sha3/sha3.go
@ -1,256 +1,240 @@
-// Copyright 2014 The Go Authors. All rights reserved.
+// Copyright 2020 Kris Kwiatkowski. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package sha3
-import "hash"
+import (
 	"errors"
 	"hash"
 )
-// spongeDirection indicates the direction bytes are flowing through the sponge.
+type spongeDesc struct {
-type spongeDirection int
+	r    int    // rate
 	d    int    // output size of SHA-3
 	name string // human readable name of the scheme
 }
 // Id's of SHA3 instantiations
 const (
-	// spongeAbsorbing indicates that the sponge is absorbing input.
+	SHA3_224 uint8 = iota
-	spongeAbsorbing spongeDirection = iota
+	SHA3_256
-	// spongeSqueezing indicates that the sponge is being squeezed.
+	SHA3_384
-	spongeSqueezing
+	SHA3_512
 	SHAKE128
 	SHAKE256
 )
 const (
-	// maxRate is the maximum size of the internal buffer. SHAKE-256
+	// maximum value for rate used by keccak functions
 	// currently needs the largest buffer.
 	maxRate = 168
 )
 // ErrWriteAfterRead is returned by Write once Read has switched the sponge to squeezing.
 var ErrWriteAfterRead = errors.New("sha3: can't write after read")
 // Sha3Desc maps the identifier of each supported instantiation to its sponge
 // parameters: r is the rate in bytes (the value reported by BlockSize), d is
 // the digest size in bytes (the value reported by Size; 0 for the SHAKE
 // entries), and name is a human readable label.
 var Sha3Desc = map[uint8]spongeDesc{
 	SHA3_224: {r: 144, d: 224 / 8, name: "SHA3-224"},
 	SHA3_256: {r: 136, d: 256 / 8, name: "SHA3-256"},
 	SHA3_384: {r: 104, d: 384 / 8, name: "SHA3-384"},
 	SHA3_512: {r: 72, d: 512 / 8, name: "SHA3-512"},
 	SHAKE128: {r: 168, d: 0, name: "SHAKE-128"},
 	SHAKE256: {r: 136, d: 0, name: "SHAKE-256"},
 }
 type state struct {
-	// Generic sponge components.
+	// Structure describing the details of hash algorithm
-	a    [25]uint64 // main state of the hash
+	desc spongeDesc
-	buf  []byte     // points into storage
+	// permutation state. 25*64 is a width of the keccak permutation used
-	rate int        // the number of bytes of state to use
+	a [25]uint64
 	// sfx is a concatenation of "domain separator" as described in FIPS-202,
 	// (section 6.1 and 6.2) with first bit of a pad10*1 (see section 5.1).
 	sfx byte
 	// Temporary data buffer
 	data storageBuf
 	// Index in the buffer. It points to the next available position
 	// in the data buffer if isSquezing is false. In case it is true
 	// it indicates amount of unconsumed data.
 	idx int
 	// Indicates state of the sponge function. Whether it is absorbing
 	// or squeezing
 	isSquezing bool
 }
-	// dsbyte contains the "domain separation" bits and the first bit of
+func min(a, b int) int {
-	// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
+	if a < b {
-	// SHA-3 and SHAKE functions by appending bitstrings to the message.
+		return a
-	// Using a little-endian bit-ordering convention, these are "01" for SHA-3
+	}
-	// and "1111" for SHAKE, or 00000010b and 00001111b, respectively. Then the
+	return b
 	// padding rule from section 5.1 is applied to pad the message to a multiple
 	// of the rate, which involves adding a "1" bit, zero or more "0" bits, and
 	// a final "1" bit. We merge the first "1" bit from the padding into dsbyte,
 	// giving 00000110b (0x06) and 00011111b (0x1f).
 	// [1] http://csrc.nist.gov/publications/drafts/fips-202/fips_202_draft.pdf
 	//     "Draft FIPS 202: SHA-3 Standard: Permutation-Based Hash and
 	//      Extendable-Output Functions (May 2014)"
 	dsbyte byte
 	storage storageBuf
 	// Specific to SHA-3 and SHAKE.
 	outputLen int             // the default output size in bytes
 	state     spongeDirection // whether the sponge is absorbing or squeezing
 }
 // BlockSize returns block size in bytes. Corresponds to the input
 // block size B of the HMAC
-func (d *state) BlockSize() int { return d.rate }
+func (d *state) BlockSize() int { return d.desc.r }
 // Size returns the output size of the hash function in bytes.
-func (d *state) Size() int { return d.outputLen }
+func (d *state) Size() int { return d.desc.d }
 // Reset clears the internal state by zeroing the sponge state and
-// the byte buffer, and setting Sponge.state to absorbing.
+// the byte buffer, and setting spongeState to absorbing.
 func (d *state) Reset() {
 	// Zero the permutation's state.
 	for i := range d.a {
 		d.a[i] = 0
 	}
-	d.state = spongeAbsorbing
+	for i := range d.data {
-	d.buf = d.storage.asBytes()[:0]
+		d.data[i] = 0
 	}
 	d.isSquezing = false
 	d.idx = 0
 }
-func (d *state) clone() *state {
+// Write consumes data from the user. The data may change state of the
-	ret := *d
+// hash in case caller provided at least "rate" bytes of data. The "rate" value
-	if ret.state == spongeAbsorbing {
+// for the hash is returned by the BlockSize() function. It may return an
-		ret.buf = ret.storage.asBytes()[:len(ret.buf)]
+// error if sponge state has changed to "squeezing", meaning - Write was
-	} else {
+// called after at least one call to Read() has been done.
-		ret.buf = ret.storage.asBytes()[d.rate-cap(d.buf) : d.rate]
+func (c *state) Write(in []byte) (nwrite int, err error) {
 	if c.isSquezing {
 		return 0, ErrWriteAfterRead
 	}
 	nwrite = len(in)
 	rate := c.BlockSize()
 	buf := c.data.asBytes()
 	processLen := c.idx + len(in)
 	if processLen < c.BlockSize() {
 		// not enough data to process
 		copy(buf[c.idx:], in)
 		c.idx = processLen
 		return nwrite, nil
 	}
-	return &ret
+	// process first block
 	fbLen := rate - c.idx
 	copy(buf[c.idx:], in[:fbLen])
 	xorIn(c, buf[:])
 	keccakF1600(&c.a)
 	// process remaining blocks
 	in = in[fbLen:]
 	for len(in) >= rate {
 		xorIn(c, in[:rate])
 		keccakF1600(&c.a)
 		in = in[rate:]
 	}
 	// store unprocessed data
 	copy(buf[:], in)
 	c.idx = len(in)
 	return nwrite, nil
 }
-// permute applies the KeccakF-1600 permutation. It handles
+// Read changes state of the hash if called first time. It will
-// any input-output buffering.
+// return len(out) bytes of data. Never fails.
-func (d *state) permute() {
+func (c *state) Read(out []byte) (nread int, err error) {
-	switch d.state {
+	buf := c.data.asBytes()[:]
-	case spongeAbsorbing:
+	rate := c.BlockSize()
-		// If we're absorbing, we need to xor the input into the state
+	nread = len(out)
 		// before applying the permutation.
 		xorIn(d, d.buf)
 		d.buf = d.storage.asBytes()[:0]
 		keccakF1600(&d.a)
 	case spongeSqueezing:
 		// If we're squeezing, we need to apply the permutatin before
 		// copying more output.
 		keccakF1600(&d.a)
 		d.buf = d.storage.asBytes()[:d.rate]
 		copyOut(d, d.buf)
 	}
 }
-// pads appends the domain separation bits in dsbyte, applies
+	if !c.isSquezing {
-// the multi-bitrate 10..1 padding rule, and permutes the state.
+		// there is at least one byte free, otherwise
-func (d *state) padAndPermute(dsbyte byte) {
+		// buf would be squeezed already
-	if d.buf == nil {
+		for i := c.idx + 1; i < rate; i++ {
-		d.buf = d.storage.asBytes()[:0]
+			buf[i] = 0
 	}
 	// Pad with this instance's domain-separator bits. We know that there's
 	// at least one byte of space in d.buf because, if it were full,
 	// permute would have been called to empty it. dsbyte also contains the
 	// first one bit for the padding. See the comment in the state struct.
 	d.buf = append(d.buf, dsbyte)
 	zerosStart := len(d.buf)
 	d.buf = d.storage.asBytes()[:d.rate]
 	for i := zerosStart; i < d.rate; i++ {
 		d.buf[i] = 0
 	}
 	// This adds the final one bit for the padding. Because of the way that
 	// bits are numbered from the LSB upwards, the final bit is the MSB of
 	// the last byte.
 	d.buf[d.rate-1] ^= 0x80
 	// Apply the permutation
 	d.permute()
 	d.state = spongeSqueezing
 	d.buf = d.storage.asBytes()[:d.rate]
 	copyOut(d, d.buf)
 }
 // Write absorbs more data into the hash's state. It produces an error
 // if more data is written to the ShakeHash after writing
 func (d *state) Write(p []byte) (written int, err error) {
 	if d.state != spongeAbsorbing {
 		panic("sha3: write to sponge after read")
 	}
 	if d.buf == nil {
 		d.buf = d.storage.asBytes()[:0]
 	}
 	written = len(p)
 	for len(p) > 0 {
 		if len(d.buf) == 0 && len(p) >= d.rate {
 			// The fast path; absorb a full "rate" bytes of input and apply the permutation.
 			xorIn(d, p[:d.rate])
 			p = p[d.rate:]
 			keccakF1600(&d.a)
 		} else {
 			// The slow path; buffer the input until we can fill the sponge, and then xor it in.
 			todo := d.rate - len(d.buf)
 			if todo > len(p) {
 				todo = len(p)
 			}
 			d.buf = append(d.buf, p[:todo]...)
 			p = p[todo:]
 			// If the sponge is full, apply the permutation.
 			if len(d.buf) == d.rate {
 				d.permute()
 			}
 		}
 		buf[c.idx] = c.sfx
 		buf[rate-1] |= 0x80
 		xorIn(c, buf[:rate])
 		keccakF1600(&c.a)
 		copyOut(c, buf[:rate])
 		c.idx = rate // now, idx indicates unconsumed amount of data
 		c.isSquezing = true
 	}
-	return
+	// Copy-out bytes that are still kept in the buffer
-}
+	if c.idx != 0 {
-
+		l := min(c.idx, len(out))
-// Read squeezes an arbitrary number of bytes from the sponge.
+		idx := rate - c.idx
-func (d *state) Read(out []byte) (n int, err error) {
+		copy(out, buf[idx:idx+l])
-	// If we're still absorbing, pad and apply the permutation.
+		out = out[l:]
-	if d.state == spongeAbsorbing {
+		c.idx -= l
 		d.padAndPermute(d.dsbyte)
 	}
-	n = len(out)
+	l := len(out)
-
+	if l == 0 {
-	// Now, do the squeezing.
+		// nothing else todo
-	for len(out) > 0 {
+		return nread, nil
 		n := copy(out, d.buf)
 		d.buf = d.buf[n:]
 		out = out[n:]
 		// Apply the permutation if we've squeezed the sponge dry.
 		if len(d.buf) == 0 {
 			d.permute()
 		}
 	}
-	return
+	// copy out full blocks and squeeze. at this point
 	// there is no more data in the buffer.
 	nblocks := l / rate
 	for i := 0; i < nblocks; i++ {
 		keccakF1600(&c.a)
 		copyOut(c, out[:rate])
 		out = out[rate:]
 	}
 	// produce more if needed
 	l = len(out)
 	if l == 0 {
 		return nread, nil
 	}
 	keccakF1600(&c.a)
 	copyOut(c, buf)
 	copy(out, buf[:l])
 	c.idx = rate - l
 	return nread, nil
 }
 // Sum applies padding to the hash state and then squeezes out the desired
 // number of output bytes.
-func (d *state) Sum(in []byte) []byte {
+func (c *state) Sum(in []byte) []byte {
-	// Make a copy of the original hash so that caller can keep writing
+	l := len(in)
-	// and summing.
+	// create buffer if nil has been provided
-	dup := d.clone()
+	if in == nil {
-	hash := make([]byte, dup.outputLen)
+		in = make([]byte, c.Size())
-	dup.Read(hash)
+	}
-	return append(in, hash...)
+
 	// enlarge capacity of the buffer if needed
 	if cap(in) < (l + c.Size()) {
 		b := make([]byte, l+c.Size()-cap(in))
 		in = append(in[:cap(in)], b...)
 	}
 	in = in[:l+c.Size()]
 	c.Read(in[l:])
 	return in
 }
 // New224 creates a new SHA3-224 hash.
 // Its generic security strength is 224 bits against preimage attacks,
 // and 112 bits against collision attacks.
 func New224() hash.Hash {
-	return &state{rate: 144, outputLen: 28, dsbyte: 0x06}
+	return &state{sfx: 0x06, desc: Sha3Desc[SHA3_224]}
 }
 // New256 creates a new SHA3-256 hash.
 // Its generic security strength is 256 bits against preimage attacks,
 // and 128 bits against collision attacks.
 func New256() hash.Hash {
-	return &state{rate: 136, outputLen: 32, dsbyte: 0x06}
+	return &state{sfx: 0x06, desc: Sha3Desc[SHA3_256]}
 }
 // New384 creates a new SHA3-384 hash.
 // Its generic security strength is 384 bits against preimage attacks,
 // and 192 bits against collision attacks.
 func New384() hash.Hash {
-	return &state{rate: 104, outputLen: 48, dsbyte: 0x06}
+	return &state{sfx: 0x06, desc: Sha3Desc[SHA3_384]}
 }
 // New512 creates a new SHA3-512 hash.
 // Its generic security strength is 512 bits against preimage attacks,
 // and 256 bits against collision attacks.
 func New512() hash.Hash {
-	return &state{rate: 72, outputLen: 64, dsbyte: 0x06}
+	return &state{sfx: 0x06, desc: Sha3Desc[SHA3_512]}
 }
 // Sum224 returns the SHA3-224 digest of the data.
 func Sum224(data []byte) (digest [28]byte) {
 	h := New224()
 	h.Write(data)
 	h.Sum(digest[:0])
 	return
 }
 // Sum256 returns the SHA3-256 digest of the data.
 func Sum256(data []byte) (digest [32]byte) {
 	h := New256()
 	h.Write(data)
 	h.Sum(digest[:0])
 	return
 }
 // Sum384 returns the SHA3-384 digest of the data.
 func Sum384(data []byte) (digest [48]byte) {
 	h := New384()
 	h.Write(data)
 	h.Sum(digest[:0])
 	return
 }
 // Sum512 returns the SHA3-512 digest of the data.
 func Sum512(data []byte) (digest [64]byte) {
 	h := New512()
 	h.Write(data)
 	h.Sum(digest[:0])
 	return
 }
--- a/hash/sha3/sha3_test.go
+++ b/hash/sha3/sha3_test.go
@ -113,7 +113,7 @@ func TestKeccakKats(t *testing.T) {
 				d.Write(in[:kat.Length/8])
 				got := strings.ToUpper(hex.EncodeToString(d.Sum(nil)))
 				if got != kat.Digest {
-					t.Errorf("function=%s, implementation=%s, length=%d\nmessage:\n %s\ngot:\n  %s\nwanted:\n %s",
+					t.Errorf("function=%s, implementation=%s, length=%d\nmessage:\n %s\ngot:\n %s\nwanted:\n %s",
 						algo, impl, kat.Length, kat.Message, got, kat.Digest)
 					t.Logf("wanted %+v", kat)
 					t.FailNow()
@ -121,7 +121,6 @@ func TestKeccakKats(t *testing.T) {
 				continue
 			}
 		}
 		for algo, v := range testShakes {
 			for _, kat := range katSet.Kats[algo] {
 				N, err := hex.DecodeString(kat.N)
@ -159,7 +158,7 @@ func TestKeccakKats(t *testing.T) {
 // small input buffers.
 func TestUnalignedWrite(t *testing.T) {
 	testUnalignedAndGeneric(t, func(impl string) {
-		buf := sequentialBytes(0x10000)
+		buf := generateData(0x10000)
 		for alg, df := range testDigests {
 			d := df()
 			d.Reset()
@ -273,8 +272,16 @@ func TestSqueezing(t *testing.T) {
 	})
 }
-// sequentialBytes produces a buffer of size consecutive bytes 0x00, 0x01, ..., used for testing.
+func doSum(h hash.Hash, data []byte) (digest []byte) {
-func sequentialBytes(size int) []byte {
+	half := int(len(data) / 2)
 	h.Write(data[:half])
 	h.Write(data[half:])
 	digest = h.Sum(data[:0])
 	return
 }
 // generateData produces a buffer of size consecutive bytes 0x00, 0x01, ..., used for testing.
 func generateData(size int) []byte {
 	result := make([]byte, size)
 	for i := range result {
 		result[i] = byte(i)
@ -289,12 +296,12 @@ func TestReset(t *testing.T) {
 	for _, v := range testShakes {
 		// Calculate hash for the first time
 		c := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr))
-		c.Write(sequentialBytes(0x100))
+		c.Write(generateData(0x100))
 		c.Read(out1)
 		// Calculate hash again
 		c.Reset()
-		c.Write(sequentialBytes(0x100))
+		c.Write(generateData(0x100))
 		c.Read(out2)
 		if !bytes.Equal(out1, out2) {
@ -306,7 +313,7 @@ func TestReset(t *testing.T) {
 func TestClone(t *testing.T) {
 	out1 := make([]byte, 16)
 	out2 := make([]byte, 16)
-	in := sequentialBytes(0x100)
+	in := generateData(0x100)
 	for _, v := range testShakes {
 		h1 := v.constructor([]byte(v.defAlgoName), []byte(v.defCustomStr))
@ -337,19 +344,22 @@ func BenchmarkPermutationFunction(b *testing.B) {
 }
 // benchmarkHashChunked tests the speed to hash num buffers of size bytes each.
-func benchmarkHash(b *testing.B, h hash.Hash, size, num int) {
+// This function uses heap
 func benchmarkHashChunked(b *testing.B, h hash.Hash, size, num int) {
 	b.StopTimer()
-	h.Reset()
+	data := generateData(size)
-	data := sequentialBytes(size)
+	digestBuf := make([]byte, h.Size())
 	b.SetBytes(int64(size * num))
 	b.StartTimer()
 	var state []byte
 	for i := 0; i < b.N; i++ {
 		h.Reset()
 		for j := 0; j < num; j++ {
 			h.Write(data)
 		}
-		state = h.Sum(state[:0])
+		digestBuf = h.Sum(digestBuf[:])
 		// needed to avoid allocations
 		digestBuf = digestBuf[:0]
 	}
 	b.StopTimer()
 	h.Reset()
@ -359,9 +369,8 @@ func benchmarkHash(b *testing.B, h hash.Hash, size, num int) {
 // require a copy on reading output.
 func benchmarkShake(b *testing.B, h ShakeHash, size, num int) {
 	b.StopTimer()
-	h.Reset()
+	out := make([]byte, 32)
-	data := sequentialBytes(size)
+	data := generateData(size)
 	d := make([]byte, 32)
 	b.SetBytes(int64(size * num))
 	b.StartTimer()
@ -371,21 +380,61 @@ func benchmarkShake(b *testing.B, h ShakeHash, size, num int) {
 		for j := 0; j < num; j++ {
 			h.Write(data)
 		}
-		h.Read(d)
+		h.Read(out[:])
 	}
 }
-func BenchmarkSha3_512_MTU(b *testing.B) { benchmarkHash(b, New512(), 1350, 1) }
+var domainString = []byte("SHAKE")
-func BenchmarkSha3_384_MTU(b *testing.B) { benchmarkHash(b, New384(), 1350, 1) }
+var customString = []byte("CustomString")
 func BenchmarkSha3_256_MTU(b *testing.B) { benchmarkHash(b, New256(), 1350, 1) }
 func BenchmarkSha3_224_MTU(b *testing.B) { benchmarkHash(b, New224(), 1350, 1) }
-func BenchmarkShake128_MTU(b *testing.B)  { benchmarkShake(b, NewShake128(), 1350, 1) }
+// benchmarkCShake is specialized to the cSHAKE instances, which don't
-func BenchmarkShake256_MTU(b *testing.B)  { benchmarkShake(b, NewShake256(), 1350, 1) }
+// require a copy on reading output.
-func BenchmarkShake256_16x(b *testing.B)  { benchmarkShake(b, NewShake256(), 16, 1024) }
+func benchmarkCShake(b *testing.B, f func(N, S []byte) ShakeHash, size, num int) {
-func BenchmarkShake256_1MiB(b *testing.B) { benchmarkShake(b, NewShake256(), 1024, 1024) }
+	b.StopTimer()
 	h := f(domainString, customString)
 	out := make([]byte, 32)
 	data := generateData(size)
-func BenchmarkSha3_512_1MiB(b *testing.B) { benchmarkHash(b, New512(), 1024, 1024) }
+	b.SetBytes(int64(size * num))
 	b.StartTimer()
 	for i := 0; i < b.N; i++ {
 		h.Reset()
 		for j := 0; j < num; j++ {
 			h.Write(data)
 		}
 		h.Read(out[:])
 	}
 }
 func BenchmarkSha3Chunk_x01(b *testing.B) {
 	b.Run("SHA-3/224", func(b *testing.B) { benchmarkHashChunked(b, New224(), 2047, 1) })
 	b.Run("SHA-3/256", func(b *testing.B) { benchmarkHashChunked(b, New256(), 2047, 1) })
 	b.Run("SHA-3/384", func(b *testing.B) { benchmarkHashChunked(b, New384(), 2047, 1) })
 	b.Run("SHA-3/512", func(b *testing.B) { benchmarkHashChunked(b, New512(), 2047, 1) })
 }
 func BenchmarkSha3Chunk_x16(b *testing.B) {
 	b.Run("SHA-3/224", func(b *testing.B) { benchmarkHashChunked(b, New224(), 16, 1024) })
 	b.Run("SHA-3/256", func(b *testing.B) { benchmarkHashChunked(b, New256(), 16, 1024) })
 	b.Run("SHA-3/384", func(b *testing.B) { benchmarkHashChunked(b, New384(), 16, 1024) })
 	b.Run("SHA-3/512", func(b *testing.B) { benchmarkHashChunked(b, New512(), 16, 1024) })
 }
 func BenchmarkShake_x01(b *testing.B) {
 	b.Run("SHAKE-128", func(b *testing.B) { benchmarkShake(b, NewShake128(), 1350, 1) })
 	b.Run("SHAKE-256", func(b *testing.B) { benchmarkShake(b, NewShake256(), 1350, 1) })
 }
 func BenchmarkShake_x16(b *testing.B) {
 	b.Run("SHAKE-128", func(b *testing.B) { benchmarkShake(b, NewShake128(), 16, 1024) })
 	b.Run("SHAKE-256", func(b *testing.B) { benchmarkShake(b, NewShake256(), 16, 1024) })
 }
 func BenchmarkCShake(b *testing.B) {
 	b.Run("cSHAKE-128", func(b *testing.B) { benchmarkCShake(b, NewCShake128, 2047, 1) })
 	b.Run("cSHAKE-256", func(b *testing.B) { benchmarkCShake(b, NewCShake256, 2047, 1) })
 }
 func Example_sum() {
 	buf := []byte("some data to hash")
@ -446,3 +495,14 @@ func ExampleCShake256() {
 	//a90a4c6ca9af2156eba43dc8398279e6b60dcd56fb21837afe6c308fd4ceb05b9dd98c6ee866ca7dc5a39d53e960f400bcd5a19c8a2d6ec6459f63696543a0d8
 	//85e73a72228d08b46515553ca3a29d47df3047e5d84b12d6c2c63e579f4fd1105716b7838e92e981863907f434bfd4443c9e56ea09da998d2f9b47db71988109
 }
 func ExampleSum256() {
 	d := generateData(32)
 	var data [32]byte
 	h := New256()
 	h.Write(d)
 	s1 := h.Sum(data[:0])
 	fmt.Printf("%X\n", s1)
 	//Output:
 	// 050A48733BD5C2756BA95C5828CC83EE16FABCD3C086885B7744F84A0F9E0D94
 }
--- a/hash/sha3/shake.go
+++ b/hash/sha3/shake.go
@ -6,14 +6,6 @@ package sha3
 // SHAKE128 and SHAKE256 are FIPS approved XOFs. The cSHAKE128/256
 // are SHAKE-based XOFs supporting domain separation.
 //
 //
 // SHAKE implementation is based on FIPS PUB 202 [1]
 // cSHAKE implementations is based on NIST SP 800-185 [2]
 //
 // [1] https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
 // [2] https://doi.org/10.6028/NIST.SP.800-185
 import (
 	"encoding/binary"
 	"io"
@ -44,18 +36,17 @@ type cshakeState struct {
 	// initBlock is the cSHAKE specific initialization set of bytes. It is initialized
 	// by newCShake function and stores concatenation of N followed by S, encoded
-	// by the method specified in 3.3 of [1].
+	// by the method specified in 3.3 of [1] and padded with bytepad function.
-	// It is stored here in order for Reset() to be able to put context into
+	// Used by Reset() to restore initial state.
 	// initial state.
 	initBlock []byte
 }
 // Consts for configuring initial SHA-3 state
 const (
-	dsbyteShake  = 0x1f
+	sfxShake  = 0x1f
-	dsbyteCShake = 0x04
+	sfxCShake = 0x04
-	rate128      = 168
+	rate128   = 168
-	rate256      = 136
+	rate256   = 136
 )
 func bytepad(input []byte, w int) []byte {
@ -80,49 +71,51 @@ func leftEncode(value uint64) []byte {
 	return b[i-1:]
 }
-func newCShake(N, S []byte, rate int, dsbyte byte) ShakeHash {
+func newCShake(N, S []byte, sfx byte, shaId uint8) ShakeHash {
-	c := cshakeState{state: state{rate: rate, dsbyte: dsbyte}}
+	c := cshakeState{state: state{sfx: sfx, desc: Sha3Desc[shaId]}}
 	// leftEncode returns max 9 bytes
-	c.initBlock = make([]byte, 0, 9*2+len(N)+len(S))
+	b := make([]byte, 0, 9*2+len(N)+len(S))
-	c.initBlock = append(c.initBlock, leftEncode(uint64(len(N)*8))...)
+	b = append(b, leftEncode(uint64(len(N)*8))...)
-	c.initBlock = append(c.initBlock, N...)
+	b = append(b, N...)
-	c.initBlock = append(c.initBlock, leftEncode(uint64(len(S)*8))...)
+	b = append(b, leftEncode(uint64(len(S)*8))...)
-	c.initBlock = append(c.initBlock, S...)
+	b = append(b, S...)
-	c.Write(bytepad(c.initBlock, c.rate))
+	c.initBlock = bytepad(b, c.BlockSize())
 	c.Write(c.initBlock)
 	return &c
 }
 // Reset resets the hash to initial state.
 func (c *cshakeState) Reset() {
 	c.state.Reset()
-	c.Write(bytepad(c.initBlock, c.rate))
+	c.Write(c.initBlock)
 }
 // Clone returns copy of a cSHAKE context within its current state.
 func (c *cshakeState) Clone() ShakeHash {
 	b := make([]byte, len(c.initBlock))
 	copy(b, c.initBlock)
-	return &cshakeState{state: *c.clone(), initBlock: b}
+	return &cshakeState{state: c.state, initBlock: b}
 }
 // Clone returns copy of SHAKE context within its current state.
 func (c *state) Clone() ShakeHash {
-	return c.clone()
+	dup := *c
 	return &dup
 }
 // NewShake128 creates a new SHAKE128 variable-output-length ShakeHash.
 // Its generic security strength is 128 bits against all attacks if at
 // least 32 bytes of its output are used.
 func NewShake128() ShakeHash {
-	return &state{rate: rate128, dsbyte: dsbyteShake}
+	return &state{sfx: sfxShake, desc: Sha3Desc[SHAKE128]}
 }
 // NewShake256 creates a new SHAKE256 variable-output-length ShakeHash.
 // Its generic security strength is 256 bits against all attacks if
 // at least 64 bytes of its output are used.
 func NewShake256() ShakeHash {
-	return &state{rate: rate256, dsbyte: dsbyteShake}
+	return &state{sfx: sfxShake, desc: Sha3Desc[SHAKE256]}
 }
 // NewCShake128 creates a new instance of cSHAKE128 variable-output-length ShakeHash,
@ -135,7 +128,7 @@ func NewCShake128(N, S []byte) ShakeHash {
 	if len(N) == 0 && len(S) == 0 {
 		return NewShake128()
 	}
-	return newCShake(N, S, rate128, dsbyteCShake)
+	return newCShake(N, S, sfxCShake, SHAKE128)
 }
 // NewCShake256 creates a new instance of cSHAKE256 variable-output-length ShakeHash,
@ -148,7 +141,7 @@ func NewCShake256(N, S []byte) ShakeHash {
 	if len(N) == 0 && len(S) == 0 {
 		return NewShake256()
 	}
-	return newCShake(N, S, rate256, dsbyteCShake)
+	return newCShake(N, S, sfxCShake, SHAKE256)
 }
 // ShakeSum128 writes an arbitrary-length digest of data into hash.
--- a/hash/sha3/xor_unaligned.go
+++ b/hash/sha3/xor_unaligned.go
@ -7,7 +7,9 @@
 package sha3
-import "unsafe"
+import (
 	"unsafe"
 )
 // A storageBuf is an aligned array of maxRate bytes.
 type storageBuf [maxRate / 8]uint64
@ -57,6 +59,7 @@ func copyOutUnaligned(d *state, buf []byte) {
 	copy(buf, ab[:])
 }
 // TODO: remove this assignment
 var (
 	xorIn   = xorInUnaligned
 	copyOut = copyOutUnaligned
--- a/hash/sm3/sm3.go
+++ b/hash/sm3/sm3.go
@ -76,7 +76,7 @@ func (d *digest) Write(input []byte) (nn int, err error) {
 	// this eventually could be done in d.compress
 	copy(d.b[:], input[nblocks*d.BlockSize():])
-	return
+	return len(input), nil
 }
 func (d *digest) Sum(in []byte) []byte {