From c17aa50fbd32393e5d52fa65ca51cbfff0a75aea Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 29 Sep 2024 23:27:16 +0200 Subject: [PATCH] sha3: avoid buffer copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the package worked by copying the input (or the output) into a buffer, and then XOR'ing (or copying) it into (or out of) the state. (Except for an input fast path.) There's no need for that! We can XOR straight into the state, and copy straight out of it, at least on little endian machines. This is a bit faster, almost halves the state size, and will make it easier to implement marshaling, but most importantly look at how much simpler it makes the code! go: go1.23.0 goos: linux goarch: amd64 pkg: golang.org/x/crypto/sha3 cpu: AMD Ryzen 7 PRO 8700GE w/ Radeon 780M Graphics │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ sec/op │ sec/op vs base │ PermutationFunction-8 270.8n ± 0% 270.4n ± 0% ~ (p=0.099 n=10) Sha3_512_MTU-8 5.762µ ± 0% 5.658µ ± 0% -1.80% (p=0.000 n=10) Sha3_384_MTU-8 4.179µ ± 0% 4.070µ ± 0% -2.60% (p=0.000 n=10) Sha3_256_MTU-8 3.316µ ± 0% 3.214µ ± 0% -3.08% (p=0.000 n=10) Sha3_224_MTU-8 3.175µ ± 0% 3.061µ ± 0% -3.61% (p=0.000 n=10) Shake128_MTU-8 2.779µ ± 0% 2.681µ ± 0% -3.51% (p=0.000 n=10) Shake256_MTU-8 2.947µ ± 0% 2.957µ ± 0% +0.32% (p=0.000 n=10) Shake256_16x-8 44.15µ ± 0% 44.45µ ± 0% +0.67% (p=0.000 n=10) Shake256_1MiB-8 2.319m ± 0% 2.274m ± 0% -1.93% (p=0.000 n=10) Sha3_512_1MiB-8 4.204m ± 0% 4.219m ± 0% +0.34% (p=0.000 n=10) geomean 13.75µ 13.54µ -1.55% │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ B/s │ B/s vs base │ PermutationFunction-8 704.3Mi ± 0% 705.4Mi ± 0% ~ (p=0.105 n=10) Sha3_512_MTU-8 223.5Mi ± 0% 227.6Mi ± 0% +1.83% (p=0.000 n=10) Sha3_384_MTU-8 308.1Mi ± 0% 316.4Mi ± 0% +2.67% (p=0.000 n=10) Sha3_256_MTU-8 388.2Mi ± 0% 400.5Mi ± 0% +3.17% (p=0.000 n=10) Sha3_224_MTU-8 405.5Mi ± 0% 420.7Mi ± 0% +3.73% (p=0.000 n=10) Shake128_MTU-8 463.4Mi ± 0% 480.2Mi ± 0% +3.64% (p=0.000 n=10) Shake256_MTU-8 436.9Mi ± 0% 435.5Mi ± 0% -0.32% (p=0.000 n=10) Shake256_16x-8 353.9Mi ± 0% 351.5Mi ± 0% -0.66% (p=0.000 n=10) Shake256_1MiB-8 431.2Mi ± 0% 439.7Mi ± 0% +1.97% (p=0.000 n=10) Sha3_512_1MiB-8 237.8Mi ± 0% 237.1Mi ± 0% -0.33% (p=0.000 n=10) geomean 375.7Mi 381.6Mi +1.57% Even stronger effect when patched on top of CL 616555 (forced on). go: go1.23.0 goos: darwin goarch: arm64 pkg: golang.org/x/crypto/sha3 cpu: Apple M2 │ old │ new │ │ sec/op │ sec/op vs base │ PermutationFunction-8 154.7n ± 2% 153.8n ± 1% ~ (p=0.469 n=10) Sha3_512_MTU-8 3.260µ ± 2% 3.143µ ± 2% -3.60% (p=0.000 n=10) Sha3_384_MTU-8 2.389µ ± 2% 2.244µ ± 2% -6.07% (p=0.000 n=10) Sha3_256_MTU-8 1.950µ ± 2% 1.758µ ± 1% -9.87% (p=0.000 n=10) Sha3_224_MTU-8 1.874µ ± 2% 1.686µ ± 1% -10.06% (p=0.000 n=10) Shake128_MTU-8 1.827µ ± 3% 1.447µ ± 1% -20.80% (p=0.000 n=10) Shake256_MTU-8 1.665µ ± 3% 1.604µ ± 3% -3.63% (p=0.003 n=10) Shake256_16x-8 25.14µ ± 1% 25.23µ ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 1.236m ± 2% 1.243m ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 2.296m ± 2% 2.305m ± 1% ~ (p=0.315 n=10) geomean 7.906µ 7.467µ -5.56% │ old │ new │ │ B/op │ B/op vs base │ PermutationFunction-8 1.204Gi ± 2% 1.212Gi ± 1% ~ (p=0.529 n=10) Sha3_512_MTU-8 394.9Mi ± 2% 409.7Mi ± 2% +3.73% (p=0.000 n=10) Sha3_384_MTU-8 539.0Mi ± 2% 573.8Mi ± 2% +6.45% (p=0.000 n=10) Sha3_256_MTU-8 660.3Mi ± 2% 732.6Mi ± 1% +10.95% (p=0.000 n=10) Sha3_224_MTU-8 687.1Mi ± 2% 763.9Mi ± 1% +11.17% (p=0.000 n=10) Shake128_MTU-8 704.7Mi ± 2% 889.6Mi ± 2% +26.24% (p=0.000 n=10) Shake256_MTU-8 773.4Mi ± 3% 802.5Mi ± 3% +3.76% (p=0.004 n=10) Shake256_16x-8 621.6Mi ± 1% 619.3Mi ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 809.1Mi ± 2% 804.7Mi ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 435.6Mi ± 2% 433.9Mi ± 1% ~ (p=0.315 n=10) geomean 653.6Mi 692.0Mi +5.88% Change-Id: I33a0a1ddf305c395f99bf17f81473e2f42c5ce42 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616575 Reviewed-by: Daniel McCarney Reviewed-by: Michael Pratt Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Auto-Submit: Filippo Valsorda Reviewed-by: Andrew Ekstedt --- sha3/sha3.go | 113 +++++++++++++++++++++++---------------------------- sha3/xor.go | 40 ------------------ 2 files changed, 50 insertions(+), 103 deletions(-) delete mode 100644 sha3/xor.go diff --git a/sha3/sha3.go b/sha3/sha3.go index afedde5abf..bda574e9d6 100644 --- a/sha3/sha3.go +++ b/sha3/sha3.go @@ -4,6 +4,14 @@ package sha3 +import ( + "crypto/subtle" + "encoding/binary" + "unsafe" + + "golang.org/x/sys/cpu" +) + // spongeDirection indicates the direction bytes are flowing through the sponge. type spongeDirection int @@ -14,16 +22,13 @@ const ( spongeSqueezing ) -const ( - // maxRate is the maximum size of the internal buffer. SHAKE-256 - // currently needs the largest buffer. - maxRate = 168 -) - type state struct { - // Generic sponge components. - a [25]uint64 // main state of the hash - rate int // the number of bytes of state to use + a [1600 / 8]byte // main state of the hash + + // a[n:rate] is the buffer. If absorbing, it's the remaining space to XOR + // into before running the permutation. If squeezing, it's the remaining + // output to produce before running the permutation. + n, rate int // dsbyte contains the "domain separation" bits and the first bit of // the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the @@ -39,10 +44,6 @@ type state struct { // Extendable-Output Functions (May 2014)" dsbyte byte - i, n int // storage[i:n] is the buffer, i is only used while squeezing - storage [maxRate]byte - - // Specific to SHA-3 and SHAKE. outputLen int // the default output size in bytes state spongeDirection // whether the sponge is absorbing or squeezing } @@ -61,7 +62,7 @@ func (d *state) Reset() { d.a[i] = 0 } d.state = spongeAbsorbing - d.i, d.n = 0, 0 + d.n = 0 } func (d *state) clone() *state { @@ -69,22 +70,25 @@ func (d *state) clone() *state { return &ret } -// permute applies the KeccakF-1600 permutation. It handles -// any input-output buffering. +// permute applies the KeccakF-1600 permutation. func (d *state) permute() { - switch d.state { - case spongeAbsorbing: - // If we're absorbing, we need to xor the input into the state - // before applying the permutation. - xorIn(d, d.storage[:d.rate]) - d.n = 0 - keccakF1600(&d.a) - case spongeSqueezing: - // If we're squeezing, we need to apply the permutation before - // copying more output. - keccakF1600(&d.a) - d.i = 0 - copyOut(d, d.storage[:d.rate]) + var a *[25]uint64 + if cpu.IsBigEndian { + a = new([25]uint64) + for i := range a { + a[i] = binary.LittleEndian.Uint64(d.a[i*8:]) + } + } else { + a = (*[25]uint64)(unsafe.Pointer(&d.a)) + } + + keccakF1600(a) + d.n = 0 + + if cpu.IsBigEndian { + for i := range a { + binary.LittleEndian.PutUint64(d.a[i*8:], a[i]) + } } } @@ -92,53 +96,36 @@ func (d *state) permute() { // the multi-bitrate 10..1 padding rule, and permutes the state. func (d *state) padAndPermute() { // Pad with this instance's domain-separator bits. We know that there's - // at least one byte of space in d.buf because, if it were full, + // at least one byte of space in the sponge because, if it were full, // permute would have been called to empty it. dsbyte also contains the // first one bit for the padding. See the comment in the state struct. - d.storage[d.n] = d.dsbyte - d.n++ - for d.n < d.rate { - d.storage[d.n] = 0 - d.n++ - } + d.a[d.n] ^= d.dsbyte // This adds the final one bit for the padding. Because of the way that // bits are numbered from the LSB upwards, the final bit is the MSB of // the last byte. - d.storage[d.rate-1] ^= 0x80 + d.a[d.rate-1] ^= 0x80 // Apply the permutation d.permute() d.state = spongeSqueezing - d.n = d.rate - copyOut(d, d.storage[:d.rate]) } // Write absorbs more data into the hash's state. It panics if any // output has already been read. -func (d *state) Write(p []byte) (written int, err error) { +func (d *state) Write(p []byte) (n int, err error) { if d.state != spongeAbsorbing { panic("sha3: Write after Read") } - written = len(p) + + n = len(p) for len(p) > 0 { - if d.n == 0 && len(p) >= d.rate { - // The fast path; absorb a full "rate" bytes of input and apply the permutation. - xorIn(d, p[:d.rate]) - p = p[d.rate:] - keccakF1600(&d.a) - } else { - // The slow path; buffer the input until we can fill the sponge, and then xor it in. - todo := d.rate - d.n - if todo > len(p) { - todo = len(p) - } - d.n += copy(d.storage[d.n:], p[:todo]) - p = p[todo:] - - // If the sponge is full, apply the permutation. - if d.n == d.rate { - d.permute() - } + x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p) + d.n += x + p = p[x:] + + // If the sponge is full, apply the permutation. + if d.n == d.rate { + d.permute() } } @@ -156,12 +143,12 @@ func (d *state) Read(out []byte) (n int, err error) { // Now, do the squeezing. for len(out) > 0 { - n := copy(out, d.storage[d.i:d.n]) - d.i += n - out = out[n:] + x := copy(out, d.a[d.n:d.rate]) + d.n += x + out = out[x:] // Apply the permutation if we've squeezed the sponge dry. - if d.i == d.rate { + if d.n == d.rate { d.permute() } } diff --git a/sha3/xor.go b/sha3/xor.go deleted file mode 100644 index 6ada5c9574..0000000000 --- a/sha3/xor.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package sha3 - -import ( - "crypto/subtle" - "encoding/binary" - "unsafe" - - "golang.org/x/sys/cpu" -) - -// xorIn xors the bytes in buf into the state. -func xorIn(d *state, buf []byte) { - if cpu.IsBigEndian { - for i := 0; len(buf) >= 8; i++ { - a := binary.LittleEndian.Uint64(buf) - d.a[i] ^= a - buf = buf[8:] - } - } else { - ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) - subtle.XORBytes(ab[:], ab[:], buf) - } -} - -// copyOut copies uint64s to a byte buffer. -func copyOut(d *state, b []byte) { - if cpu.IsBigEndian { - for i := 0; len(b) >= 8; i++ { - binary.LittleEndian.PutUint64(b, d.a[i]) - b = b[8:] - } - } else { - ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a)) - copy(b, ab[:]) - } -}