From 440a2c5b87658f44d5825465499ee7cbd70c76b0 Mon Sep 17 00:00:00 2001 From: Brant Burnett Date: Fri, 12 Jul 2024 16:05:25 -0400 Subject: [PATCH] Implement a faster encoding algorithm for varint In .NET 6 and later we can use intrinsics, especially on Intel, to encode the length as a varint more quickly. This isn't a huge difference given the overall cost of compression but may still be worthwhile. BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3880/23H2/2023Update/SunValley3) 12th Gen Intel Core i7-1270P, 1 CPU, 16 logical and 12 physical cores .NET SDK 8.0.303 [Host] : .NET 8.0.7 (8.0.724.31311), X64 RyuJIT AVX2 DefaultJob : .NET 8.0.7 (8.0.724.31311), X64 RyuJIT AVX2 | Method | Value | Mean | Error | StdDev | Ratio | Rank | |-------- |------ |---------:|----------:|----------:|------:|-----:| | Current | 0 | 1.385 ns | 0.0176 ns | 0.0164 ns | 1.00 | 2 | | New | 0 | 1.091 ns | 0.0200 ns | 0.0188 ns | 0.79 | 1 | | | | | | | | | | Current | 256 | 1.599 ns | 0.0282 ns | 0.0220 ns | 1.00 | 2 | | New | 256 | 1.340 ns | 0.0104 ns | 0.0081 ns | 0.84 | 1 | | | | | | | | | | Current | 65536 | 1.593 ns | 0.0132 ns | 0.0117 ns | 1.00 | 2 | | New | 65536 | 1.367 ns | 0.0181 ns | 0.0169 ns | 0.86 | 1 | --- AUTHORS | 1 + .../Snappier.Benchmarks.csproj | 2 +- Snappier.Benchmarks/VarIntEncodingWrite.cs | 23 +++ .../Internal/VarIntEncodingWriteTests.cs | 69 +++++++++ Snappier.Tests/Snappier.Tests.csproj | 8 +- Snappier/Internal/SnappyCompressor.cs | 46 +----- Snappier/Internal/VarIntEncoding.Write.cs | 51 +++++++ Snappier/Internal/VarIntEncoding.WriteFast.cs | 138 ++++++++++++++++++ 8 files changed, 288 insertions(+), 50 deletions(-) create mode 100644 Snappier.Benchmarks/VarIntEncodingWrite.cs create mode 100644 Snappier.Tests/Internal/VarIntEncodingWriteTests.cs create mode 100644 Snappier/Internal/VarIntEncoding.Write.cs create mode 100644 Snappier/Internal/VarIntEncoding.WriteFast.cs diff --git a/AUTHORS b/AUTHORS index 8936cb7..7b4d141 100644 --- a/AUTHORS +++ b/AUTHORS @@ 
-1,2 +1,3 @@ opensource@google.com bburnett@centeredgesoftware.com +info@couchbase.com diff --git a/Snappier.Benchmarks/Snappier.Benchmarks.csproj b/Snappier.Benchmarks/Snappier.Benchmarks.csproj index 4e1fb96..48271ec 100644 --- a/Snappier.Benchmarks/Snappier.Benchmarks.csproj +++ b/Snappier.Benchmarks/Snappier.Benchmarks.csproj @@ -33,7 +33,7 @@ - + diff --git a/Snappier.Benchmarks/VarIntEncodingWrite.cs b/Snappier.Benchmarks/VarIntEncodingWrite.cs new file mode 100644 index 0000000..df410b5 --- /dev/null +++ b/Snappier.Benchmarks/VarIntEncodingWrite.cs @@ -0,0 +1,23 @@ +#if !PREVIOUS + +using BenchmarkDotNet.Attributes; +using Snappier.Internal; + +namespace Snappier.Benchmarks +{ + public class VarIntEncodingWrite + { + [Params(0u, 256u, 65536u)] + public uint Value { get; set; } + + readonly byte[] _dest = new byte[8]; + + [Benchmark(Baseline = true)] + public int Baseline() + { + return VarIntEncoding.Write(_dest, Value); + } + } +} + +#endif diff --git a/Snappier.Tests/Internal/VarIntEncodingWriteTests.cs b/Snappier.Tests/Internal/VarIntEncodingWriteTests.cs new file mode 100644 index 0000000..cbe9aa9 --- /dev/null +++ b/Snappier.Tests/Internal/VarIntEncodingWriteTests.cs @@ -0,0 +1,69 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Snappier.Internal; +using Xunit; + +namespace Snappier.Tests.Internal +{ + public class VarIntEncodingWriteTests + { + public static TheoryData TestData() => + new() { + { 0x00, [ 0x00 ] }, + { 0x01, [ 0x01 ] }, + { 0x7F, [ 0x7F ] }, + { 0x80, [ 0x80, 0x01 ] }, + { 0x555, [ 0xD5, 0x0A ] }, + { 0x7FFF, [ 0xFF, 0xFF, 0x01 ] }, + { 0xBFFF, [ 0xFF, 0xFF, 0x02 ] }, + { 0xFFFF, [ 0XFF, 0xFF, 0x03 ] }, + { 0x8000, [ 0x80, 0x80, 0x02 ] }, + { 0x5555, [ 0xD5, 0xAA, 0x01 ] }, + { 0xCAFEF00, [ 0x80, 0xDE, 0xBF, 0x65 ] }, + { 0xCAFEF00D, [ 0x8D, 0xE0, 0xFB, 0xD7, 0x0C ] }, + { 0xFFFFFFFF, [ 0xFF, 0xFF, 0xFF, 0xFF, 0x0F ] }, + }; + + [Theory] + [MemberData(nameof(TestData))] + public void Test_Write(uint value, 
byte[] expected) + { + var bytes = new byte[5]; + + var length = VarIntEncoding.Write(bytes, value); + Assert.Equal(expected, bytes.Take(length)); + } + + [Theory] + [MemberData(nameof(TestData))] + public void Test_WriteWithPadding(uint value, byte[] expected) + { + // Test of the fast path where there are at least 8 bytes in the buffer + + var bytes = new byte[sizeof(ulong)]; + + var length = VarIntEncoding.Write(bytes, value); + Assert.Equal(expected, bytes.Take(length)); + } + } +} + +/* ************************************************************ + * + * @author Couchbase + * @copyright 2021 Couchbase, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * ************************************************************/ diff --git a/Snappier.Tests/Snappier.Tests.csproj b/Snappier.Tests/Snappier.Tests.csproj index 5331a4c..4b19754 100644 --- a/Snappier.Tests/Snappier.Tests.csproj +++ b/Snappier.Tests/Snappier.Tests.csproj @@ -20,13 +20,13 @@ - - - + + + all runtime; build; native; contentfiles; analyzers; buildtransitive - + all runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/Snappier/Internal/SnappyCompressor.cs b/Snappier/Internal/SnappyCompressor.cs index 7a8217b..5281dd9 100644 --- a/Snappier/Internal/SnappyCompressor.cs +++ b/Snappier/Internal/SnappyCompressor.cs @@ -18,7 +18,7 @@ public int Compress(ReadOnlySpan input, Span output) _workingMemory.EnsureCapacity(input.Length); - int bytesWritten = WriteUncompressedLength(output, input.Length); + int bytesWritten = VarIntEncoding.Write(output, (uint)input.Length); output = output.Slice(bytesWritten); while (input.Length > 0) @@ -69,50 +69,6 @@ public void Dispose() _workingMemory = null; } - private static int WriteUncompressedLength(Span output, int length) - { - const int b = 0b1000_0000; - - unchecked - { - if (length < (1 << 7)) - { - output[0] = (byte) length; - return 1; - } - else if (length < (1 << 14)) - { - output[0] = (byte) (length | b); - output[1] = (byte) (length >> 7); - return 2; - } - else if (length < (1 << 21)) - { - output[0] = (byte) (length | b); - output[1] = (byte) ((length >> 7) | b); - output[2] = (byte) (length >> 14); - return 3; - } - else if (length < (1 << 28)) - { - output[0] = (byte) (length | b); - output[1] = (byte) ((length >> 7) | b); - output[2] = (byte) ((length >> 14) | b); - output[3] = (byte) (length >> 21); - return 4; - } - else - { - output[0] = (byte) (length | b); - output[1] = (byte) ((length >> 7) | b); - output[2] = (byte) ((length >> 14) | b); - output[3] = (byte) ((length >> 21) | b); - output[4] = (byte) (length >> 28); - return 5; - } - } - } - #region CompressFragment 
private static int CompressFragment(ReadOnlySpan input, Span output, Span tableSpan) diff --git a/Snappier/Internal/VarIntEncoding.Write.cs b/Snappier/Internal/VarIntEncoding.Write.cs new file mode 100644 index 0000000..b6e9517 --- /dev/null +++ b/Snappier/Internal/VarIntEncoding.Write.cs @@ -0,0 +1,51 @@ +using System; + +namespace Snappier.Internal +{ + internal static partial class VarIntEncoding + { + private static int WriteSlow(Span output, uint length) + { + const int b = 0b1000_0000; + + unchecked + { + if (length < (1 << 7)) + { + output[0] = (byte) length; + return 1; + } + else if (length < (1 << 14)) + { + output[0] = (byte) (length | b); + output[1] = (byte) (length >> 7); + return 2; + } + else if (length < (1 << 21)) + { + output[0] = (byte) (length | b); + output[1] = (byte) ((length >> 7) | b); + output[2] = (byte) (length >> 14); + return 3; + } + else if (length < (1 << 28)) + { + output[0] = (byte) (length | b); + output[1] = (byte) ((length >> 7) | b); + output[2] = (byte) ((length >> 14) | b); + output[3] = (byte) (length >> 21); + return 4; + } + else + { + output[0] = (byte) (length | b); + output[1] = (byte) ((length >> 7) | b); + output[2] = (byte) ((length >> 14) | b); + output[3] = (byte) ((length >> 21) | b); + output[4] = (byte) (length >> 28); + return 5; + } + } + } + } +} diff --git a/Snappier/Internal/VarIntEncoding.WriteFast.cs b/Snappier/Internal/VarIntEncoding.WriteFast.cs new file mode 100644 index 0000000..e2cb79e --- /dev/null +++ b/Snappier/Internal/VarIntEncoding.WriteFast.cs @@ -0,0 +1,138 @@ +using System; + +#if NET6_0_OR_GREATER +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +#endif + +/* + * This file is ported from https://github.com/couchbase/couchbase-net-client/blob/c10fe9ef09beadb8512f696d764b7a770429e641/src/Couchbase/Core/Utils/Leb128.cs + * and therefore retains a Couchbase copyright. 
+ **/ + +namespace Snappier.Internal +{ + internal static partial class VarIntEncoding + { + /// + /// Maximum length, in bytes, when encoding a 32-bit integer. + /// + public const int MaxLength = 5; + + /// + /// Encodes a value onto a buffer using little-endian varint encoding. + /// + /// Buffer to receive the value. + /// Value to encode. + /// Number of bytes encoded. + public static int Write(Span buffer, uint value) + { + // Note: This method is likely to be inlined into the caller, potentially + // eliding the size check if JIT knows the size of the buffer. BitConverter.IsLittleEndian + // will always be elided based on CPU architecture. + +#if NET6_0_OR_GREATER + if (BitConverter.IsLittleEndian && buffer.Length >= sizeof(ulong)) + { + // Only use the fast path on little-endian CPUs and when there's enough padding in the + // buffer to write a ulong. At most there will be 5 real bytes written, but for speed + // up to 8 bytes are being copied to the buffer from a register. This guard prevents a + // potential buffer overrun. + + return WriteFast(ref MemoryMarshal.GetReference(buffer), value); + } +#endif + + return WriteSlow(buffer, value); + } + + #if NET6_0_OR_GREATER + + private static int WriteFast(ref byte buffer, uint value) + { + // The use of unsafe writes below is made safe because this method is never + // called without at least 8 bytes available in the buffer. + + if (value < 128) + { + // We need to special case 0 to ensure we write one byte, so go ahead and + // special case 0-127, which all write only one byte with the continuation bit unset. 
+ + buffer = (byte)value; + return 1; + } + + // First get the value spread onto a ulong with 7 bit groups + + ulong result = Spread7BitGroupsIntoBytes(value); + + // Next, calculate the size of the output in bytes + + int unusedBytes = BitOperations.LeadingZeroCount(result) >>> 3; // right shift is the equivalent of divide by 8 + + // Build a mask to set the continuation bits + + const ulong allContinuationBits = 0x8080808080808080UL; + ulong mask = allContinuationBits >>> ((unusedBytes + 1) << 3); // left shift is the equivalent of multiply by 8 + + // Finally, write the result to the buffer + + Unsafe.WriteUnaligned(ref buffer, result | mask); + + return sizeof(ulong) - unusedBytes; + } + + // This spreads the 4 bytes of a uint into the lower 5 bytes of an 8 byte ulong + // as 7 bit blocks, with the high bit of each byte set to 0. This is the basis + // of LEB128 encoding, but without the continuation bit set. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong Spread7BitGroupsIntoBytes(uint value) + { + // Only one of the three branches below will be included in the JIT output + // based on CPU support at runtime + + if (Bmi2.X64.IsSupported) + { + return Bmi2.X64.ParallelBitDeposit(value, 0xf7f7f7f7fUL); + } + + if (Bmi2.IsSupported) + { + // Intel x86 branch, using 32-bit BMI2 instruction + + return Bmi2.ParallelBitDeposit(value, 0x7f7f7f7fU) | + ((value & 0xf0000000UL) << 4); + } + + // Fallback for unsupported CPUs (e.g. ARM) + return value & 0x0000007fUL + | ((value & 0x00003f80UL) << 1) + | ((value & 0x001fc000UL) << 2) + | ((value & 0x0fe00000UL) << 3) + | ((value & 0xf0000000UL) << 4); + } + + #endif + } +} + +/* ************************************************************ + * + * @author Couchbase + * @copyright 2021 Couchbase, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * ************************************************************/