diff --git a/configure.ac b/configure.ac index 74e7c5380..3a8e44545 100644 --- a/configure.ac +++ b/configure.ac @@ -373,12 +373,10 @@ AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86) # CLMUL CRC # ############# -# FIXME: Turn it back on by default once the code has been revised -# to not cause false alarms in sanitizers and thus in OSS Fuzz. AC_ARG_ENABLE([clmul-crc], AS_HELP_STRING([--disable-clmul-crc], [Do not use carryless multiplication for CRC calculation even if support for it is detected.]), - [], [enable_clmul_crc=no]) + [], [enable_clmul_crc=yes]) ############################ diff --git a/src/common/sysdefs.h b/src/common/sysdefs.h index 5f3785b51..2f9740367 100644 --- a/src/common/sysdefs.h +++ b/src/common/sysdefs.h @@ -156,6 +156,17 @@ typedef unsigned char _Bool; # define __bool_true_false_are_defined 1 #endif +// We may need alignas from C11/C17/C23. +#if __STDC_VERSION__ >= 202311 + // alignas is a keyword in C23. Do nothing. +#elif __STDC_VERSION__ >= 201112 +# include +#elif defined(__GNUC__) || defined(__clang__) +# define alignas(n) __attribute__((__aligned__(n))) +#else +# define alignas(n) +#endif + #include // Visual Studio 2013 update 2 supports only __inline, not inline. diff --git a/src/liblzma/check/Makefile.inc b/src/liblzma/check/Makefile.inc index 04cf1ffc5..6c0e37396 100644 --- a/src/liblzma/check/Makefile.inc +++ b/src/liblzma/check/Makefile.inc @@ -5,6 +5,7 @@ ## currently crc32 is always enabled. EXTRA_DIST += \ + check/crc_clmul_consts_gen.c \ check/crc32_tablegen.c \ check/crc64_tablegen.c diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c index 16dbb7467..094fe196a 100644 --- a/src/liblzma/check/crc32_fast.c +++ b/src/liblzma/check/crc32_fast.c @@ -15,7 +15,7 @@ #include "crc_common.h" #if defined(CRC_X86_CLMUL) -# define BUILDING_CRC32_CLMUL +# define BUILDING_CRC_CLMUL 32 # include "crc_x86_clmul.h" #elif defined(CRC32_ARM64) # include "crc32_arm64.h" @@ -164,27 +164,6 @@ extern LZMA_API(uint32_t) lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) { #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) - // On x86-64, if CLMUL is available, it is the best for non-tiny - // inputs, being over twice as fast as the generic slice-by-four - // version. However, for size <= 16 it's different. In the extreme - // case of size == 1 the generic version can be five times faster. - // At size >= 8 the CLMUL starts to become reasonable. It - // varies depending on the alignment of buf too. - // - // The above doesn't include the overhead of mythread_once(). - // At least on x86-64 GNU/Linux, pthread_once() is very fast but - // it still makes lzma_crc32(buf, 1, crc) 50-100 % slower. When - // size reaches 12-16 bytes the overhead becomes negligible. - // - // So using the generic version for size <= 16 may give better - // performance with tiny inputs but if such inputs happen rarely - // it's not so obvious because then the lookup table of the - // generic version may not be in the processor cache. -#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS - if (size <= 16) - return crc32_generic(buf, size, crc); -#endif - /* #ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR // See crc32_dispatch(). 
This would be the alternative which uses diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c index 0ce83fe4a..275fcade2 100644 --- a/src/liblzma/check/crc64_fast.c +++ b/src/liblzma/check/crc64_fast.c @@ -14,7 +14,7 @@ #include "crc_common.h" #if defined(CRC_X86_CLMUL) -# define BUILDING_CRC64_CLMUL +# define BUILDING_CRC_CLMUL 64 # include "crc_x86_clmul.h" #endif @@ -133,12 +133,15 @@ crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc) extern LZMA_API(uint64_t) lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) { -#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED) - -#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS - if (size <= 16) - return crc64_generic(buf, size, crc); +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \ + && defined(_M_IX86) && defined(CRC64_ARCH_OPTIMIZED) + // VS2015-2022 might corrupt the ebx register on 32-bit x86 when + // the CLMUL code is enabled. This hack forces MSVC to store and + // restore ebx. This is only needed here, not in lzma_crc32(). + __asm mov ebx, ebx #endif + +#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED) return crc64_func(buf, size, crc); #elif defined(CRC64_ARCH_OPTIMIZED) diff --git a/src/liblzma/check/crc_clmul_consts_gen.c b/src/liblzma/check/crc_clmul_consts_gen.c new file mode 100644 index 000000000..5fe14bd6f --- /dev/null +++ b/src/liblzma/check/crc_clmul_consts_gen.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file crc_clmul_consts_gen.c +/// \brief Generate constants for CLMUL CRC code +/// +/// Compiling: gcc -std=c99 -o crc_clmul_consts_gen crc_clmul_consts_gen.c +/// +/// This is for CRCs that use reversed bit order (bit reflection). +/// The same CLMUL CRC code can be used with CRC64 and smaller ones like +/// CRC32 apart from one special case: CRC64 needs an extra step in the +/// Barrett reduction to handle the 65th bit; the smaller ones don't. +/// Otherwise it's enough to just change the polynomial and the derived +/// constants and use the same code. +/// +/// See the Intel white paper "Fast CRC Computation for Generic Polynomials +/// Using PCLMULQDQ Instruction" from 2009. +// +// Author: Lasse Collin +// +/////////////////////////////////////////////////////////////////////////////// + +#include +#include + + +/// CRC32 (Ethernet) polynomial in reversed representation +static const uint64_t p32 = 0xedb88320; + +// CRC64 (ECMA-182) polynomial in reversed representation +static const uint64_t p64 = 0xc96c5795d7870f42; + + +/// Calculates floor(x^128 / p) where p is a CRC64 polynomial in +/// reversed representation. The result is in reversed representation too. +static uint64_t +calc_cldiv(uint64_t p) +{ + // Quotient + uint64_t q = 0; + + // Align the x^64 term with the x^128 (the implied high bits of the + // divisor and the dividend) and do the first step of polynomial long + // division, calculating the first remainder. The variable q remains + // zero because the highest bit of the quotient is an implied bit 1 + // (we kind of set q = 1 << -1). + uint64_t r = p; + + // Then process the remaining 64 terms. Note that r has no implied + // high bit, only q and p do. (And remember that a high bit in the + // polynomial is stored at a low bit in the variable due to the + // reversed bit order.) + for (unsigned i = 0; i < 64; ++i) { + q |= (r & 1) << i; + r = (r >> 1) ^ (r & 1 ? 
p : 0); + } + + return q; +} + + +/// Calculate the remainder of carryless division: +/// +/// x^(bits + n - 1) % p, where n=64 (for CRC64) +/// +/// p must be in reversed representation which omits the bit of +/// the highest term of the polynomial. Instead, it is an implied bit +/// at kind of like "1 << -1" position, as if it had just been shifted out. +/// +/// The return value is in the reversed bit order. (There are no implied bits.) +static uint64_t +calc_clrem(uint64_t p, unsigned bits) +{ + // Do the first step of polynomial long division. + uint64_t r = p; + + // Then process the remaining terms. Start with i = 1 instead of i = 0 + // to account for the -1 in x^(bits + n - 1). This -1 is convenient + // with the reversed bit order. See the "Bit-Reflection" section in + // the Intel white paper. + for (unsigned i = 1; i < bits; ++i) + r = (r >> 1) ^ (r & 1 ? p : 0); + + return r; +} + + +extern int +main(void) +{ + puts("// CRC64"); + + // The order of the two 64-bit constants in a vector don't matter. + // It feels logical to put them in this order as it matches the + // order in which the input bytes are read. + printf("const __m128i fold512 = _mm_set_epi64x(" + "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n", + calc_clrem(p64, 4 * 128 - 64), + calc_clrem(p64, 4 * 128)); + + printf("const __m128i fold128 = _mm_set_epi64x(" + "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n", + calc_clrem(p64, 128 - 64), + calc_clrem(p64, 128)); + + // When we multiply by mu, we care about the high bits of the result + // (in reversed bit order!). It doesn't matter that the low bit gets + // shifted out because the affected output bits will be ignored. + // Below we add the implied high bit with "| 1" after the shifting + // so that the high bits of the multiplication will be correct. + // + // p64 is shifted left by one so that the final multiplication + // in Barrett reduction won't be misaligned by one bit. We could + // use "(p64 << 1) | 1" instead of "p64 << 1" too but it makes + // no difference as that bit won't affect the relevant output bits + // (we only care about the lowest 64 bits of the result, that is, + // lowest in the reversed bit order). + // + // NOTE: The 65rd bit of p64 gets shifted out. It needs to be + // compensated with 64-bit shift and xor in the CRC64 code. + printf("const __m128i mu_p = _mm_set_epi64x(" + "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n", + (calc_cldiv(p64) << 1) | 1, + p64 << 1); + + puts(""); + + puts("// CRC32"); + + printf("const __m128i fold512 = _mm_set_epi64x(" + "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n", + calc_clrem(p32, 4 * 128 - 64), + calc_clrem(p32, 4 * 128)); + + printf("const __m128i fold128 = _mm_set_epi64x(" + "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n", + calc_clrem(p32, 128 - 64), + calc_clrem(p32, 128)); + + // CRC32 calculation is done by modulus scaling it to a CRC64. + // Since the CRC is in reversed representation, only the mu + // constant changes with the modulus scaling. This method avoids + // one additional constant and one additional clmul in the final + // reduction steps, making the code both simpler and faster. + // + // p32 is shifted left by one so that the final multiplication + // in Barrett reduction won't be misaligned by one bit. We could + // use "(p32 << 1) | 1" instead of "p32 << 1" too but it makes + // no difference as that bit won't affect the relevant output bits. + // + // NOTE: The 33-bit value fits in 64 bits so, unlike with CRC64, + // there is no need to compensate for any missing bits in the code. 
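// Illustration (not from the patch): a plain bit-at-a-time reflected CRC
// is handy for cross-checking the constants printed by this generator
// against the CLMUL code. It assumes the usual reflected CRC definition
// with an all-ones initial value and a final bitwise NOT, which is how
// lzma_crc32()/lzma_crc64() post-process the value. The helper name is
// made up for this sketch; it needs <stdint.h> and <stddef.h>.
/*
static uint64_t
crc64_bitwise(const uint8_t *buf, size_t size)
{
	uint64_t crc = UINT64_MAX;

	for (size_t i = 0; i < size; ++i) {
		crc ^= buf[i];
		for (unsigned b = 0; b < 8; ++b)
			crc = (crc >> 1) ^ (crc & 1 ? p64 : 0);
	}

	return ~crc;
}
*/
// With p64 as defined above this should agree with lzma_crc64() for the
// same input if the assumptions hold. The CRC32 variant is the same loop
// with p32 and a 32-bit register.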
+ printf("const __m128i mu_p = _mm_set_epi64x(" + "0x%016" PRIx64 ", 0x%" PRIx64 ");\n", + (calc_cldiv(p32) << 1) | 1, + p32 << 1); + + return 0; +} diff --git a/src/liblzma/check/crc_common.h b/src/liblzma/check/crc_common.h index 8842a9d18..7106646f0 100644 --- a/src/liblzma/check/crc_common.h +++ b/src/liblzma/check/crc_common.h @@ -38,15 +38,6 @@ #endif -// CRC CLMUL code needs this because accessing input buffers that aren't -// aligned to the vector size will inherently trip the address sanitizer. -#if lzma_has_attribute(__no_sanitize_address__) -# define crc_attr_no_sanitize_address \ - __attribute__((__no_sanitize_address__)) -#else -# define crc_attr_no_sanitize_address -#endif - // Keep this in sync with changes to crc32_arm64.h #if defined(_WIN32) || defined(HAVE_GETAUXVAL) \ || defined(HAVE_ELF_AUX_INFO) \ @@ -68,8 +59,6 @@ #undef CRC32_ARM64 #undef CRC64_ARM64_CLMUL -#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS - // ARM64 CRC32 instruction is only useful for CRC32. Currently, only // little endian is supported since we were unable to test on a big // endian machine. @@ -108,18 +97,6 @@ # define CRC32_ARCH_OPTIMIZED 1 # define CRC64_ARCH_OPTIMIZED 1 # define CRC_X86_CLMUL 1 - -/* - // The generic code is much faster with 1-8-byte inputs and - // has similar performance up to 16 bytes at least in - // microbenchmarks (it depends on input buffer alignment - // too). If both versions are built, this #define will use - // the generic version for inputs up to 16 bytes and CLMUL - // for bigger inputs. It saves a little in code size since - // the special cases for 0-16-byte inputs will be omitted - // from the CLMUL code. -# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1 -*/ # endif #endif diff --git a/src/liblzma/check/crc_x86_clmul.h b/src/liblzma/check/crc_x86_clmul.h index 6baf83eb8..926476544 100644 --- a/src/liblzma/check/crc_x86_clmul.h +++ b/src/liblzma/check/crc_x86_clmul.h @@ -8,26 +8,25 @@ /// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and /// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too. /// -/// They were derived from +/// See the Intel white paper "Fast CRC Computation for Generic Polynomials +/// Using PCLMULQDQ Instruction" from 2009. The original file seems to be +/// gone from Intel's website but a version is available here: /// https://www.researchgate.net/publication/263424619_Fast_CRC_computation -/// and the public domain code from https://github.com/rawrunprotected/crc -/// (URLs were checked on 2023-10-14). +/// (The link was checked on 2024-06-11.) /// /// While this file has both CRC32 and CRC64 implementations, only one -/// should be built at a time to ensure that crc_simd_body() is inlined -/// even with compilers with which lzma_always_inline expands to plain inline. -/// The version to build is selected by defining BUILDING_CRC32_CLMUL or -/// BUILDING_CRC64_CLMUL before including this file. +/// can be built at a time. The version to build is selected by defining +/// BUILDING_CRC_CLMUL to 32 or 64 before including this file. /// /// FIXME: Builds for 32-bit x86 use the assembly .S files by default /// unless configured with --disable-assembler. Even then the lookup table /// isn't omitted in crc64_table.c since it doesn't know that assembly /// code has been disabled. +/// +/// NOTE: The x86 CLMUL CRC implementation was rewritten for XZ Utils 5.8.0. 
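/// Illustration (not from the patch): the selection described above is
/// done by the including .c file, as in the crc32_fast.c and
/// crc64_fast.c hunks earlier in this diff, so only one implementation
/// is compiled per translation unit:
///
///     #include "crc_common.h"
///
///     #if defined(CRC_X86_CLMUL)
///     #  define BUILDING_CRC_CLMUL 64   // 32 in crc32_fast.c
///     #  include "crc_x86_clmul.h"
///     #endif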
// -// Authors: Ilya Kurdyukov -// Hans Jansen -// Lasse Collin -// Jia Tan +// Authors: Lasse Collin +// Ilya Kurdyukov // /////////////////////////////////////////////////////////////////////////////// @@ -37,6 +36,10 @@ #endif #define LZMA_CRC_X86_CLMUL_H +#if BUILDING_CRC_CLMUL != 32 && BUILDING_CRC_CLMUL != 64 +# error BUILDING_CRC_CLMUL is undefined or has an invalid value +#endif + #include #if defined(_MSC_VER) @@ -59,332 +62,282 @@ #endif -#define MASK_L(in, mask, r) r = _mm_shuffle_epi8(in, mask) +// GCC and Clang would produce good code with _mm_set_epi64x +// but MSVC needs _mm_cvtsi64_si128 on x86-64. +#if defined(__i386__) || defined(_M_IX86) +# define my_set_low64(a) _mm_set_epi64x(0, (a)) +#else +# define my_set_low64(a) _mm_cvtsi64_si128(a) +#endif -#define MASK_H(in, mask, r) \ - r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign)) -#define MASK_LH(in, mask, low, high) \ - MASK_L(in, mask, low); \ - MASK_H(in, mask, high) +// Align it so that the whole array is within the same cache line. +// More than one unaligned load can be done from this during the +// same CRC function call. +// +// The bytes [0] to [31] are used with AND to clear the low bytes. (With ANDN +// those could be used to clear the high bytes too but it's not needed here.) +// +// The bytes [16] to [47] are for left shifts. +// The bytes [32] to [63] are for right shifts. +alignas(64) +static uint8_t vmasks[64] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, +}; + + +// *Unaligned* 128-bit load +crc_attr_target +static inline __m128i +my_load128(const uint8_t *p) +{ + return _mm_loadu_si128((const __m128i *)p); +} +// Keep the highest "count" bytes as is and clear the remaining low bytes. crc_attr_target -crc_attr_no_sanitize_address -static lzma_always_inline void -crc_simd_body(const uint8_t *buf, const size_t size, __m128i *v0, __m128i *v1, - const __m128i vfold16, const __m128i initial_crc) +static inline __m128i +keep_high_bytes(__m128i v, size_t count) { - // Create a vector with 8-bit values 0 to 15. This is used to - // construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8. - const __m128i vramp = _mm_setr_epi32( - 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c); - - // This is used to inverse the control mask of _mm_shuffle_epi8 - // so that bytes that wouldn't be picked with the original mask - // will be picked and vice versa. - const __m128i vsign = _mm_set1_epi8(-0x80); + return _mm_and_si128(my_load128((vmasks + count)), v); +} - // Memory addresses A to D and the distances between them: - // - // A B C D - // [skip_start][size][skip_end] - // [ size2 ] - // - // A and D are 16-byte aligned. B and C are 1-byte aligned. - // skip_start and skip_end are 0-15 bytes. size is at least 1 byte. - // - // A = aligned_buf will initially point to this address. - // B = The address pointed by the caller-supplied buf. 
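// Illustration (not from the patch): a scalar sketch of what the mask
// table buys. keep_high_bytes() above ANDs with vmasks[count .. count + 15],
// which is 16 - count zero bytes followed by count 0xFF bytes, so the low
// bytes are cleared and the high bytes survive. The shift_left() and
// shift_right() helpers defined further down use the same table with
// _mm_shuffle_epi8, where a control byte with its high bit set produces
// a zero byte. The model below exists only to describe that behavior;
// it is not used by the patch (needs <stdint.h> and <stddef.h>).
/*
static void
model_keep_high_bytes(const uint8_t in[16], size_t count, uint8_t out[16])
{
	for (size_t i = 0; i < 16; ++i)
		out[i] = i < 16 - count ? 0 : in[i];
}
*/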
- // C = buf + size == aligned_buf + size2 - // D = buf + size + skip_end == aligned_buf + size2 + skip_end - const size_t skip_start = (size_t)((uintptr_t)buf & 15); - const size_t skip_end = (size_t)((0U - (uintptr_t)(buf + size)) & 15); - const __m128i *aligned_buf = (const __m128i *)( - (uintptr_t)buf & ~(uintptr_t)15); - - // If size2 <= 16 then the whole input fits into a single 16-byte - // vector. If size2 > 16 then at least two 16-byte vectors must - // be processed. If size2 > 16 && size <= 16 then there is only - // one 16-byte vector's worth of input but it is unaligned in memory. - // - // NOTE: There is no integer overflow here if the arguments - // are valid. If this overflowed, buf + size would too. - const size_t size2 = skip_start + size; - - // Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8: - // The first skip_start or skip_end bytes in the vectors will have - // the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8 - // will produce zeros for these positions. (Bitwise-xor of these - // masks with vsign will produce the opposite behavior.) - const __m128i mask_start - = _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_start)); - const __m128i mask_end - = _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_end)); - - // Get the first 1-16 bytes into data0. If loading less than 16 - // bytes, the bytes are loaded to the high bits of the vector and - // the least significant positions are filled with zeros. - const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf), - _mm_setzero_si128(), mask_start); - aligned_buf++; - - __m128i v2, v3; - -#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS - if (size <= 16) { - // Right-shift initial_crc by 1-16 bytes based on "size" - // and store the result in v1 (high bytes) and v0 (low bytes). - // - // NOTE: The highest 8 bytes of initial_crc are zeros so - // v1 will be filled with zeros if size >= 8. The highest - // 8 bytes of v1 will always become zeros. - // - // [ v1 ][ v0 ] - // [ initial_crc ] size == 1 - // [ initial_crc ] size == 2 - // [ initial_crc ] size == 15 - // [ initial_crc ] size == 16 (all in v0) - const __m128i mask_low = _mm_add_epi8( - vramp, _mm_set1_epi8((char)(size - 16))); - MASK_LH(initial_crc, mask_low, *v0, *v1); - - if (size2 <= 16) { - // There are 1-16 bytes of input and it is all - // in data0. Copy the input bytes to v3. If there - // are fewer than 16 bytes, the low bytes in v3 - // will be filled with zeros. That is, the input - // bytes are stored to the same position as - // (part of) initial_crc is in v0. - MASK_L(data0, mask_end, v3); - } else { - // There are 2-16 bytes of input but not all bytes - // are in data0. - const __m128i data1 = _mm_load_si128(aligned_buf); - - // Collect the 2-16 input bytes from data0 and data1 - // to v2 and v3, and bitwise-xor them with the - // low bits of initial_crc in v0. Note that the - // the second xor is below this else-block as it - // is shared with the other branch. - MASK_H(data0, mask_end, v2); - MASK_L(data1, mask_end, v3); - *v0 = _mm_xor_si128(*v0, v2); - } - *v0 = _mm_xor_si128(*v0, v3); - *v1 = _mm_alignr_epi8(*v1, *v0, 8); - } else -#endif - { - // There is more than 16 bytes of input. 
- const __m128i data1 = _mm_load_si128(aligned_buf); - const __m128i *end = (const __m128i*)( - (const char *)aligned_buf - 16 + size2); - aligned_buf++; - - MASK_LH(initial_crc, mask_start, *v0, *v1); - *v0 = _mm_xor_si128(*v0, data0); - *v1 = _mm_xor_si128(*v1, data1); - - while (aligned_buf < end) { - *v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( - *v0, vfold16, 0x00)); - *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( - *v0, vfold16, 0x11)); - *v1 = _mm_load_si128(aligned_buf++); - } +// Shift the 128-bit value left by "amount" bytes (not bits). +crc_attr_target +static inline __m128i +shift_left(__m128i v, size_t amount) +{ + return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount))); +} - if (aligned_buf != end) { - MASK_H(*v0, mask_end, v2); - MASK_L(*v0, mask_end, *v0); - MASK_L(*v1, mask_end, v3); - *v1 = _mm_or_si128(v2, v3); - } - *v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( - *v0, vfold16, 0x00)); - *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( - *v0, vfold16, 0x11)); - *v1 = _mm_srli_si128(*v0, 8); - } +// Shift the 128-bit value right by "amount" bytes (not bits). +crc_attr_target +static inline __m128i +shift_right(__m128i v, size_t amount) +{ + return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount))); } -///////////////////// -// x86 CLMUL CRC32 // -///////////////////// - -/* -// These functions were used to generate the constants -// at the top of crc32_arch_optimized(). -static uint64_t -calc_lo(uint64_t p, uint64_t a, int n) +crc_attr_target +static inline __m128i +fold(__m128i v, __m128i k) { - uint64_t b = 0; int i; - for (i = 0; i < n; i++) { - b = b >> 1 | (a & 1) << (n - 1); - a = (a >> 1) ^ ((0 - (a & 1)) & p); - } - return b; + __m128i a = _mm_clmulepi64_si128(v, k, 0x00); + __m128i b = _mm_clmulepi64_si128(v, k, 0x11); + return _mm_xor_si128(a, b); } -// same as ~crc(&a, sizeof(a), ~0) -static uint64_t -calc_hi(uint64_t p, uint64_t a, int n) + +crc_attr_target +static inline __m128i +fold_xor(__m128i v, __m128i k, const uint8_t *buf) { - int i; - for (i = 0; i < n; i++) - a = (a >> 1) ^ ((0 - (a & 1)) & p); - return a; + return _mm_xor_si128(my_load128(buf), fold(v, k)); } -*/ -#ifdef BUILDING_CRC32_CLMUL +#if BUILDING_CRC_CLMUL == 32 crc_attr_target -crc_attr_no_sanitize_address static uint32_t crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) +#else +crc_attr_target +static uint64_t +crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc) +#endif { -#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS - // The code assumes that there is at least one byte of input. + // We will assume that there is at least one byte of input. 
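// Illustration (not from the patch): a scalar model of the 64 x 64 -> 128
// bit carryless multiplication that _mm_clmulepi64_si128 performs on one
// 64-bit half of each operand (the immediate selects which halves: 0x00
// is low x low, 0x11 is high x high). fold() above runs two such
// multiplications and xors the 128-bit products together. This helper is
// hypothetical and for illustration only (needs <stdint.h>).
/*
static void
model_clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0;
	uint64_t h = 0;

	for (unsigned i = 0; i < 64; ++i) {
		if ((b >> i) & 1) {
			l ^= a << i;
			// The bits of a shifted out of the low 64 bits
			// land in the high half of the product.
			h ^= i == 0 ? 0 : a >> (64 - i);
		}
	}

	*lo = l;
	*hi = h;
}
*/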
if (size == 0) return crc; -#endif - - // uint32_t poly = 0xedb88320; - const int64_t p = 0x1db710640; // p << 1 - const int64_t mu = 0x1f7011641; // calc_lo(p, p, 32) << 1 | 1 - const int64_t k5 = 0x163cd6124; // calc_hi(p, p, 32) << 1 - const int64_t k4 = 0x0ccaa009e; // calc_hi(p, p, 64) << 1 - const int64_t k3 = 0x1751997d0; // calc_hi(p, p, 128) << 1 - - const __m128i vfold4 = _mm_set_epi64x(mu, p); - const __m128i vfold8 = _mm_set_epi64x(0, k5); - const __m128i vfold16 = _mm_set_epi64x(k4, k3); - - __m128i v0, v1, v2; - - crc_simd_body(buf, size, &v0, &v1, vfold16, - _mm_cvtsi32_si128((int32_t)~crc)); - - v1 = _mm_xor_si128( - _mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0 - v2 = _mm_shuffle_epi32(v1, 0xe7); // 0xx0 - v0 = _mm_slli_epi64(v1, 32); // [0] - v0 = _mm_clmulepi64_si128(v0, vfold8, 0x00); - v0 = _mm_xor_si128(v0, v2); // [1] [2] - v2 = _mm_clmulepi64_si128(v0, vfold4, 0x10); - v2 = _mm_clmulepi64_si128(v2, vfold4, 0x00); - v0 = _mm_xor_si128(v0, v2); // [2] - return ~(uint32_t)_mm_extract_epi32(v0, 2); -} -#endif // BUILDING_CRC32_CLMUL + // See crc_clmul_consts_gen.c. +#if BUILDING_CRC_CLMUL == 32 + const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95); + const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191); + const __m128i mu_p = _mm_set_epi64x( + (int64_t)0xb4e5b025f7011641, 0x1db710640); +#else + const __m128i fold512 = _mm_set_epi64x( + (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3); -///////////////////// -// x86 CLMUL CRC64 // -///////////////////// + const __m128i fold128 = _mm_set_epi64x( + (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4); -/* -// These functions were used to generate the constants -// at the top of crc64_arch_optimized(). -static uint64_t -calc_lo(uint64_t poly) -{ - uint64_t a = poly; - uint64_t b = 0; + const __m128i mu_p = _mm_set_epi64x( + (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84); +#endif - for (unsigned i = 0; i < 64; ++i) { - b = (b >> 1) | (a << 63); - a = (a >> 1) ^ (a & 1 ? poly : 0); - } + __m128i v0, v1, v2, v3; - return b; -} + crc = ~crc; -static uint64_t -calc_hi(uint64_t poly, uint64_t a) -{ - for (unsigned i = 0; i < 64; ++i) - a = (a >> 1) ^ (a & 1 ? poly : 0); + if (size < 8) { + uint64_t x = crc; + size_t i = 0; - return a; -} -*/ + // Checking the bit instead of comparing the size means + // that we don't need to update the size between the steps. + if (size & 4) { + x ^= read32le(buf); + buf += 4; + i = 32; + } -#ifdef BUILDING_CRC64_CLMUL + if (size & 2) { + x ^= (uint64_t)read16le(buf) << i; + buf += 2; + i += 16; + } -// MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC -// code when optimizations are enabled (release build). According to the bug -// report, the ebx register is corrupted and the calculated result is wrong. -// Trying to workaround the problem with "__asm mov ebx, ebx" didn't help. -// The following pragma works and performance is still good. x86-64 builds -// and CRC32 CLMUL aren't affected by this problem. The problem does not -// happen in crc_simd_body() either (which is shared with CRC32 CLMUL anyway). -// -// NOTE: Another pragma after crc64_arch_optimized() restores -// the optimizations. If the #if condition here is updated, -// the other one must be updated too. 
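// Illustration (not from the patch): the read16le()/read32le()/read64le()
// helpers used in this function are liblzma utilities that are not part
// of this diff. A rough portable equivalent of read64le is sketched below
// only to make the small-input path easier to follow; memcpy keeps the
// unaligned access well defined, and the byte swap is assumed to be
// needed only on big endian targets (needs <stdint.h> and <string.h>).
/*
static inline uint64_t
sketch_read64le(const uint8_t *p)
{
	uint64_t v;
	memcpy(&v, p, sizeof(v));
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	v = __builtin_bswap64(v);
#endif
	return v;
}
*/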
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \ - && defined(_M_IX86) -# pragma optimize("g", off) -#endif + if (size & 1) + x ^= (uint64_t)*buf << i; -crc_attr_target -crc_attr_no_sanitize_address -static uint64_t -crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc) -{ -#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS - // The code assumes that there is at least one byte of input. - if (size == 0) - return crc; -#endif + v0 = my_set_low64((int64_t)x); + v0 = shift_left(v0, 8 - size); - // const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial - const uint64_t p = 0x92d8af2baf0e1e85; // (poly << 1) | 1 - const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1 - const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1) - const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2) + } else if (size < 16) { + v0 = my_set_low64((int64_t)(crc ^ read64le(buf))); - const __m128i vfold8 = _mm_set_epi64x((int64_t)p, (int64_t)mu); - const __m128i vfold16 = _mm_set_epi64x((int64_t)k2, (int64_t)k1); + // NOTE: buf is intentionally left 8 bytes behind so that + // we can read the last 1-7 bytes with read64le(buf + size). + size -= 8; - __m128i v0, v1, v2; + // Handling 8-byte input specially is a speed optimization + // as the clmul can be skipped. A branch is also needed to + // avoid a too high shift amount. + if (size > 0) { + const size_t padding = 8 - size; + uint64_t high = read64le(buf + size) >> (padding * 8); #if defined(__i386__) || defined(_M_IX86) - crc_simd_body(buf, size, &v0, &v1, vfold16, - _mm_set_epi64x(0, (int64_t)~crc)); + // Simple but likely not the best code for 32-bit x86. + v0 = _mm_insert_epi32(v0, (int32_t)high, 2); + v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3); #else - // GCC and Clang would produce good code with _mm_set_epi64x - // but MSVC needs _mm_cvtsi64_si128 on x86-64. - crc_simd_body(buf, size, &v0, &v1, vfold16, - _mm_cvtsi64_si128((int64_t)~crc)); + v0 = _mm_insert_epi64(v0, (int64_t)high, 1); #endif - v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold16, 0x10), v1); - v0 = _mm_clmulepi64_si128(v1, vfold8, 0x00); - v2 = _mm_clmulepi64_si128(v0, vfold8, 0x10); - v0 = _mm_xor_si128(_mm_xor_si128(v1, _mm_slli_si128(v0, 8)), v2); + v0 = shift_left(v0, padding); + + v1 = _mm_srli_si128(v0, 8); + v0 = _mm_clmulepi64_si128(v0, fold128, 0x10); + v0 = _mm_xor_si128(v0, v1); + } + } else { + v0 = my_set_low64((int64_t)crc); + + // To align or not to align the buf pointer? If the end of + // the buffer isn't aligned, aligning the pointer here would + // make us do an extra folding step with the associated byte + // shuffling overhead. The cost of that would need to be + // lower than the benefit of aligned reads. Testing on an old + // Intel Ivy Bridge processor suggested that aligning isn't + // worth the cost but it likely depends on the processor and + // buffer size. Unaligned loads (MOVDQU) should be fast on + // x86 processors that support PCLMULQDQ, so we don't align + // the buf pointer here. + + // Read the first (and possibly the only) full 16 bytes. 
+ v0 = _mm_xor_si128(v0, my_load128(buf)); + buf += 16; + size -= 16; + + if (size >= 48) { + v1 = my_load128(buf); + v2 = my_load128(buf + 16); + v3 = my_load128(buf + 32); + buf += 48; + size -= 48; + + while (size >= 64) { + v0 = fold_xor(v0, fold512, buf); + v1 = fold_xor(v1, fold512, buf + 16); + v2 = fold_xor(v2, fold512, buf + 32); + v3 = fold_xor(v3, fold512, buf + 48); + buf += 64; + size -= 64; + } + + v0 = _mm_xor_si128(v1, fold(v0, fold128)); + v0 = _mm_xor_si128(v2, fold(v0, fold128)); + v0 = _mm_xor_si128(v3, fold(v0, fold128)); + } + + while (size >= 16) { + v0 = fold_xor(v0, fold128, buf); + buf += 16; + size -= 16; + } + + if (size > 0) { + // We want the last "size" number of input bytes to + // be at the high bits of v1. First do a full 16-byte + // load and then mask the low bytes to zeros. + v1 = my_load128(buf + size - 16); + v1 = keep_high_bytes(v1, size); + + // Shift high bytes from v0 to the low bytes of v1. + // + // Alternatively we could replace the combination + // keep_high_bytes + shift_right + _mm_or_si128 with + // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would + // require larger tables for the masks. Now there are + // three loads (instead of two) from the mask tables + // but they all are from the same cache line. + v1 = _mm_or_si128(v1, shift_right(v0, size)); + + // Shift high bytes of v0 away, padding the + // low bytes with zeros. + v0 = shift_left(v0, 16 - size); + + v0 = _mm_xor_si128(v1, fold(v0, fold128)); + } + + v1 = _mm_srli_si128(v0, 8); + v0 = _mm_clmulepi64_si128(v0, fold128, 0x10); + v0 = _mm_xor_si128(v0, v1); + } + + // Barrett reduction +#if BUILDING_CRC_CLMUL == 32 + v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu + v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p + v0 = _mm_xor_si128(v0, v1); + return ~(uint32_t)_mm_extract_epi32(v0, 2); +#else + // Because p is 65 bits but one bit doesn't fit into the 64-bit + // half of __m128i, finish the second clmul by shifting v1 left + // by 64 bits and xorring it to the final result. + v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu + v2 = _mm_slli_si128(v1, 8); + v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p + v0 = _mm_xor_si128(v0, v2); + v0 = _mm_xor_si128(v0, v1); #if defined(__i386__) || defined(_M_IX86) return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) | (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2)); #else return ~(uint64_t)_mm_extract_epi64(v0, 1); #endif -} - -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \ - && defined(_M_IX86) -# pragma optimize("", on) #endif - -#endif // BUILDING_CRC64_CLMUL +} +// Even though this is an inline function, compile it only when needed. +// This way it won't appear in E2K builds at all. +#if defined(CRC32_GENERIC) || defined(CRC64_GENERIC) // Inlining this function duplicates the function body in crc32_resolve() and // crc64_resolve(), but this is acceptable because this is a tiny function. static inline bool @@ -426,3 +379,4 @@ is_arch_extension_supported(void) // code as is it only reads a variable set at startup but a few bytes // doesn't matter here. } +#endif
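// Illustration (not from the patch): the body of
// is_arch_extension_supported() is outside this diff. For context, the
// CLMUL path above needs SSSE3 (_mm_shuffle_epi8), SSE4.1
// (_mm_extract_epi32/_mm_insert_epi64), and PCLMULQDQ
// (_mm_clmulepi64_si128). A minimal sketch of that kind of runtime check
// for GCC/Clang only is shown below; the real function must also cover
// MSVC and the E2K case mentioned above (needs <cpuid.h> and <stdbool.h>).
/*
static bool
sketch_has_clmul(void)
{
	unsigned a, b, c, d;
	if (!__get_cpuid(1, &a, &b, &c, &d))
		return false;

	// CPUID.1:ECX bit 1 = PCLMULQDQ, bit 9 = SSSE3, bit 19 = SSE4.1
	const unsigned need = (1U << 1) | (1U << 9) | (1U << 19);
	return (c & need) == need;
}
*/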