x86 CLMUL CRC rewrite #127

Merged (10 commits) on Jun 17, 2024

configure.ac (1 addition, 3 deletions)
@@ -373,12 +373,10 @@ AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86)
# CLMUL CRC #
#############

-# FIXME: Turn it back on by default once the code has been revised
-# to not cause false alarms in sanitizers and thus in OSS Fuzz.
AC_ARG_ENABLE([clmul-crc], AS_HELP_STRING([--disable-clmul-crc],
[Do not use carryless multiplication for CRC calculation
even if support for it is detected.]),
-	[], [enable_clmul_crc=no])
+	[], [enable_clmul_crc=yes])


############################
src/common/sysdefs.h (11 additions, 0 deletions)
@@ -156,6 +156,17 @@ typedef unsigned char _Bool;
# define __bool_true_false_are_defined 1
#endif

+// We may need alignas from C11/C17/C23.
+#if __STDC_VERSION__ >= 202311
+	// alignas is a keyword in C23. Do nothing.
+#elif __STDC_VERSION__ >= 201112
+#	include <stdalign.h>
+#elif defined(__GNUC__) || defined(__clang__)
+#	define alignas(n) __attribute__((__aligned__(n)))
+#else
+#	define alignas(n)
+#endif
+
#include <string.h>

// Visual Studio 2013 update 2 supports only __inline, not inline.
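A note on the fallback above: it lets liblzma request alignment with one spelling across C23 (keyword), C11/C17 (<stdalign.h>), GNU C (attribute), and, as a last resort, a no-op. A minimal sketch of how such a macro can be used (the buffer name and size are illustrative, not from the patch):

#include <stdint.h>

// 16-byte alignment allows aligned 128-bit vector loads. With the
// final no-op fallback this still compiles, but the alignment is
// then not guaranteed.
alignas(16) static uint8_t example_buf[64];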
src/liblzma/check/Makefile.inc (1 addition, 0 deletions)
@@ -5,6 +5,7 @@
## currently crc32 is always enabled.

EXTRA_DIST += \
+	check/crc_clmul_consts_gen.c \
check/crc32_tablegen.c \
check/crc64_tablegen.c

src/liblzma/check/crc32_fast.c (1 addition, 22 deletions)
@@ -15,7 +15,7 @@
#include "crc_common.h"

#if defined(CRC_X86_CLMUL)
-#	define BUILDING_CRC32_CLMUL
+#	define BUILDING_CRC_CLMUL 32
# include "crc_x86_clmul.h"
#elif defined(CRC32_ARM64)
# include "crc32_arm64.h"
@@ -164,27 +164,6 @@ extern LZMA_API(uint32_t)
lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
{
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
-	// On x86-64, if CLMUL is available, it is the best for non-tiny
-	// inputs, being over twice as fast as the generic slice-by-four
-	// version. However, for size <= 16 it's different. In the extreme
-	// case of size == 1 the generic version can be five times faster.
-	// At size >= 8 the CLMUL starts to become reasonable. It
-	// varies depending on the alignment of buf too.
-	//
-	// The above doesn't include the overhead of mythread_once().
-	// At least on x86-64 GNU/Linux, pthread_once() is very fast but
-	// it still makes lzma_crc32(buf, 1, crc) 50-100 % slower. When
-	// size reaches 12-16 bytes the overhead becomes negligible.
-	//
-	// So using the generic version for size <= 16 may give better
-	// performance with tiny inputs but if such inputs happen rarely
-	// it's not so obvious because then the lookup table of the
-	// generic version may not be in the processor cache.
-#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
-	if (size <= 16)
-		return crc32_generic(buf, size, crc);
-#endif
-
/*
#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
// See crc32_dispatch(). This would be the alternative which uses
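The comment above is cut off in this view; it refers to crc32_dispatch(), the fallback used when a constructor attribute is unavailable: the function pointer initially targets a dispatcher that runs the CPU feature check on the first call and then repoints the pointer at the chosen implementation. A rough sketch of the pattern with simplified, illustrative names (not the exact liblzma internals):

#include <stddef.h>
#include <stdint.h>

// Hypothetical stand-ins for the real implementations and the
// CPUID-based feature check.
static uint32_t crc32_generic(const uint8_t *buf, size_t size, uint32_t crc);
static uint32_t crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc);
static int have_clmul(void);

static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc);

// Starts out pointing at the dispatcher; after the first call it
// points directly at the selected implementation.
static uint32_t (*crc32_func)(const uint8_t *, size_t, uint32_t)
		= &crc32_dispatch;

static uint32_t
crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc)
{
	crc32_func = have_clmul() ? &crc32_clmul : &crc32_generic;
	return crc32_func(buf, size, crc);
}

The pointer store is word-sized, so even if two threads race through the dispatcher, both assign the same value and the race is benign in practice.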
src/liblzma/check/crc64_fast.c (9 additions, 6 deletions)
@@ -14,7 +14,7 @@
#include "crc_common.h"

#if defined(CRC_X86_CLMUL)
-#	define BUILDING_CRC64_CLMUL
+#	define BUILDING_CRC_CLMUL 64
# include "crc_x86_clmul.h"
#endif

@@ -133,12 +133,15 @@ crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc)
extern LZMA_API(uint64_t)
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
{
-#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED)
-
-#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
-	if (size <= 16)
-		return crc64_generic(buf, size, crc);
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
+		&& defined(_M_IX86) && defined(CRC64_ARCH_OPTIMIZED)
+	// VS2015-2022 might corrupt the ebx register on 32-bit x86 when
+	// the CLMUL code is enabled. This hack forces MSVC to store and
+	// restore ebx. This is only needed here, not in lzma_crc32().
+	__asm mov ebx, ebx
#endif
+
+#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED)
	return crc64_func(buf, size, crc);

#elif defined(CRC64_ARCH_OPTIMIZED)
src/liblzma/check/crc_clmul_consts_gen.c (new file, 160 additions)
@@ -0,0 +1,160 @@
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file crc_clmul_consts_gen.c
/// \brief Generate constants for CLMUL CRC code
///
/// Compiling: gcc -std=c99 -o crc_clmul_consts_gen crc_clmul_consts_gen.c
///
/// This is for CRCs that use reversed bit order (bit reflection).
/// The same CLMUL CRC code can be used with CRC64 and smaller ones like
/// CRC32 apart from one special case: CRC64 needs an extra step in the
/// Barrett reduction to handle the 65th bit; the smaller ones don't.
/// Otherwise it's enough to just change the polynomial and the derived
/// constants and use the same code.
///
/// See the Intel white paper "Fast CRC Computation for Generic Polynomials
/// Using PCLMULQDQ Instruction" from 2009.
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////

#include <inttypes.h>
#include <stdio.h>


/// CRC32 (Ethernet) polynomial in reversed representation
static const uint64_t p32 = 0xedb88320;

/// CRC64 (ECMA-182) polynomial in reversed representation
static const uint64_t p64 = 0xc96c5795d7870f42;


/// Calculates floor(x^128 / p) where p is a CRC64 polynomial in
/// reversed representation. The result is in reversed representation too.
static uint64_t
calc_cldiv(uint64_t p)
{
	// Quotient
	uint64_t q = 0;

	// Align the x^64 term with the x^128 (the implied high bits of the
	// divisor and the dividend) and do the first step of polynomial long
	// division, calculating the first remainder. The variable q remains
	// zero because the highest bit of the quotient is an implied bit 1
	// (we kind of set q = 1 << -1).
	uint64_t r = p;

	// Then process the remaining 64 terms. Note that r has no implied
	// high bit, only q and p do. (And remember that a high bit in the
	// polynomial is stored at a low bit in the variable due to the
	// reversed bit order.)
	for (unsigned i = 0; i < 64; ++i) {
		q |= (r & 1) << i;
		r = (r >> 1) ^ (r & 1 ? p : 0);
	}

	return q;
}
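In polynomial terms over GF(2), calc_cldiv() computes the quotient q(x) in

\[ x^{128} = q(x)\,p(x) + r(x), \qquad \deg r(x) < 64, \]

where p(x) has degree 64 (its x^64 coefficient is the implied bit). This quotient is the Barrett constant mu = floor(x^128 / p): in the final reduction, one carryless multiply by mu approximates division by p(x), and one further multiply by p(x) yields the remainder, i.e. the CRC.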


/// Calculate the remainder of carryless division:
///
/// x^(bits + n - 1) % p, where n=64 (for CRC64)
///
/// p must be in reversed representation which omits the bit of
/// the highest term of the polynomial. Instead, it is an implied bit
/// at kind of like "1 << -1" position, as if it had just been shifted out.
///
/// The return value is in the reversed bit order. (There are no implied bits.)
static uint64_t
calc_clrem(uint64_t p, unsigned bits)
{
	// Do the first step of polynomial long division.
	uint64_t r = p;

	// Then process the remaining terms. Start with i = 1 instead of i = 0
	// to account for the -1 in x^(bits + n - 1). This -1 is convenient
	// with the reversed bit order. See the "Bit-Reflection" section in
	// the Intel white paper.
	for (unsigned i = 1; i < bits; ++i)
		r = (r >> 1) ^ (r & 1 ? p : 0);

	return r;
}
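These remainders are the folding constants. Folding relies on the identity

\[ M(x)\,x^{t} \equiv M(x)\,\bigl(x^{t} \bmod p(x)\bigr) \pmod{p(x)}, \]

so a 128-bit chunk of the message that sits t bits ahead of the current position can be merged into the running value with a single carryless multiply by x^t mod p(x) instead of a full reduction. fold512 (t around 4*128 bits) folds 64 bytes per iteration and fold128 folds 16 bytes; the extra n - 1 offset in the exponent accounts for the reversed bit order as described in the Intel white paper.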


extern int
main(void)
{
	puts("// CRC64");

	// The order of the two 64-bit constants in a vector doesn't
	// matter. It feels logical to put them in this order as it
	// matches the order in which the input bytes are read.
	printf("const __m128i fold512 = _mm_set_epi64x("
			"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
			calc_clrem(p64, 4 * 128 - 64),
			calc_clrem(p64, 4 * 128));

	printf("const __m128i fold128 = _mm_set_epi64x("
			"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
			calc_clrem(p64, 128 - 64),
			calc_clrem(p64, 128));

	// When we multiply by mu, we care about the high bits of the result
	// (in reversed bit order!). It doesn't matter that the low bit gets
	// shifted out because the affected output bits will be ignored.
	// Below we add the implied high bit with "| 1" after the shifting
	// so that the high bits of the multiplication will be correct.
	//
	// p64 is shifted left by one so that the final multiplication
	// in Barrett reduction won't be misaligned by one bit. We could
	// use "(p64 << 1) | 1" instead of "p64 << 1" too but it makes
	// no difference as that bit won't affect the relevant output bits
	// (we only care about the lowest 64 bits of the result, that is,
	// lowest in the reversed bit order).
	//
	// NOTE: The 65th bit of p64 gets shifted out. It needs to be
	// compensated for with a 64-bit shift and XOR in the CRC64 code.
	printf("const __m128i mu_p = _mm_set_epi64x("
			"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
			(calc_cldiv(p64) << 1) | 1,
			p64 << 1);

puts("");

puts("// CRC32");

printf("const __m128i fold512 = _mm_set_epi64x("
"0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
calc_clrem(p32, 4 * 128 - 64),
calc_clrem(p32, 4 * 128));

printf("const __m128i fold128 = _mm_set_epi64x("
"0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
calc_clrem(p32, 128 - 64),
calc_clrem(p32, 128));

// CRC32 calculation is done by modulus scaling it to a CRC64.
// Since the CRC is in reversed representation, only the mu
// constant changes with the modulus scaling. This method avoids
// one additional constant and one additional clmul in the final
// reduction steps, making the code both simpler and faster.
//
// p32 is shifted left by one so that the final multiplication
// in Barrett reduction won't be misaligned by one bit. We could
// use "(p32 << 1) | 1" instead of "p32 << 1" too but it makes
// no difference as that bit won't affect the relevant output bits.
//
// NOTE: The 33-bit value fits in 64 bits so, unlike with CRC64,
// there is no need to compensate for any missing bits in the code.
printf("const __m128i mu_p = _mm_set_epi64x("
"0x%016" PRIx64 ", 0x%" PRIx64 ");\n",
(calc_cldiv(p32) << 1) | 1,
p32 << 1);

return 0;
}
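Every constant printed above is an operand for PCLMULQDQ, which multiplies two 64-bit GF(2) polynomials into a 128-bit product (exposed in C as the _mm_clmulepi64_si128() intrinsic). A portable software model of that operation can be handy for checking the generated constants off-target; this is a sketch for intuition, not code from the patch:

#include <stdint.h>

// Carryless 64x64 -> 128-bit multiply: bit k of the product is the
// XOR over all i + j == k of (bit i of a) AND (bit j of b).
static void
clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0;
	uint64_t l = 0;
	for (unsigned i = 0; i < 64; ++i) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i > 0)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}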
src/liblzma/check/crc_common.h (0 additions, 23 deletions)
@@ -38,15 +38,6 @@
#endif


-// CRC CLMUL code needs this because accessing input buffers that aren't
-// aligned to the vector size will inherently trip the address sanitizer.
-#if lzma_has_attribute(__no_sanitize_address__)
-#	define crc_attr_no_sanitize_address \
-		__attribute__((__no_sanitize_address__))
-#else
-#	define crc_attr_no_sanitize_address
-#endif
-
// Keep this in sync with changes to crc32_arm64.h
#if defined(_WIN32) || defined(HAVE_GETAUXVAL) \
|| defined(HAVE_ELF_AUX_INFO) \
@@ -68,8 +59,6 @@
#undef CRC32_ARM64
#undef CRC64_ARM64_CLMUL

-#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
-
// ARM64 CRC32 instruction is only useful for CRC32. Currently, only
// little endian is supported since we were unable to test on a big
// endian machine.
@@ -108,18 +97,6 @@
# define CRC32_ARCH_OPTIMIZED 1
# define CRC64_ARCH_OPTIMIZED 1
#	define CRC_X86_CLMUL 1
-
-/*
-	// The generic code is much faster with 1-8-byte inputs and
-	// has similar performance up to 16 bytes at least in
-	// microbenchmarks (it depends on input buffer alignment
-	// too). If both versions are built, this #define will use
-	// the generic version for inputs up to 16 bytes and CLMUL
-	// for bigger inputs. It saves a little in code size since
-	// the special cases for 0-16-byte inputs will be omitted
-	// from the CLMUL code.
-#	define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
-*/
# endif
#endif
