x86 CLMUL CRC rewrite #127

Merged (10 commits) on Jun 17, 2024

configure.ac (1 addition, 3 deletions)
@@ -373,12 +373,10 @@ AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86)
# CLMUL CRC #
#############

-# FIXME: Turn it back on by default once the code has been revised
-# to not cause false alarms in sanitizers and thus in OSS Fuzz.
AC_ARG_ENABLE([clmul-crc], AS_HELP_STRING([--disable-clmul-crc],
[Do not use carryless multiplication for CRC calculation
even if support for it is detected.]),
-	[], [enable_clmul_crc=no])
+	[], [enable_clmul_crc=yes])


############################
src/common/sysdefs.h (11 additions, 0 deletions)
@@ -156,6 +156,17 @@ typedef unsigned char _Bool;
# define __bool_true_false_are_defined 1
#endif

+// We may need alignas from C11/C17/C23.
+#if __STDC_VERSION__ >= 202311
+	// alignas is a keyword in C23. Do nothing.
+#elif __STDC_VERSION__ >= 201112
+#	include <stdalign.h>
+#elif defined(__GNUC__) || defined(__clang__)
+#	define alignas(n) __attribute__((__aligned__(n)))
+#else
+#	define alignas(n)
+#endif
+
#include <string.h>

// Visual Studio 2013 update 2 supports only __inline, not inline.
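A note on the fallback above: it lets liblzma request alignment with one spelling across C23 (keyword), C11/C17 (<stdalign.h>), GNU C (attribute), and, as a last resort, a no-op. A minimal sketch of how such a macro can be used (the buffer name and size are illustrative, not from the patch):

#include <stdint.h>

// 16-byte alignment allows aligned 128-bit vector loads. With the
// final no-op fallback this still compiles, but the alignment is
// then not guaranteed.
alignas(16) static uint8_t example_buf[64];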
src/liblzma/check/Makefile.inc (1 addition, 0 deletions)
@@ -5,6 +5,7 @@
## currently crc32 is always enabled.

EXTRA_DIST += \
+	check/crc_clmul_consts_gen.c \
check/crc32_tablegen.c \
check/crc64_tablegen.c

src/liblzma/check/crc32_fast.c (1 addition, 22 deletions)
@@ -15,7 +15,7 @@
#include "crc_common.h"

#if defined(CRC_X86_CLMUL)
-#	define BUILDING_CRC32_CLMUL
+#	define BUILDING_CRC_CLMUL 32
# include "crc_x86_clmul.h"
#elif defined(CRC32_ARM64)
# include "crc32_arm64.h"
@@ -164,27 +164,6 @@ extern LZMA_API(uint32_t)
lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
{
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
-	// On x86-64, if CLMUL is available, it is the best for non-tiny
-	// inputs, being over twice as fast as the generic slice-by-four
-	// version. However, for size <= 16 it's different. In the extreme
-	// case of size == 1 the generic version can be five times faster.
-	// At size >= 8 the CLMUL starts to become reasonable. It
-	// varies depending on the alignment of buf too.
-	//
-	// The above doesn't include the overhead of mythread_once().
-	// At least on x86-64 GNU/Linux, pthread_once() is very fast but
-	// it still makes lzma_crc32(buf, 1, crc) 50-100 % slower. When
-	// size reaches 12-16 bytes the overhead becomes negligible.
-	//
-	// So using the generic version for size <= 16 may give better
-	// performance with tiny inputs but if such inputs happen rarely
-	// it's not so obvious because then the lookup table of the
-	// generic version may not be in the processor cache.
-#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
-	if (size <= 16)
-		return crc32_generic(buf, size, crc);
-#endif
-
/*
#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
// See crc32_dispatch(). This would be the alternative which uses
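The comment above is cut off in this view; it refers to crc32_dispatch(), the fallback used when a constructor attribute is unavailable: the function pointer initially targets a dispatcher that runs the CPU feature check on the first call and then repoints the pointer at the chosen implementation. A rough sketch of the pattern with simplified, illustrative names (not the exact liblzma internals):

#include <stddef.h>
#include <stdint.h>

// Hypothetical stand-ins for the real implementations and the
// CPUID-based feature check.
static uint32_t crc32_generic(const uint8_t *buf, size_t size, uint32_t crc);
static uint32_t crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc);
static int have_clmul(void);

static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc);

// Starts out pointing at the dispatcher; after the first call it
// points directly at the selected implementation.
static uint32_t (*crc32_func)(const uint8_t *, size_t, uint32_t)
		= &crc32_dispatch;

static uint32_t
crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc)
{
	crc32_func = have_clmul() ? &crc32_clmul : &crc32_generic;
	return crc32_func(buf, size, crc);
}

The pointer store is word-sized, so even if two threads race through the dispatcher, both assign the same value and the race is benign in practice.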
src/liblzma/check/crc64_fast.c (9 additions, 6 deletions)
@@ -14,7 +14,7 @@
#include "crc_common.h"

#if defined(CRC_X86_CLMUL)
-#	define BUILDING_CRC64_CLMUL
+#	define BUILDING_CRC_CLMUL 64
# include "crc_x86_clmul.h"
#endif

@@ -133,12 +133,15 @@ crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc)
extern LZMA_API(uint64_t)
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
{
-#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED)
-
-#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
-	if (size <= 16)
-		return crc64_generic(buf, size, crc);
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
+		&& defined(_M_IX86) && defined(CRC64_ARCH_OPTIMIZED)
+	// VS2015-2022 might corrupt the ebx register on 32-bit x86 when
+	// the CLMUL code is enabled. This hack forces MSVC to store and
+	// restore ebx. This is only needed here, not in lzma_crc32().
+	__asm mov ebx, ebx
#endif
+
+#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED)
	return crc64_func(buf, size, crc);

#elif defined(CRC64_ARCH_OPTIMIZED)
src/liblzma/check/crc_clmul_consts_gen.c (new file, 160 additions)
@@ -0,0 +1,160 @@
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file crc_clmul_consts_gen.c
/// \brief Generate constants for CLMUL CRC code
///
/// Compiling: gcc -std=c99 -o crc_clmul_consts_gen crc_clmul_consts_gen.c
///
/// This is for CRCs that use reversed bit order (bit reflection).
/// The same CLMUL CRC code can be used with CRC64 and smaller ones like
/// CRC32 apart from one special case: CRC64 needs an extra step in the
/// Barrett reduction to handle the 65th bit; the smaller ones don't.
/// Otherwise it's enough to just change the polynomial and the derived
/// constants and use the same code.
///
/// See the Intel white paper "Fast CRC Computation for Generic Polynomials
/// Using PCLMULQDQ Instruction" from 2009.
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////

#include <inttypes.h>
#include <stdio.h>


/// CRC32 (Ethernet) polynomial in reversed representation
static const uint64_t p32 = 0xedb88320;

/// CRC64 (ECMA-182) polynomial in reversed representation
static const uint64_t p64 = 0xc96c5795d7870f42;


/// Calculates floor(x^128 / p) where p is a CRC64 polynomial in
/// reversed representation. The result is in reversed representation too.
static uint64_t
calc_cldiv(uint64_t p)
{
	// Quotient
	uint64_t q = 0;

	// Align the x^64 term with the x^128 (the implied high bits of the
	// divisor and the dividend) and do the first step of polynomial long
	// division, calculating the first remainder. The variable q remains
	// zero because the highest bit of the quotient is an implied bit 1
	// (we kind of set q = 1 << -1).
	uint64_t r = p;

	// Then process the remaining 64 terms. Note that r has no implied
	// high bit, only q and p do. (And remember that a high bit in the
	// polynomial is stored at a low bit in the variable due to the
	// reversed bit order.)
	for (unsigned i = 0; i < 64; ++i) {
		q |= (r & 1) << i;
		r = (r >> 1) ^ (r & 1 ? p : 0);
	}

	return q;
}
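In polynomial terms over GF(2), calc_cldiv() computes the quotient q(x) in

\[ x^{128} = q(x)\,p(x) + r(x), \qquad \deg r(x) < 64, \]

where p(x) has degree 64 (its x^64 coefficient is the implied bit). This quotient is the Barrett constant mu = floor(x^128 / p): in the final reduction, one carryless multiply by mu approximates division by p(x), and one further multiply by p(x) yields the remainder, i.e. the CRC.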


/// Calculate the remainder of carryless division:
///
/// x^(bits + n - 1) % p, where n=64 (for CRC64)
///
/// p must be in reversed representation which omits the bit of
/// the highest term of the polynomial. Instead, it is an implied bit
/// at kind of like "1 << -1" position, as if it had just been shifted out.
///
/// The return value is in the reversed bit order. (There are no implied bits.)
static uint64_t
calc_clrem(uint64_t p, unsigned bits)
{
	// Do the first step of polynomial long division.
	uint64_t r = p;

	// Then process the remaining terms. Start with i = 1 instead of i = 0
	// to account for the -1 in x^(bits + n - 1). This -1 is convenient
	// with the reversed bit order. See the "Bit-Reflection" section in
	// the Intel white paper.
	for (unsigned i = 1; i < bits; ++i)
		r = (r >> 1) ^ (r & 1 ? p : 0);

	return r;
}
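These remainders are the folding constants. Folding relies on the identity

\[ M(x)\,x^{t} \equiv M(x)\,\bigl(x^{t} \bmod p(x)\bigr) \pmod{p(x)}, \]

so a 128-bit chunk of the message that sits t bits ahead of the current position can be merged into the running value with a single carryless multiply by x^t mod p(x) instead of a full reduction. fold512 (t around 4*128 bits) folds 64 bytes per iteration and fold128 folds 16 bytes; the extra n - 1 offset in the exponent accounts for the reversed bit order as described in the Intel white paper.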


extern int
main(void)
{
	puts("// CRC64");

	// The order of the two 64-bit constants in a vector doesn't
	// matter. It feels logical to put them in this order as it
	// matches the order in which the input bytes are read.
	printf("const __m128i fold512 = _mm_set_epi64x("
			"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
			calc_clrem(p64, 4 * 128 - 64),
			calc_clrem(p64, 4 * 128));

	printf("const __m128i fold128 = _mm_set_epi64x("
			"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
			calc_clrem(p64, 128 - 64),
			calc_clrem(p64, 128));

	// When we multiply by mu, we care about the high bits of the result
	// (in reversed bit order!). It doesn't matter that the low bit gets
	// shifted out because the affected output bits will be ignored.
	// Below we add the implied high bit with "| 1" after the shifting
	// so that the high bits of the multiplication will be correct.
	//
	// p64 is shifted left by one so that the final multiplication
	// in Barrett reduction won't be misaligned by one bit. We could
	// use "(p64 << 1) | 1" instead of "p64 << 1" too but it makes
	// no difference as that bit won't affect the relevant output bits
	// (we only care about the lowest 64 bits of the result, that is,
	// lowest in the reversed bit order).
	//
	// NOTE: The 65th bit of p64 gets shifted out. It needs to be
	// compensated for with a 64-bit shift and XOR in the CRC64 code.
	printf("const __m128i mu_p = _mm_set_epi64x("
			"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
			(calc_cldiv(p64) << 1) | 1,
			p64 << 1);

puts("");

puts("// CRC32");

printf("const __m128i fold512 = _mm_set_epi64x("
"0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
calc_clrem(p32, 4 * 128 - 64),
calc_clrem(p32, 4 * 128));

printf("const __m128i fold128 = _mm_set_epi64x("
"0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
calc_clrem(p32, 128 - 64),
calc_clrem(p32, 128));

// CRC32 calculation is done by modulus scaling it to a CRC64.
// Since the CRC is in reversed representation, only the mu
// constant changes with the modulus scaling. This method avoids
// one additional constant and one additional clmul in the final
// reduction steps, making the code both simpler and faster.
//
// p32 is shifted left by one so that the final multiplication
// in Barrett reduction won't be misaligned by one bit. We could
// use "(p32 << 1) | 1" instead of "p32 << 1" too but it makes
// no difference as that bit won't affect the relevant output bits.
//
// NOTE: The 33-bit value fits in 64 bits so, unlike with CRC64,
// there is no need to compensate for any missing bits in the code.
printf("const __m128i mu_p = _mm_set_epi64x("
"0x%016" PRIx64 ", 0x%" PRIx64 ");\n",
(calc_cldiv(p32) << 1) | 1,
p32 << 1);

return 0;
}
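Every constant printed above is an operand for PCLMULQDQ, which multiplies two 64-bit GF(2) polynomials into a 128-bit product (exposed in C as the _mm_clmulepi64_si128() intrinsic). A portable software model of that operation can be handy for checking the generated constants off-target; this is a sketch for intuition, not code from the patch:

#include <stdint.h>

// Carryless 64x64 -> 128-bit multiply: bit k of the product is the
// XOR over all i + j == k of (bit i of a) AND (bit j of b).
static void
clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0;
	uint64_t l = 0;
	for (unsigned i = 0; i < 64; ++i) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i > 0)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}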
src/liblzma/check/crc_common.h (0 additions, 23 deletions)
@@ -38,15 +38,6 @@
#endif


-// CRC CLMUL code needs this because accessing input buffers that aren't
-// aligned to the vector size will inherently trip the address sanitizer.
-#if lzma_has_attribute(__no_sanitize_address__)
-#	define crc_attr_no_sanitize_address \
-		__attribute__((__no_sanitize_address__))
-#else
-#	define crc_attr_no_sanitize_address
-#endif
-
// Keep this in sync with changes to crc32_arm64.h
#if defined(_WIN32) || defined(HAVE_GETAUXVAL) \
|| defined(HAVE_ELF_AUX_INFO) \
@@ -68,8 +59,6 @@
#undef CRC32_ARM64
#undef CRC64_ARM64_CLMUL

-#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
-
// ARM64 CRC32 instruction is only useful for CRC32. Currently, only
// little endian is supported since we were unable to test on a big
// endian machine.
@@ -108,18 +97,6 @@
# define CRC32_ARCH_OPTIMIZED 1
# define CRC64_ARCH_OPTIMIZED 1
#	define CRC_X86_CLMUL 1
-
-/*
-	// The generic code is much faster with 1-8-byte inputs and
-	// has similar performance up to 16 bytes at least in
-	// microbenchmarks (it depends on input buffer alignment
-	// too). If both versions are built, this #define will use
-	// the generic version for inputs up to 16 bytes and CLMUL
-	// for bigger inputs. It saves a little in code size since
-	// the special cases for 0-16-byte inputs will be omitted
-	// from the CLMUL code.
-#	define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
-*/
# endif
#endif
