liblzma: Converted CRC_SIMD_BODY from macro to inline function
hansjans162 committed Sep 27, 2023
1 parent d2c9627 commit 90103ed
Showing 3 changed files with 130 additions and 107 deletions.
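
In outline, the change follows the usual macro-to-function refactoring pattern: a statement-like macro that declared its working variables inside the caller becomes a static inline function, and the values the body must hand back (here v0 and v1) become pointer out-parameters owned by the caller. A minimal sketch of the pattern, with hypothetical names unrelated to the liblzma code:

/* Hypothetical before/after sketch of the refactoring pattern. */
#include <stddef.h>
#include <stdint.h>

/* Before: the macro expands inside the caller and defines v0/v1 there. */
#define SUM_BODY(init) \
	uint64_t v0 = (init), v1 = 0; \
	for (size_t i = 0; i < size; ++i) { \
		v0 += buf[i]; \
		v1 ^= v0; \
	}

/* After: the caller declares v0/v1 and passes their addresses. */
static inline void
sum_body(const uint8_t *buf, size_t size, uint64_t *v0, uint64_t *v1,
		uint64_t init)
{
	*v0 = init;
	*v1 = 0;
	for (size_t i = 0; i < size; ++i) {
		*v0 += buf[i];
		*v1 ^= *v0;
	}
}
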
4 changes: 3 additions & 1 deletion src/liblzma/check/crc32_fast.c
@@ -194,7 +194,9 @@ crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc)
__m128i vfold8 = _mm_set_epi64x(0, k5);
__m128i vfold16 = _mm_set_epi64x(k4, k3);

CRC_SIMD_BODY(_mm_cvtsi32_si128(~crc))
__m128i v0, v1, v2;

crc_simd_body(buf, size, &v0, &v1, vfold16, _mm_cvtsi32_si128(~crc));

v1 = _mm_xor_si128(
_mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0
6 changes: 4 additions & 2 deletions src/liblzma/check/crc64_fast.c
@@ -188,12 +188,14 @@ crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc)
const __m128i vfold8 = _mm_set_epi64x(p, mu);
const __m128i vfold16 = _mm_set_epi64x(k2, k1);

__m128i v0, v1, v2;

#if defined(__i386__) || defined(_M_IX86)
CRC_SIMD_BODY(_mm_set_epi64x(0, ~crc))
crc_simd_body(buf, size, &v0, &v1, vfold16, _mm_set_epi64x(0, ~crc));
#else
// GCC and Clang would produce good code with _mm_set_epi64x
// but MSVC needs _mm_cvtsi64_si128 on x86-64.
CRC_SIMD_BODY(_mm_cvtsi64_si128(~crc))
crc_simd_body(buf, size, &v0, &v1, vfold16, _mm_cvtsi64_si128(~crc));
#endif

v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold16, 0x10), v1);
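
A side note on the two initializers above (a sketch for context, not part of the commit; crc64_to_vec is a hypothetical name): both place ~crc in the low 64 bits of an __m128i with the upper half zeroed. _mm_cvtsi64_si128 maps to a single MOVQ but the intrinsic is only available when targeting 64-bit x86, which is why the 32-bit build keeps _mm_set_epi64x.

#include <emmintrin.h>
#include <stdint.h>

static inline __m128i
crc64_to_vec(uint64_t crc)
{
#if defined(__i386__) || defined(_M_IX86)
	// 32-bit x86: _mm_cvtsi64_si128 is not available here.
	return _mm_set_epi64x(0, (int64_t)~crc);
#else
	// x86-64: single MOVQ; per the comment above, MSVC generates
	// better code from this than from _mm_set_epi64x.
	return _mm_cvtsi64_si128((int64_t)~crc);
#endif
}
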
227 changes: 123 additions & 104 deletions src/liblzma/check/crc_common.h
@@ -126,114 +126,133 @@ is_clmul_supported(void)
MASK_L(in, mask, low) MASK_H(in, mask, high)

#define FOLD \
v1 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold16, 0x00)); \
v0 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold16, 0x11));
*v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x00)); \
*v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x11));

#define CRC_SIMD_LOOP \
while (aligned_buf < end) { \
FOLD \
v1 = _mm_load_si128(aligned_buf++); \
*v1 = _mm_load_si128(aligned_buf++); \
}

#define CRC_SIMD_BODY(crc2vec) \
/* Memory addresses A to D and the distances between them: \
\
A B C D \
[skip_start][size][skip_end] \
[ size2 ] \
\
A and D are 16-byte aligned. B and C are 1-byte aligned. \
skip_start and skip_end are 0-15 bytes. size is at least 1 byte. \
\
A = aligned_buf will initially point to this address. \
B = The address pointed by the caller-supplied buf. \
C = buf + size == aligned_buf + size2 \
D = buf + size + skip_end == aligned_buf + size2 + skip_end */\
uintptr_t skip_start = (uintptr_t)buf & 15; \
uintptr_t skip_end = -(uintptr_t)(buf + size) & 15; \
\
/* Create a vector with 8-bit values 0 to 15. \
This is used to construct control masks \
for _mm_blendv_epi8 and _mm_shuffle_epi8. */ \
__m128i vramp = _mm_setr_epi32( \
0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c); \
\
/* This is used to invert the control mask of _mm_shuffle_epi8 \
so that bytes that wouldn't be picked with the original mask \
will be picked and vice versa. */ \
__m128i vsign = _mm_set1_epi8(-0x80); \
\
/* Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:\
The first skip_start or skip_end bytes in the vectors will have\
the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8\
will produce zeros for these positions. (Bitwise-xor of these\
masks with vsign will produce the opposite behavior.) */ \
__m128i mask_start = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start)); \
__m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end)); \
\
/* If size2 <= 16 then the whole input fits into a single 16-byte \
vector. If size2 > 16 then at least two 16-byte vectors must \
be processed. If size2 > 16 && size <= 16 then there is only \
one 16-byte vector's worth of input but it is unaligned in memory. \
\
NOTE: There is no integer overflow here if the arguments \
are valid. If this overflowed, buf + size would too. */ \
uintptr_t size2 = skip_start + size; \
const __m128i *aligned_buf = (const __m128i*)((uintptr_t)buf & -16); \
__m128i v0, v1, v2, v3, vcrc, data0; \
\
vcrc = crc2vec; \
if (!size) return crc; \
\
/* Get the first 1-16 bytes into data0. If loading less than 16 \
bytes, the bytes are loaded to the high bits of the vector and \
the least significant positions are filled with zeros. */ \
data0 = _mm_load_si128(aligned_buf); \
data0 = _mm_blendv_epi8(data0, _mm_setzero_si128(), mask_start); \
aligned_buf++; \
if (size2 <= 16) { \
/* There are 1-16 bytes of input and it is all \
in data0. Copy the input bytes to v3. If there \
are fewer than 16 bytes, the low bytes in v3 \
will be filled with zeros. That is, the input \
bytes are stored to the same position as \
(part of) initial_crc is in v0. */ \
__m128i mask_low = _mm_add_epi8( \
vramp, _mm_set1_epi8(size - 16)); \
MASK_LH(vcrc, mask_low, v0, v1) \
MASK_L(data0, mask_end, v3) \
v0 = _mm_xor_si128(v0, v3); \
v1 = _mm_alignr_epi8(v1, v0, 8); \
} else { \
__m128i data1 = _mm_load_si128(aligned_buf); \
if (size <= 16) { \
/* Collect the 2-16 input bytes from data0 and data1 \
to v2 and v3, and bitwise-xor them with the \
low bits of initial_crc in v0. Note that \
the second xor is below this else-block as it \
is shared with the other branch. */ \
__m128i mask_low = _mm_add_epi8( \
vramp, _mm_set1_epi8(size - 16)); \
MASK_LH(vcrc, mask_low, v0, v1); \
MASK_H(data0, mask_end, v2) \
MASK_L(data1, mask_end, v3) \
v0 = _mm_xor_si128(v0, v2); \
v0 = _mm_xor_si128(v0, v3); \
v1 = _mm_alignr_epi8(v1, v0, 8); \
} else { \
const __m128i *end = (const __m128i*)(\
(char*)aligned_buf++ - 16 + size2); \
MASK_LH(vcrc, mask_start, v0, v1) \
v0 = _mm_xor_si128(v0, data0); \
v1 = _mm_xor_si128(v1, data1); \
CRC_SIMD_LOOP \
if (aligned_buf != end) { \
MASK_H(v0, mask_end, v2) \
MASK_L(v0, mask_end, v0) \
MASK_L(v1, mask_end, v3) \
v1 = _mm_or_si128(v2, v3); \
} \
FOLD \
v1 = _mm_srli_si128(v0, 8); \
} \
#ifdef CRC_CLMUL

#include <immintrin.h>


#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__("ssse3,sse4.1,pclmul")))
#endif
#if lzma_has_attribute(__no_sanitize_address__)
__attribute__((__no_sanitize_address__))
#endif
static inline void
crc_simd_body(const uint8_t *buf, size_t size, __m128i *v0, __m128i *v1,
__m128i vfold16, __m128i crc2vec)
{
#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
// Memory addresses A to D and the distances between them:
//
// A B C D
// [skip_start][size][skip_end]
// [ size2 ]
//
// A and D are 16-byte aligned. B and C are 1-byte aligned.
// skip_start and skip_end are 0-15 bytes. size is at least 1 byte.
//
// A = aligned_buf will initially point to this address.
// B = The address pointed by the caller-supplied buf.
// C = buf + size == aligned_buf + size2
// D = buf + size + skip_end == aligned_buf + size2 + skip_end
uintptr_t skip_start = (uintptr_t)buf & 15;
uintptr_t skip_end = -(uintptr_t)(buf + size) & 15;
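// For example (illustration only): if buf == 0x1003 and size == 40,
// then skip_start == 3 and skip_end == 5, so the aligned span A..D
// covers 3 + 40 + 5 == 48 bytes, i.e. three full 16-byte vectors.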

// Create a vector with 8-bit values 0 to 15.
// This is used to construct control masks
// for _mm_blendv_epi8 and _mm_shuffle_epi8.
__m128i vramp = _mm_setr_epi32(
0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);

// This is used to invert the control mask of _mm_shuffle_epi8
// so that bytes that wouldn't be picked with the original mask
// will be picked and vice versa.
__m128i vsign = _mm_set1_epi8(-0x80);

// Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:
// The first skip_start or skip_end bytes in the vectors will have
// the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8
// will produce zeros for these positions. (Bitwise-xor of these
// masks with vsign will produce the opposite behavior.)
__m128i mask_start = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start));
__m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end));

// If size2 <= 16 then the whole input fits into a single 16-byte
// vector. If size2 > 16 then at least two 16-byte vectors must
// be processed. If size2 > 16 && size <= 16 then there is only
// one 16-byte vector's worth of input but it is unaligned in memory.
//
// NOTE: There is no integer overflow here if the arguments
// are valid. If this overflowed, buf + size would too.
uintptr_t size2 = skip_start + size;
const __m128i *aligned_buf = (const __m128i*)((uintptr_t)buf & -16);
__m128i v2, v3, vcrc, data0;

vcrc = crc2vec;

// Get the first 1-16 bytes into data0. If loading less than 16
// bytes, the bytes are loaded to the high bits of the vector and
// the least significant positions are filled with zeros.
data0 = _mm_load_si128(aligned_buf);
data0 = _mm_blendv_epi8(data0, _mm_setzero_si128(), mask_start);
aligned_buf++;
if (size2 <= 16) {
// There are 1-16 bytes of input and it is all
// in data0. Copy the input bytes to v3. If there
// are fewer than 16 bytes, the low bytes in v3
// will be filled with zeros. That is, the input
// bytes are stored to the same position as
// (part of) initial_crc is in v0.
__m128i mask_low = _mm_add_epi8(
vramp, _mm_set1_epi8(size - 16));
MASK_LH(vcrc, mask_low, *v0, *v1)
MASK_L(data0, mask_end, v3)
*v0 = _mm_xor_si128(*v0, v3);
*v1 = _mm_alignr_epi8(*v1, *v0, 8);
} else {
__m128i data1 = _mm_load_si128(aligned_buf);
if (size <= 16) {
// Collect the 2-16 input bytes from data0 and data1
// to v2 and v3, and bitwise-xor them with the
// low bits of initial_crc in v0. Note that
// the second xor is below this else-block as it
// is shared with the other branch.
__m128i mask_low = _mm_add_epi8(
vramp, _mm_set1_epi8(size - 16));
MASK_LH(vcrc, mask_low, *v0, *v1);
MASK_H(data0, mask_end, v2)
MASK_L(data1, mask_end, v3)
*v0 = _mm_xor_si128(*v0, v2);
*v0 = _mm_xor_si128(*v0, v3);
*v1 = _mm_alignr_epi8(*v1, *v0, 8);
} else {
const __m128i *end = (const __m128i*)(
(char*)aligned_buf++ - 16 + size2);
MASK_LH(vcrc, mask_start, *v0, *v1)
*v0 = _mm_xor_si128(*v0, data0);
*v1 = _mm_xor_si128(*v1, data1);
CRC_SIMD_LOOP
if (aligned_buf != end) {
MASK_H(*v0, mask_end, v2)
MASK_L(*v0, mask_end, *v0)
MASK_L(*v1, mask_end, v3)
*v1 = _mm_or_si128(v2, v3);
}
FOLD
*v1 = _mm_srli_si128(*v0, 8);
}
}
}
#endif
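
One consequence of moving the body into a function (a sketch for context, not from the commit): with GCC and Clang, a helper that uses PCLMULQDQ intrinsics must itself be compiled with those extensions enabled, either globally via -mssse3 -msse4.1 -mpclmul or per function with a target attribute, which is what the attribute block on crc_simd_body() above does. A minimal stand-alone example of the same technique, with a hypothetical helper name:

#include <immintrin.h>

#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__("ssse3,sse4.1,pclmul")))
#endif
static inline __m128i
clmul_lo(__m128i a, __m128i b)
{
	// Carry-less multiply of the low 64-bit lanes of a and b.
	// The target attribute lets this compile even when the
	// translation unit is built without -mpclmul; the caller is
	// still responsible for a runtime CPU-feature check.
	return _mm_clmulepi64_si128(a, b, 0x00);
}
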
