liblzma: Converted CRC_SIMD_BODY from macro to inline function
hansjans162 committed Sep 27, 2023
1 parent d2c9627 commit 90103ed
Showing 3 changed files with 130 additions and 107 deletions.
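
In outline, the change follows the usual macro-to-function refactoring pattern: a statement-like macro that declared its working variables inside the caller becomes a static inline function, and the values the body must hand back (here v0 and v1) become pointer out-parameters owned by the caller. A minimal sketch of the pattern, with hypothetical names unrelated to the liblzma code:

/* Hypothetical before/after sketch of the refactoring pattern. */
#include <stddef.h>
#include <stdint.h>

/* Before: the macro expands inside the caller and defines v0/v1 there. */
#define SUM_BODY(init) \
	uint64_t v0 = (init), v1 = 0; \
	for (size_t i = 0; i < size; ++i) { \
		v0 += buf[i]; \
		v1 ^= v0; \
	}

/* After: the caller declares v0/v1 and passes their addresses. */
static inline void
sum_body(const uint8_t *buf, size_t size, uint64_t *v0, uint64_t *v1,
		uint64_t init)
{
	*v0 = init;
	*v1 = 0;
	for (size_t i = 0; i < size; ++i) {
		*v0 += buf[i];
		*v1 ^= *v0;
	}
}
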
4 changes: 3 additions & 1 deletion src/liblzma/check/crc32_fast.c
@@ -194,7 +194,9 @@ crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc)
__m128i vfold8 = _mm_set_epi64x(0, k5);
__m128i vfold16 = _mm_set_epi64x(k4, k3);

CRC_SIMD_BODY(_mm_cvtsi32_si128(~crc))
__m128i v0, v1, v2;

crc_simd_body(buf, size, &v0, &v1, vfold16, _mm_cvtsi32_si128(~crc));

v1 = _mm_xor_si128(
_mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0
6 changes: 4 additions & 2 deletions src/liblzma/check/crc64_fast.c
@@ -188,12 +188,14 @@ crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc)
const __m128i vfold8 = _mm_set_epi64x(p, mu);
const __m128i vfold16 = _mm_set_epi64x(k2, k1);

__m128i v0, v1, v2;

#if defined(__i386__) || defined(_M_IX86)
CRC_SIMD_BODY(_mm_set_epi64x(0, ~crc))
crc_simd_body(buf, size, &v0, &v1, vfold16, _mm_set_epi64x(0, ~crc));
#else
// GCC and Clang would produce good code with _mm_set_epi64x
// but MSVC needs _mm_cvtsi64_si128 on x86-64.
CRC_SIMD_BODY(_mm_cvtsi64_si128(~crc))
crc_simd_body(buf, size, &v0, &v1, vfold16, _mm_cvtsi64_si128(~crc));
#endif

v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold16, 0x10), v1);
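
A side note on the two initializers above (a sketch for context, not part of the commit; crc64_to_vec is a hypothetical name): both place ~crc in the low 64 bits of an __m128i with the upper half zeroed. _mm_cvtsi64_si128 maps to a single MOVQ but the intrinsic is only available when targeting 64-bit x86, which is why the 32-bit build keeps _mm_set_epi64x.

#include <emmintrin.h>
#include <stdint.h>

static inline __m128i
crc64_to_vec(uint64_t crc)
{
#if defined(__i386__) || defined(_M_IX86)
	// 32-bit x86: _mm_cvtsi64_si128 is not available here.
	return _mm_set_epi64x(0, (int64_t)~crc);
#else
	// x86-64: single MOVQ; per the comment above, MSVC generates
	// better code from this than from _mm_set_epi64x.
	return _mm_cvtsi64_si128((int64_t)~crc);
#endif
}
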
227 changes: 123 additions & 104 deletions src/liblzma/check/crc_common.h
@@ -126,114 +126,133 @@ is_clmul_supported(void)
MASK_L(in, mask, low) MASK_H(in, mask, high)

#define FOLD \
v1 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold16, 0x00)); \
v0 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold16, 0x11));
*v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x00)); \
*v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x11));

#define CRC_SIMD_LOOP \
while (aligned_buf < end) { \
FOLD \
v1 = _mm_load_si128(aligned_buf++); \
*v1 = _mm_load_si128(aligned_buf++); \
}

#define CRC_SIMD_BODY(crc2vec) \
/* Memory addresses A to D and the distances between them: \
\
A B C D \
[skip_start][size][skip_end] \
[ size2 ] \
\
A and D are 16-byte aligned. B and C are 1-byte aligned. \
skip_start and skip_end are 0-15 bytes. size is at least 1 byte. \
\
A = aligned_buf will initially point to this address. \
B = The address pointed by the caller-supplied buf. \
C = buf + size == aligned_buf + size2 \
D = buf + size + skip_end == aligned_buf + size2 + skip_end */\
uintptr_t skip_start = (uintptr_t)buf & 15; \
uintptr_t skip_end = -(uintptr_t)(buf + size) & 15; \
\
/* Create a vector with 8-bit values 0 to 15. \
This is used to construct control masks \
for _mm_blendv_epi8 and _mm_shuffle_epi8. */ \
__m128i vramp = _mm_setr_epi32( \
0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c); \
\
/* This is used to invert the control mask of _mm_shuffle_epi8 \
so that bytes that wouldn't be picked with the original mask \
will be picked and vice versa. */ \
__m128i vsign = _mm_set1_epi8(-0x80); \
\
/* Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:\
The first skip_start or skip_end bytes in the vectors will have\
the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8\
will produce zeros for these positions. (Bitwise-xor of these\
masks with vsign will produce the opposite behavior.) */ \
__m128i mask_start = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start)); \
__m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end)); \
\
/* If size2 <= 16 then the whole input fits into a single 16-byte \
vector. If size2 > 16 then at least two 16-byte vectors must \
be processed. If size2 > 16 && size <= 16 then there is only \
one 16-byte vector's worth of input but it is unaligned in memory. \
\
NOTE: There is no integer overflow here if the arguments \
are valid. If this overflowed, buf + size would too. */ \
uintptr_t size2 = skip_start + size; \
const __m128i *aligned_buf = (const __m128i*)((uintptr_t)buf & -16); \
__m128i v0, v1, v2, v3, vcrc, data0; \
\
vcrc = crc2vec; \
if (!size) return crc; \
\
/* Get the first 1-16 bytes into data0. If loading less than 16 \
bytes, the bytes are loaded to the high bits of the vector and \
the least significant positions are filled with zeros. */ \
data0 = _mm_load_si128(aligned_buf); \
data0 = _mm_blendv_epi8(data0, _mm_setzero_si128(), mask_start); \
aligned_buf++; \
if (size2 <= 16) { \
/* There are 1-16 bytes of input and it is all \
in data0. Copy the input bytes to v3. If there \
are fewer than 16 bytes, the low bytes in v3 \
will be filled with zeros. That is, the input \
bytes are stored to the same position as \
(part of) initial_crc is in v0. */ \
__m128i mask_low = _mm_add_epi8( \
vramp, _mm_set1_epi8(size - 16)); \
MASK_LH(vcrc, mask_low, v0, v1) \
MASK_L(data0, mask_end, v3) \
v0 = _mm_xor_si128(v0, v3); \
v1 = _mm_alignr_epi8(v1, v0, 8); \
} else { \
__m128i data1 = _mm_load_si128(aligned_buf); \
if (size <= 16) { \
/* Collect the 2-16 input bytes from data0 and data1 \
to v2 and v3, and bitwise-xor them with the \
low bits of initial_crc in v0. Note that \
the second xor is below this else-block as it \
is shared with the other branch. */ \
__m128i mask_low = _mm_add_epi8( \
vramp, _mm_set1_epi8(size - 16)); \
MASK_LH(vcrc, mask_low, v0, v1); \
MASK_H(data0, mask_end, v2) \
MASK_L(data1, mask_end, v3) \
v0 = _mm_xor_si128(v0, v2); \
v0 = _mm_xor_si128(v0, v3); \
v1 = _mm_alignr_epi8(v1, v0, 8); \
} else { \
const __m128i *end = (const __m128i*)(\
(char*)aligned_buf++ - 16 + size2); \
MASK_LH(vcrc, mask_start, v0, v1) \
v0 = _mm_xor_si128(v0, data0); \
v1 = _mm_xor_si128(v1, data1); \
CRC_SIMD_LOOP \
if (aligned_buf != end) { \
MASK_H(v0, mask_end, v2) \
MASK_L(v0, mask_end, v0) \
MASK_L(v1, mask_end, v3) \
v1 = _mm_or_si128(v2, v3); \
} \
FOLD \
v1 = _mm_srli_si128(v0, 8); \
} \
#ifdef CRC_CLMUL

#include <immintrin.h>


#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__("ssse3,sse4.1,pclmul")))
#endif
#if lzma_has_attribute(__no_sanitize_address__)
__attribute__((__no_sanitize_address__))
#endif
static inline void
crc_simd_body(const uint8_t *buf, size_t size, __m128i *v0, __m128i *v1,
__m128i vfold16, __m128i crc2vec)
{
#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
// Memory addresses A to D and the distances between them:
//
// A B C D
// [skip_start][size][skip_end]
// [ size2 ]
//
// A and D are 16-byte aligned. B and C are 1-byte aligned.
// skip_start and skip_end are 0-15 bytes. size is at least 1 byte.
//
// A = aligned_buf will initially point to this address.
// B = The address pointed by the caller-supplied buf.
// C = buf + size == aligned_buf + size2
// D = buf + size + skip_end == aligned_buf + size2 + skip_end
uintptr_t skip_start = (uintptr_t)buf & 15;
uintptr_t skip_end = -(uintptr_t)(buf + size) & 15;
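// For example (illustration only): if buf == 0x1003 and size == 40,
// then skip_start == 3 and skip_end == 5, so the aligned span A..D
// covers 3 + 40 + 5 == 48 bytes, i.e. three full 16-byte vectors.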

// Create a vector with 8-bit values 0 to 15.
// This is used to construct control masks
// for _mm_blendv_epi8 and _mm_shuffle_epi8.
__m128i vramp = _mm_setr_epi32(
0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);

// This is used to invert the control mask of _mm_shuffle_epi8
// so that bytes that wouldn't be picked with the original mask
// will be picked and vice versa.
__m128i vsign = _mm_set1_epi8(-0x80);

// Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:
// The first skip_start or skip_end bytes in the vectors will have
// the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8
// will produce zeros for these positions. (Bitwise-xor of these
// masks with vsign will produce the opposite behavior.)
__m128i mask_start = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start));
__m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end));

// If size2 <= 16 then the whole input fits into a single 16-byte
// vector. If size2 > 16 then at least two 16-byte vectors must
// be processed. If size2 > 16 && size <= 16 then there is only
// one 16-byte vector's worth of input but it is unaligned in memory.
//
// NOTE: There is no integer overflow here if the arguments
// are valid. If this overflowed, buf + size would too.
uintptr_t size2 = skip_start + size;
const __m128i *aligned_buf = (const __m128i*)((uintptr_t)buf & -16);
__m128i v2, v3, vcrc, data0;

vcrc = crc2vec;

// Get the first 1-16 bytes into data0. If loading less than 16
// bytes, the bytes are loaded to the high bits of the vector and
// the least significant positions are filled with zeros.
data0 = _mm_load_si128(aligned_buf);
data0 = _mm_blendv_epi8(data0, _mm_setzero_si128(), mask_start);
aligned_buf++;
if (size2 <= 16) {
// There are 1-16 bytes of input and it is all
// in data0. Copy the input bytes to v3. If there
// are fewer than 16 bytes, the low bytes in v3
// will be filled with zeros. That is, the input
// bytes are stored to the same position as
// (part of) initial_crc is in v0.
__m128i mask_low = _mm_add_epi8(
vramp, _mm_set1_epi8(size - 16));
MASK_LH(vcrc, mask_low, *v0, *v1)
MASK_L(data0, mask_end, v3)
*v0 = _mm_xor_si128(*v0, v3);
*v1 = _mm_alignr_epi8(*v1, *v0, 8);
} else {
__m128i data1 = _mm_load_si128(aligned_buf);
if (size <= 16) {
// Collect the 2-16 input bytes from data0 and data1
// to v2 and v3, and bitwise-xor them with the
// low bits of initial_crc in v0. Note that
// the second xor is below this else-block as it
// is shared with the other branch.
__m128i mask_low = _mm_add_epi8(
vramp, _mm_set1_epi8(size - 16));
MASK_LH(vcrc, mask_low, *v0, *v1);
MASK_H(data0, mask_end, v2)
MASK_L(data1, mask_end, v3)
*v0 = _mm_xor_si128(*v0, v2);
*v0 = _mm_xor_si128(*v0, v3);
*v1 = _mm_alignr_epi8(*v1, *v0, 8);
} else {
const __m128i *end = (const __m128i*)(
(char*)aligned_buf++ - 16 + size2);
MASK_LH(vcrc, mask_start, *v0, *v1)
*v0 = _mm_xor_si128(*v0, data0);
*v1 = _mm_xor_si128(*v1, data1);
CRC_SIMD_LOOP
if (aligned_buf != end) {
MASK_H(*v0, mask_end, v2)
MASK_L(*v0, mask_end, *v0)
MASK_L(*v1, mask_end, v3)
*v1 = _mm_or_si128(v2, v3);
}
FOLD
*v1 = _mm_srli_si128(*v0, 8);
}
}
}
#endif
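
One consequence of moving the body into a function (a sketch for context, not from the commit): with GCC and Clang, a helper that uses PCLMULQDQ intrinsics must itself be compiled with those extensions enabled, either globally via -mssse3 -msse4.1 -mpclmul or per function with a target attribute, which is what the attribute block on crc_simd_body() above does. A minimal stand-alone example of the same technique, with a hypothetical helper name:

#include <immintrin.h>

#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__("ssse3,sse4.1,pclmul")))
#endif
static inline __m128i
clmul_lo(__m128i a, __m128i b)
{
	// Carry-less multiply of the low 64-bit lanes of a and b.
	// The target attribute lets this compile even when the
	// translation unit is built without -mpclmul; the caller is
	// still responsible for a runtime CPU-feature check.
	return _mm_clmulepi64_si128(a, b, 0x00);
}
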
