From badba740724462475ce65c42eb6a39bdab1e8065 Mon Sep 17 00:00:00 2001 From: zomfg <239811+zomfg@users.noreply.github.com> Date: Tue, 31 Oct 2023 20:35:50 +0100 Subject: [PATCH] grab: `AVX512` support --- Software/grab/calculations.cpp | 88 ++++++++++++++++++++++++++++++++-- Software/grab/grab.pro | 6 +-- 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/Software/grab/calculations.cpp b/Software/grab/calculations.cpp index 99d2f753..e5918bbe 100644 --- a/Software/grab/calculations.cpp +++ b/Software/grab/calculations.cpp @@ -89,7 +89,7 @@ auto accumulateABGR = accumulateBuffer; auto accumulateRGBA = accumulateBuffer; auto accumulateBGRA = accumulateBuffer; -#if defined(__SSE4_1__) || defined(__AVX2__) +#if defined(__SSE4_1__) || defined(__AVX2__) || (defined(__AVX512F__) && defined(__AVX512BW__)) #ifdef __SSE4_1__ template static ColorValue accumulateBuffer128( @@ -245,11 +245,69 @@ auto accumulateBGRA = accumulateBuffer; return color; }; #endif // ifdef __AVX2__ +#if defined(__AVX512F__) && defined(__AVX512BW__) + template + static ColorValue accumulateBuffer512( + const int * const buffer, + const size_t pitch, + const QRect& rect) { + + __m512i sum[bytesPerPixel] = { + _mm512_setzero_epi32(), + _mm512_setzero_epi32(), + _mm512_setzero_epi32(), + _mm512_setzero_epi32() + }; // A,R,G,B sums + + constexpr const uint32_t zero = (1 << 7); + + const __m512i shuffleR = _mm512_set4_epi32( + (zero << 24) | (zero << 16) | (zero << 8) | (3*4+offsetR), + (zero << 24) | (zero << 16) | (zero << 8) | (2*4+offsetR), + (zero << 24) | (zero << 16) | (zero << 8) | (1*4+offsetR), + (zero << 24) | (zero << 16) | (zero << 8) | (0*4+offsetR) + ); + const __m512i shuffleG = _mm512_set4_epi32( + (zero << 24) | (zero << 16) | (zero << 8) | (3*4+offsetG), + (zero << 24) | (zero << 16) | (zero << 8) | (2*4+offsetG), + (zero << 24) | (zero << 16) | (zero << 8) | (1*4+offsetG), + (zero << 24) | (zero << 16) | (zero << 8) | (0*4+offsetG) + ); + const __m512i shuffleB = _mm512_set4_epi32( + (zero << 24) | (zero << 16) | (zero << 8) | (3*4+offsetB), + (zero << 24) | (zero << 16) | (zero << 8) | (2*4+offsetB), + (zero << 24) | (zero << 16) | (zero << 8) | (1*4+offsetB), + (zero << 24) | (zero << 16) | (zero << 8) | (0*4+offsetB) + ); + + constexpr const int stepsPerLoad = 4; + constexpr const int pixelsPerLoad = pixelsPerStep * stepsPerLoad; + const size_t softlimit = rect.width() / pixelsPerLoad; + const size_t delta = (size_t)rect.width() - (softlimit * pixelsPerLoad); + const __mmask16 deltamask = 0xFFFF >> (16 - delta); + for (size_t currentY = 0; currentY < (size_t)rect.height(); ++currentY) { + for (size_t currentX = 0; currentX <= softlimit; ++currentX) { + const size_t index = pitch * (rect.y() + currentY) + rect.x() + currentX * pixelsPerLoad; // starting offset for lines + const __m512i vec8 = _mm512_maskz_loadu_epi32(currentX == softlimit ? deltamask : __mmask16(0xFFFF), &buffer[index]); + sum[offsetR] = _mm512_add_epi32(sum[offsetR], _mm512_shuffle_epi8(vec8, shuffleR)); + sum[offsetG] = _mm512_add_epi32(sum[offsetG], _mm512_shuffle_epi8(vec8, shuffleG)); + sum[offsetB] = _mm512_add_epi32(sum[offsetB], _mm512_shuffle_epi8(vec8, shuffleB)); + } + } + const size_t count = rect.height() * rect.width(); + ColorValue color; + color.r = (_mm512_reduce_add_epi32(sum[offsetR]) / count) & 0xff; + color.g = (_mm512_reduce_add_epi32(sum[offsetG]) / count) & 0xff; + color.b = (_mm512_reduce_add_epi32(sum[offsetB]) / count) & 0xff; + return color; + }; +#endif // ifdef __AVX512F__ && __AVX512BW__ enum SIMDLevel { None = 0, SSE4_1 = 1 << 0, - AVX2 = 1 << 1 + AVX2 = 1 << 1, + AVX512 = 1 << 2 }; #if defined(Q_OS_MACOS) @@ -271,6 +329,8 @@ static uint32_t available_simd() { // https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family static uint32_t available_simd() { uint32_t level = SIMDLevel::None; + if (_may_i_use_cpu_feature(_FEATURE_AVX512F | _FEATURE_AVX512BW)) + level |= SIMDLevel::AVX512; if (_may_i_use_cpu_feature(_FEATURE_AVX2)) level |= SIMDLevel::AVX2; if (_may_i_use_cpu_feature(_FEATURE_SSE4_1)) @@ -321,8 +381,13 @@ static uint32_t available_simd() { abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; #endif // ifdef _MSC_VER uint32_t level = SIMDLevel::None; - // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 run_cpuid(7, 0, abcd); + // CPUID.(EAX=07H, ECX=0H):EBX.AVX512F [bit 16]==1 + // CPUID.(EAX=07H, ECX=0H):EBX.AVX512BW[bit 30]==1 + if ((abcd[1] & (1 << 16)) && (abcd[1] & (1 << 30))) + level |= SIMDLevel::AVX512; + + // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 if ((abcd[1] & (1 << 5))) level |= SIMDLevel::AVX2; @@ -344,7 +409,12 @@ static uint32_t available_simd() { SSE4.1 97.88% / +0.69% AVX2 74.19% / +2.73% - by default set functions to non-SIMD and upgrade to AVX2 or SSE4.1 when available + (September 2023) + SSE4.1 99.55% / +0.08% + AVX2 92.04% / +0.80% + AVX512F 10.00% / -0.15% + + by default set functions to non-SIMD and upgrade to AVX2/512 or SSE4.1 when available */ struct simdupgrade { simdupgrade() { @@ -365,10 +435,18 @@ struct simdupgrade { accumulateBGRA = accumulateBuffer256; } #endif // ifdef __AVX2__ + #if defined(__AVX512F__) && defined(__AVX512BW__) + if (level & SIMDLevel::AVX512) { + accumulateARGB = accumulateBuffer512; + accumulateABGR = accumulateBuffer512; + accumulateRGBA = accumulateBuffer512; + accumulateBGRA = accumulateBuffer512; + } + #endif // ifdef __AVX512F__ && __AVX512BW__ } }; simdupgrade avxup; -#endif // ifdef __SSE4_1__ || __AVX2__ +#endif // ifdef __SSE4_1__ || __AVX2__ || (__AVX512F__ && __AVX512BW__) } // namespace namespace Grab { diff --git a/Software/grab/grab.pro b/Software/grab/grab.pro index 61666317..ef2a9f1e 100644 --- a/Software/grab/grab.pro +++ b/Software/grab/grab.pro @@ -106,9 +106,9 @@ win32 { LIBS += -lprismatik-hooks -llibraryinjector -lprismatik-unhook # emulate every other compiler, __SSE4_1__ is defined when AVX2 is enabled (and __AVX2__ is also defined) - DEFINES += __SSE4_1__ __AVX2__ + DEFINES += __SSE4_1__ __AVX2__ __AVX512F__ __AVX512BW__ # causes global vectorization, enable if your target CPU has AVX2 - # QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_AVX2 + # QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_AVX2 $$QMAKE_CFLAGS_AVX512F $$QMAKE_CFLAGS_AVX512BW } contains(DEFINES,NIGHTLIGHT_SUPPORT) { @@ -149,7 +149,7 @@ macx { unix:!macx { CXX_TARGET = $$system($$QMAKE_CXX -dumpmachine) contains(CXX_TARGET, x86_64.*) { - QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_AVX2 + QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_AVX2 $$QMAKE_CFLAGS_AVX512F $$QMAKE_CFLAGS_AVX512BW } }