Vers 2.029

Updated with Spout 2.007.001 files
leadedge · Jul 25, 2023 · e0d47fc · e0d47fc
1 parent 28ab85b
commit e0d47fc
Show file tree

Hide file tree

Showing 24 changed files with 752 additions and 293 deletions.
diff --git a/SpoutDX/source/SpoutCommon.h b/SpoutDX/source/SpoutCommon.h
@@ -34,8 +34,12 @@
 		LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 		OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+03.07.23	- Remove _MSC_VER condition from SPOUT_DLLEXP define
+			  (#PR93  Fix MinGW error (beta branch)
+
 
 */
+
 #pragma once
 
 #ifndef __SpoutCommon__
@@ -46,17 +50,13 @@
 // SPOUT_BUILD_DLL in the preprocessor defines.
 // Properties > C++ > Preprocessor > Preprocessor Definitions
 //
-#if defined(_MSC_VER)
-	#if defined(SPOUT_BUILD_DLL)
-		#define SPOUT_DLLEXP	__declspec(dllexport)
-	#elif defined(SPOUT_IMPORT_DLL)
-		#define SPOUT_DLLEXP	__declspec(dllimport)
-	#else
-		#define SPOUT_DLLEXP
-	#endif
-#else // _MSC_VER
+#if defined(SPOUT_BUILD_DLL)
+	#define SPOUT_DLLEXP	__declspec(dllexport)
+#elif defined(SPOUT_IMPORT_DLL)
+	#define SPOUT_DLLEXP	__declspec(dllimport)
+#else
 	#define SPOUT_DLLEXP
-#endif // _MSC_VERR
+#endif
 
 // Common utility functions namespace
 #include "SpoutUtils.h"

diff --git a/SpoutDX/source/SpoutCopy.cpp b/SpoutDX/source/SpoutCopy.cpp
@@ -61,6 +61,14 @@
 	11.12.22 - test for null args in conversion functions
 	22.12.22 - Compiler compatibility check
 			   Change all {} initializations to "={}"
+	02-04-23 - Corrected source pointer increment in rgba2rgba when not inverted
+	Version 2.007.11
+	17.04.23 - Add rgba_to_rgb_sse and rgba_to_bgr_sse
+	18.04.23 - Rename to rgba_to_rgb_sse3 and rgba_to_bgr_sse3
+			   Add bSwapRG to rgba_to_rgb_sse3
+			   Remove rgba_to_bgr_sse3
+			   Add experimental rgb_to_bgra_sse3
+	Version 2.007.012
 
 */
 
@@ -298,8 +306,8 @@ void spoutCopy::rgba2rgba(const void* rgba_source, void* rgba_dest,
 		// first
 		// Casting first avoids warning C26451: Arithmetic overflow with VS2022 code review
 		// https://docs.microsoft.com/en-us/visualstudio/code-quality/c26451
-		unsigned long YxW   = (unsigned long)(y * width);
-		const unsigned long YxSP4 = (unsigned long)sourcePitch / 4;
+		unsigned long YxW            = (unsigned long)(y * width);
+		const unsigned long YxSP4    = (unsigned long)(y * sourcePitch / 4);
 		const unsigned long InvYxSP4 = (unsigned long)((height - 1 - y) * sourcePitch / 4);
 
 		// Increment to current line
@@ -524,7 +532,8 @@ void spoutCopy::bgra2rgba(const void *bgra_source, void *rgba_dest, unsigned int
 
 //---------------------------------------------------------
 // Function: rgba2rgb
-// Copy RGBA to RGB allowing for source line pitch
+// Copy RGBA to RGB or BGR allowing for source line pitch using the fastest method
+//
 void spoutCopy::rgba2rgb(const void* rgba_source, void* rgb_dest,
 	unsigned int width, unsigned int height,
 	unsigned int rgba_pitch, bool bInvert, bool bMirror, bool bSwapRB) const
@@ -535,6 +544,33 @@ void spoutCopy::rgba2rgb(const void* rgba_source, void* rgb_dest,
 	if (!rgb || !rgba)
 		return;
 
+	//
+	// SSE3 copy
+	// No mirror option, image size 16 bit byte aligned, SSE3 intrinsics support
+	//
+	// Timing tests show more than twice as fast
+	// (Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz)
+	//
+	// SSE
+	//   1280x720    1.6 msec
+	//   1920x1080   3.7 msec
+	//   3840x2160  14.0 msec
+	// Byte copy
+	//   1280x720    4.0 msec
+	//   1920x1080   9.3 msec
+	//   3840x2160  35.9 msec
+	//
+	unsigned int pitch = rgba_pitch;
+	if(pitch == 0) pitch = width*4;
+	if (!bMirror && width >= 320 && (width % 16) == 0 && m_bSSE3) {
+		rgba_to_rgb_sse3(rgba_source, rgb_dest, width, height, pitch, bInvert, bSwapRB);
+		return;
+	}
+
+	//
+	// Byte pointer copy
+	//
+
 	// RGB dest does not have padding
 	uint64_t rgbsize = (uint64_t)width * (uint64_t)height * 3;
 	uint64_t rgbpitch = (uint64_t)width * 3;
@@ -580,6 +616,7 @@ void spoutCopy::rgba2rgb(const void* rgba_source, void* rgb_dest,
 
 } // end rgba2rgb
 
+
 //---------------------------------------------------------
 // Function: rgb2rgba
 //
@@ -814,35 +851,37 @@ void spoutCopy::rgb2bgra(const void *rgb_source, void *bgra_dest,
 } // end rgb2bgra
 
 
-// Experimental
-//
-// Testing shows gains of 1 fps
-// TODO : exact frame timing
+// =====================================================================================
 //
+// Experimental
 // https://exchangetuts.com/fast-vectorized-conversion-from-rgb-to-bgra-1640844423877396
-//
 // in and out must be 16-byte aligned
+// See also SIMD library
+// https://github.com/ermig1979/Simd
+//
 
 //---------------------------------------------------------
 // Function: rgb_to_bgrx_sse
 // Experimental pending testing
-void spoutCopy::rgb_to_bgrx_sse(unsigned int w, const void* inpix, void* outpix) const
+// Single line function
+void spoutCopy::rgb_to_bgrx_sse(unsigned int npixels, const void* rgb_source, void* bgrx_dest) const
 {
-	const __m128i* in_vec = static_cast<const __m128i*>(inpix);
-	__m128i* out_vec = static_cast<__m128i*>(outpix);
+	const __m128i* in_vec = static_cast<const __m128i*>(rgb_source);
+	__m128i* out_vec = static_cast<__m128i*>(bgrx_dest);
 
 	if (!in_vec || !out_vec)
 		return;
 
-	w /= 16;
+	npixels /= 16;
 
-	while (w-- > 0) {
+	while (npixels-- > 0) {
+
+		//             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+		// in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
+		// in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
+		// in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
+		//
 
-		/*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
-		 * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
-		 * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
-		 * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
-		 */
 		__m128i in1={};
 		__m128i in2={};
 		__m128i in3={};
@@ -894,6 +933,214 @@ void spoutCopy::rgb_to_bgrx_sse(unsigned int w, const void* inpix, void* outpix)
 
 } // end rgb_to_bgrx_sse
 
+
+//---------------------------------------------------------
+// Function: rgb_to_bgra_sse
+// Experimental pending testing
+// Full image height
+void spoutCopy::rgb_to_bgra_sse3 (
+	void* rgb_source, 
+	void* rgba_dest,
+	unsigned int width, 
+	unsigned int height) const
+{
+	if ((width % 16) != 0)
+		return;
+
+	auto rgb = static_cast<const unsigned char*>(rgb_source); // rgb/bgr
+	auto rgba = static_cast<unsigned char*>(rgba_dest); // rgba/bgra
+	if (!rgb || !rgba)
+		return;
+
+	for (unsigned int y = 0; y < height-2; y++) {
+		rgb_to_bgrx_sse(width*3, rgb, rgba);
+		rgb  += width*3;
+		rgba += width*4;
+	}
+
+} // end rgb_to_bgra_sse3
+
+//
+// =====================================================================================
+
+
+//---------------------------------------------------------
+// Function: rgba_to_rgb_sse3
+//
+void spoutCopy::rgba_to_rgb_sse3(const void* rgba_source, void* rgb_dest,
+	unsigned int width, unsigned int height, unsigned int rgba_pitch,
+	bool bInvert, bool bSwapRB) const
+{
+	const __m128i* in_vec = static_cast<const __m128i*>(rgba_source); // rgba
+	__m128i* out_vec = static_cast<__m128i*>(rgb_dest); // rgb
+
+	if (!out_vec || !in_vec)
+		return;
+
+	// RGB dest does not have padding
+	unsigned int rgbsize = width*height*3;
+	unsigned int rgbpitch = width*3;
+
+	// Dest and source must be the same dimensions otherwise
+	unsigned int rgba_padding = rgba_pitch-width*4; // byte line padding
+
+	// Flip image option, move to the beginning of the last rgb line
+	if (bInvert) {
+		out_vec += rgbsize/16; // end of rgb buffer
+		out_vec -= rgbpitch/16; // beginning of the last rgb line
+	}
+
+	unsigned int w = width/16;
+	for (unsigned int y = 0; y < height; y++) {
+
+		while (w-- > 0) {
+
+			__m128i in0={};
+			__m128i in1={};
+			__m128i in2={};
+			__m128i in3={};
+
+			__m128i out0={};
+			__m128i out1={};
+
+			in0 = in_vec[0];   // First 128 bits RGBA
+			in1 = in_vec[1];   // Second 128 bits RGBA
+			in2 = in_vec[2];   // Third 128 bits RGBA
+			in3 = in_vec[3];   // Fourth 128 bits RGBA
+
+			if (!bSwapRB) {
+
+				// RGBA > RGB
+
+				//
+				// RGBA in 4x16 = 64 bytes (4 pixels at 4 bytes each = 16 bytes)
+				//               0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+				// in_vec[0]    Ra Ga Ba Xa Rb Gb Bb Xb Rc Gc Bc Xc Rd Gd Bd Xd
+				// in_vec[1]    Re Ge Be Xe Rf Gf Bf Xf Rg Gg Bg Xg Rh Gh Bh Xh
+				// in_vec[2]    Ri Gi Bi Xi Rj Gj Bj Xj Rk Gk Bk Xk Rl Gl Bl Xl
+				// in_vec[3]    Rm Gm Bm Xm Rn Gn Bn Xn Ro Go Bo Xo Rp Gp Bp Xp
+
+				//
+				// RGB out  3x16 = 48 bytes (4 pixels at 3 bytes each = 12 bytes)
+				//
+				// out_vec[0]    Rf Be Ge Re Bd Gd Rd Bc Gc Rc Bb Gb Rb Ba Ga Ra
+				//                4  2  1  0 14 13 12 10  9  8  6  5  4  2  1  0
+				//                 in_vec[1] |            in_vec[0]            |
+				// out_vec[1]    Gk Rk Bj Gj Rj Bi Gi Ri Bh Gh Rh Bg Gg Rg Bf Gf
+				//                9  8  6  5  4  2  1  0 14 13 12 10  9  8  6  5
+				//                        in_vec[2]       |     in_vec[1]
+				// out_vec[2]    Bp Gp Rp Bo Go Ro Bn Gn Rn Bm Gm Rm Bl Gl Rl Bk
+				//               14 13 12 10  9  8  6  5  4  2  1  0 14 13 12 10  
+				//                        in_vec[3]                   | in_vec[2]
+
+				// First 16 RGB bytes
+				// out_vec[0]    Rf Be Ge Re Bd Gd Rd Bc Gc Rc Bb Gb Rb Ba Ga Ra
+				//                4  2  1  0 14 13 12 10  9  8  6  5  4  2  1  0
+				//                 in_vec[1] |            in_vec[0]            |
+				out0 = _mm_shuffle_epi8(in0,
+					_mm_set_epi8('\xff', '\xff', '\xff', '\xff', 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0));
+				out1 = _mm_shuffle_epi8(in1,
+					_mm_set_epi8(4, 2, 1, 0, '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff'));
+				out_vec[0] = _mm_or_si128(out0, out1);
+
+				// Second 16 RGB bytes
+				// out_vec[1]    Gk Rk Bj Gj Rj Bi Gi Ri Bh Gh Rh Bg Gg Rg Bf Gf
+				//                9  8  6  5  4  2  1  0 14 13 12 10  9  8  6  5
+				//                        in_vec[2]       |     in_vec[1]
+				out0 = _mm_shuffle_epi8(in1,
+					_mm_set_epi8('\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', 14, 13, 12, 10, 9, 8, 6, 5));
+				out1 = _mm_shuffle_epi8(in2,
+					_mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff'));
+				out_vec[1] = _mm_or_si128(out0, out1);
+
+				// Third 16 RGB bytes
+				// out_vec[2]    Bp Gp Rp Bo Go Ro Bn Gn Rn Bm Gm Rm Bl Gl Rl Bk
+				//               14 13 12 10  9  8  6  5  4  2  1  0 14 13 12 10  
+				//                        in_vec[3]                   | in_vec[2]
+				out0 = _mm_shuffle_epi8(in2,
+					_mm_set_epi8('\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', 14, 13, 12, 10));
+				out1 = _mm_shuffle_epi8(in3,
+					_mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, '\xff', '\xff', '\xff', '\xff'));
+				out_vec[2] = _mm_or_si128(out0, out1);
+			}  // end RGBA >RGB
+			else {
+
+				// RGBA > BGR
+
+				//
+				// RGBA in 4x16 = 64 bytes (4 pixels at 4 bytes each = 16 bytes)
+				//               0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+				// in_vec[0]    Ra Ga Ba Xa Rb Gb Bb Xb Rc Gc Bc Xc Rd Gd Bd Xd
+				// in_vec[1]    Re Ge Be Xe Rf Gf Bf Xf Rg Gg Bg Xg Rh Gh Bh Xh
+				// in_vec[2]    Ri Gi Bi Xi Rj Gj Bj Xj Rk Gk Bk Xk Rl Gl Bl Xl
+				// in_vec[3]    Rm Gm Bm Xm Rn Gn Bn Xn Ro Go Bo Xo Rp Gp Bp Xp
+
+				//
+				// BGR out  3x16 = 48 bytes (4 pixels at 3 bytes each = 12 bytes)
+				//
+				// out_vec[0]    Bf Re Ge Be Rd Gd Bd Rc Gc Bc Rb Gb Bb Ra Ga Ba
+				//                6  0  1  2 12 13 14  8  9 10  4  5  6  0  1  2
+				//                 in_vec[1] |            in_vec[0]            |
+				// out_vec[1]    Gk Bk Rj Gj Bj Ri Gi Bi Rh Gh Bh Rg Gg Bg Rf Gf
+				//                9 10  4  5  6  0  1  2 12 13 14  8  9 10  4  5
+				//                        in_vec[2]       |     in_vec[1]
+				// out_vec[2]    Rp Gp Bp Ro Go Bo Rn Gn Bn Rm Gm Bm Rl Gl Bl Rk
+				//               12 13 14  8  9 10  4  5  6  0  1  2 12 13 14  8  
+				//                        in_vec[3]                   | in_vec[2]
+
+				// First 16 BGR bytes
+				// out_vec[0]    Rf Be Ge Re Bd Gd Rd Bc Gc Rc Bb Gb Rb Ba Ga Ra
+				//                6  0  1  2 12 13 14  8  9 10  4  5  6  0  1  2
+				//                 in_vec[1] |            in_vec[0]            |
+				out0 = _mm_shuffle_epi8(in0,
+					_mm_set_epi8('\xff', '\xff', '\xff', '\xff', 12, 13, 14, 8, 9, 10, 4, 5, 6, 0, 1, 2));
+				out1 = _mm_shuffle_epi8(in1,
+					_mm_set_epi8(6, 0, 1, 2, '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff'));
+				out_vec[0] = _mm_or_si128(out0, out1);
+
+				// Second 16 BGR bytes
+				// out_vec[1]    Gk Rk Bj Gj Rj Bi Gi Ri Bh Gh Rh Bg Gg Rg Bf Gf
+				//                9 10  4  5  6  0  1  2 12 13 14  8  9 10  4  5
+				//                        in_vec[2]      |     in_vec[1]
+				out0 = _mm_shuffle_epi8(in1,
+					_mm_set_epi8('\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', 12, 13, 14, 8, 9, 10, 4, 5));
+				out1 = _mm_shuffle_epi8(in2,
+					_mm_set_epi8(9, 10, 4, 5, 6, 0, 1, 2, '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff'));
+				out_vec[1] = _mm_or_si128(out0, out1);
+
+				// Third 16 BGR bytes
+				// out_vec[2]    Bp Gp Rp Bo Go Ro Bn Gn Rn Bm Gm Rm Bl Gl Rl Bk
+				//               12 13 14  8  9 10  4  5  6  0  1  2 12 13 14  8  
+				//                        in_vec[3]                  | in_vec[2]
+				out0 = _mm_shuffle_epi8(in2,
+					_mm_set_epi8('\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', 12, 13, 14, 8));
+				out1 = _mm_shuffle_epi8(in3,
+					_mm_set_epi8(12, 13, 14, 8, 9, 10, 4, 5, 6, 0, 1, 2, '\xff', '\xff', '\xff', '\xff'));
+				out_vec[2] = _mm_or_si128(out0, out1);
+
+			} // end RGBA > BGR
+
+			in_vec  += 4; // RGBA 4x16
+			out_vec += 3; // RGB  3x16
+
+		} // done the line
+
+		// Reset width
+		w = width/16;
+
+		// Allow for padding at the end of the rgba source line
+		in_vec += rgba_padding/16;
+
+		// move up a line for invert
+		if (bInvert) {
+			out_vec -= w*3*2;
+		}
+
+	}
+
+} // end rgba_to_rgb_sse
+
+
 //---------------------------------------------------------
 // Function: bgr2bgra
 //
@@ -1040,7 +1287,7 @@ void spoutCopy::rgba2rgbResample(const void* source, void* dest,
 
 			if (bMirror) {
 				if (bInvert)
-					pixel = (destHeight - i - 1)*destWidth * 3 + (destWidth - j - 1) * 3; // flip vertically
+					pixel = (destHeight - i - 1)*destWidth * 3 + (destWidth - j - 1) * 3; // flip horizontally
 				else
 					pixel = i * destWidth * 3 + (destWidth - j - 1) * 3;
 			}