From 53fb3d313d1f044360ed52de2f3b34306fb90e02 Mon Sep 17 00:00:00 2001 From: Anthony Roberts Date: Thu, 7 Nov 2024 09:17:53 +0000 Subject: [PATCH 1/4] Add support for Windows ARM64 Signed-off-by: Anthony Roberts --- CMakeLists.txt | 8 +++++--- share/cmake/modules/install/Installsse2neon.cmake | 8 +++++--- share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake | 10 ++++++++-- share/cmake/utils/CompilerFlags.cmake | 3 +++ src/OpenColorIO/CPUInfo.cpp | 2 +- src/OpenColorIO/CPUInfoConfig.h.in | 4 ++-- src/OpenColorIO/SSE.h | 6 +++--- src/OpenColorIO/SSE2.h | 6 +++--- 8 files changed, 30 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b20aa0430..f4c2a22f7a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -270,7 +270,7 @@ option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizatio option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX}) option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C}) -if (APPLE) +if (APPLE OR WIN32) # TODO: Revisit whether that option is necessary. option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations using SSE2NEON for Apple ARM architecture" ON) mark_as_advanced(OCIO_USE_SSE2NEON) @@ -332,8 +332,10 @@ if(OCIO_USE_SIMD AND OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) add_library(sse2neon INTERFACE) # Add the include directories to the target. target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") - # Ignore the warnings coming from sse2neon.h as they are false positives. - target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) + if(NOT MSVC) + # Ignore the warnings coming from sse2neon.h as they are false positives. + target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) + endif() endif() endif() diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake index 47877436a6..ced4ae139c 100644 --- a/share/cmake/modules/install/Installsse2neon.cmake +++ b/share/cmake/modules/install/Installsse2neon.cmake @@ -16,7 +16,7 @@ include(FetchContent) set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon") FetchContent_Declare(sse2neon GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git - GIT_TAG v1.6.0 + GIT_TAG 227cc413fb2d50b2a10073087be96b59d5364aea ) # FetchContent_MakeAvailable is not available until CMake 3.14+. @@ -38,6 +38,8 @@ if(NOT sse2neon_POPULATED) add_library(sse2neon INTERFACE) # Add the include directories to the target. target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") - # Ignore the warnings coming from sse2neon.h as they are false positives. - target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) + if(NOT MSVC) + # Ignore the warnings coming from sse2neon.h as they are false positives. + target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) + endif() endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake index c47c8be701..d24cda55f5 100644 --- a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake +++ b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake @@ -6,8 +6,13 @@ include(CheckCXXSourceCompiles) set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}") set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND COMPILER_SUPPORTS_ARM_NEON) +if(MSVC) + set(CMAKE_CXX_FLAGS "/Zc:preprocessor") +endif() + +if((APPLE OR WIN32) AND COMPILER_SUPPORTS_ARM_NEON) if("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") @@ -63,8 +68,9 @@ endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}") set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") unset(_cmake_required_flags_orig) unset(_cmake_required_includes_orig) unset(_cmake_osx_architectures_orig) - +unset(_cmake_cxx_flags_orig) diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index d18d7fe7cc..5f2d27366e 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -19,6 +19,9 @@ if(OCIO_USE_SIMD) if (OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) include(CheckSupportSSEUsingSSE2NEON) if(NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) + # Enable the "new" preprocessor, to more closely match Clang/GCC, required for sse2neon + set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/Zc:preprocessor") + else() set(OCIO_USE_SSE2NEON OFF) endif() endif() diff --git a/src/OpenColorIO/CPUInfo.cpp b/src/OpenColorIO/CPUInfo.cpp index dce813a4ff..59fdd1cadc 100644 --- a/src/OpenColorIO/CPUInfo.cpp +++ b/src/OpenColorIO/CPUInfo.cpp @@ -183,7 +183,7 @@ CPUInfo::CPUInfo() } } -#elif defined(__aarch64__) // ARM Processor or Apple ARM. +#elif defined(__aarch64__) || defined(_M_ARM64) // ARM Processor or Apple ARM. CPUInfo::CPUInfo() { diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in index b8f5045d29..c105d4159d 100644 --- a/src/OpenColorIO/CPUInfoConfig.h.in +++ b/src/OpenColorIO/CPUInfoConfig.h.in @@ -6,7 +6,7 @@ #cmakedefine01 OCIO_ARCH_X86_32 // Relevant only for arm64 architecture. -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #cmakedefine01 OCIO_USE_SSE2NEON #else #define OCIO_USE_SSE2NEON 0 @@ -23,7 +23,7 @@ // Building for x86_64 processor on a non-ARM host architecture // OR Building on/for an ARM architecture and using SSE2NEON. -#if (OCIO_ARCH_X86 && !defined(__aarch64__)) || (defined(__aarch64__) && OCIO_USE_SSE2NEON) +#if (OCIO_ARCH_X86 && !defined(__aarch64__)) || ((defined(__aarch64__) || defined(_M_ARM64)) && OCIO_USE_SSE2NEON) #cmakedefine01 OCIO_USE_SSE2 #cmakedefine01 OCIO_USE_SSE3 #cmakedefine01 OCIO_USE_SSSE3 diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h index 2494698c57..ac60657fd5 100644 --- a/src/OpenColorIO/SSE.h +++ b/src/OpenColorIO/SSE.h @@ -9,11 +9,11 @@ #if OCIO_USE_SSE2 // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) #if OCIO_USE_SSE2 #include #endif -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) // ARM architecture A64 (ARM64) #if OCIO_USE_SSE2NEON #include @@ -30,7 +30,7 @@ namespace OCIO_NAMESPACE // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since // it is redefining two of the functions from sse2neon. -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #if OCIO_USE_SSE2NEON // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 918694fc86..856134a2c1 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -9,9 +9,9 @@ #if OCIO_USE_SSE2 // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) #include -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) // ARM architecture A64 (ARM64) #if OCIO_USE_SSE2NEON #include @@ -30,7 +30,7 @@ namespace OCIO_NAMESPACE // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since // it is redefining two of the functions from sse2neon. -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #if OCIO_USE_SSE2NEON // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. From 79c0fc844cbd8efb21b8a50e4ac0ba08a3a02af8 Mon Sep 17 00:00:00 2001 From: Anthony Roberts Date: Fri, 8 Nov 2024 13:19:33 +0000 Subject: [PATCH 2/4] Fix improper compiler flag check Signed-off-by: Anthony Roberts --- share/cmake/utils/CompilerFlags.cmake | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index 5f2d27366e..1557091d0e 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -18,9 +18,11 @@ if(OCIO_USE_SIMD) if (OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) include(CheckSupportSSEUsingSSE2NEON) - if(NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) - # Enable the "new" preprocessor, to more closely match Clang/GCC, required for sse2neon - set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/Zc:preprocessor") + if(COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) + if(WIN32) + # Enable the "new" preprocessor, to more closely match Clang/GCC, required for sse2neon + set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/Zc:preprocessor") + endif() else() set(OCIO_USE_SSE2NEON OFF) endif() From 9977dea47bdf2b0bc2d5fbb2c87cc6d6120f97b6 Mon Sep 17 00:00:00 2001 From: Anthony Roberts Date: Fri, 8 Nov 2024 15:53:10 +0000 Subject: [PATCH 3/4] Fix sse2neon issues on Windows ARM64 Signed-off-by: Anthony Roberts --- CMakeLists.txt | 5 ++++ src/OpenColorIO/SSE.h | 17 ++++++++++++- src/OpenColorIO/SSE2.h | 25 ++++++++++++++++++- .../ops/fixedfunction/FixedFunctionOpCPU.cpp | 2 +- 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f4c2a22f7a..68b37183e5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -189,6 +189,11 @@ if (NOT APPLE) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + set(OCIO_ARCH_X86 0) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) else() set(OCIO_ARCH_X86 0) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE OFF) diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h index ac60657fd5..6aebc45d4b 100644 --- a/src/OpenColorIO/SSE.h +++ b/src/OpenColorIO/SSE.h @@ -16,7 +16,18 @@ #elif defined(__aarch64__) || defined(_M_ARM64) // ARM architecture A64 (ARM64) #if OCIO_USE_SSE2NEON + // MSVC doesn't like the redefinitions below and requires the existing functions to be undef-ed + #if defined(_M_ARM64) + #define _mm_max_ps _mm_max_ps_orig + #define _mm_min_ps _mm_min_ps_orig + #endif + #include + + #if defined(_M_ARM64) + #undef _mm_max_ps + #undef _mm_min_ps + #endif #endif #endif @@ -77,6 +88,9 @@ static const __m128 EPOS128 = _mm_set1_ps(128.0f); static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits::infinity()); +// These funtions won't work when using MSVC + ARM64 unless you specify /Zc:arm64-aliased-neon-types- +// This comes with it's own issues, so it is easier to just disable them when using MSVC + ARM64 +#if !defined(_M_ARM64) // Debug function to print out the contents of a floating-point SSE register inline void ssePrintRegister(const char* msg, __m128& reg) { @@ -91,6 +105,7 @@ inline void ssePrintRegister(const char* msg, __m128i& reg) int *r = (int*) ® printf("%s : %d %d %d %d\n", msg, r[0], r[1], r[2], r[3]); } +#endif // Determine whether a floating-point value is negative based on its sign bit. // This function will treat special values, like -0, -NaN, -Inf, as they were indeed @@ -170,7 +185,7 @@ inline __m128 sseLog2(__m128 x) { // y = log2( x ) = log2( 2^exponent * mantissa ) // = exponent + log2( mantissa ) - + __m128 mantissa = _mm_or_ps( // OR with EONE _mm_andnot_ps( // NOT(EMASK) AND x diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 856134a2c1..e51dad9b50 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -14,7 +14,30 @@ #elif defined(__aarch64__) || defined(_M_ARM64) // ARM architecture A64 (ARM64) #if OCIO_USE_SSE2NEON + // MSVC doesn't like the redefinitions below and requires the existing functions to be undef-ed + #if defined(_M_ARM64) + #define _mm_max_ps _mm_max_ps_orig + #define _mm_min_ps _mm_min_ps_orig + #endif + #include + + #if defined(_M_ARM64) + #undef _mm_max_ps + #undef _mm_min_ps + #endif + + // Current versions of MSVC do not define float16_t, so we do it ourselves using + // int16_t as an intermediate type + #if defined(_M_ARM64) && !defined(float16_t) + #define float16_t int16_t + #endif + + // Current versions of MSVC do not define vst1q_f16, so we do it ourselves using + // internal methods from MSVC's arm_neon.h + #if defined(_M_ARM64) && !defined(vst1q_f16) + #define vst1q_f16(A, B) neon_st1m_q16((A), __float16x8_t_to_n128(B)); + #endif #endif #endif @@ -321,7 +344,7 @@ struct SSE2RGBAPack sse2RGBATranspose_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); #if OCIO_USE_SSE2NEON - // use neon hardware support for f32 to f16 + // use neon hardware support for f32 to f16 (apart from in MSVC, which doesnt support it) float16x8_t rgba; float16x4_t rgba00_01 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba0)); float16x4_t rgba03_03 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba1)); diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp index 96adff44b6..f7554e1081 100644 --- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp +++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp @@ -1844,7 +1844,7 @@ __m128 Renderer_LIN_TO_PQ_SSE::myPower(__m128 x, __m128 exp) return ssePower(x, exp); } -#ifdef _WIN32 +#if (_MSC_VER >= 1920) && (OCIO_USE_AVX) // Only Windows compilers have built-in _mm_pow_ps() SVML intrinsic // implementation, so non-fast SIMD version is available only on Windows for // now. From 04a723a9261bcd44e5b169f7ac10b8bd98a5043f Mon Sep 17 00:00:00 2001 From: Anthony Roberts Date: Wed, 20 Nov 2024 11:59:24 +0000 Subject: [PATCH 4/4] Fix cross-compilation on Windows for X64 -> ARM64 Signed-off-by: Anthony Roberts --- CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68b37183e5..8f33d3a3fd 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,7 +180,12 @@ option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ociocon if (NOT APPLE) - if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|x86_64|X86|i386|i686)") + if("${CMAKE_GENERATOR_PLATFORM}" MATCHES "(ARM64|arm64)" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + set(OCIO_ARCH_X86 0) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) + elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|x86_64|X86|i386|i686)") # Intel-based architecture (not APPLE) if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(X86|i386|i686)") set(OCIO_ARCH_X86_32 1) @@ -189,11 +194,6 @@ if (NOT APPLE) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) - elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") - set(OCIO_ARCH_X86 0) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) else() set(OCIO_ARCH_X86 0) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE OFF)