From 867d69a670289f589f118ca64a76400d8aadb73c Mon Sep 17 00:00:00 2001 From: Richard Chapman Date: Wed, 24 Jul 2024 16:29:42 +0100 Subject: [PATCH] ARM64 constant-folding support Signed-off-by: Richard Chapman --- cmake_modules/vcpkg.cmake | 4 +- common/thorhelper/thorcommon.hpp | 3 +- ecl/hql/hqlfold.cpp | 139 ++++++++++++++++++++++++++++++- ecl/hql/hqlstack.hpp | 10 +-- vcpkg-osx.code-workspace | 8 +- 5 files changed, 149 insertions(+), 15 deletions(-) diff --git a/cmake_modules/vcpkg.cmake b/cmake_modules/vcpkg.cmake index d79e28dff18..59b4e01cdf9 100644 --- a/cmake_modules/vcpkg.cmake +++ b/cmake_modules/vcpkg.cmake @@ -12,8 +12,8 @@ if(WIN32) set(VCPKG_HOST_TRIPLET "x64-windows" CACHE STRING "host triplet") set(VCPKG_TARGET_TRIPLET "x64-windows" CACHE STRING "target triplet") elseif(APPLE) - set(VCPKG_HOST_TRIPLET "x64-osx" CACHE STRING "host triplet") - set(VCPKG_TARGET_TRIPLET "x64-osx" CACHE STRING "target triplet") + set(VCPKG_HOST_TRIPLET "arm64-osx" CACHE STRING "host triplet") + set(VCPKG_TARGET_TRIPLET "arm64-osx" CACHE STRING "target triplet") elseif(UNIX) set(VCPKG_HOST_TRIPLET "x64-linux-dynamic" CACHE STRING "host triplet") set(VCPKG_TARGET_TRIPLET "x64-linux-dynamic" CACHE STRING "target triplet") diff --git a/common/thorhelper/thorcommon.hpp b/common/thorhelper/thorcommon.hpp index c9d803e2061..baab17d66be 100644 --- a/common/thorhelper/thorcommon.hpp +++ b/common/thorhelper/thorcommon.hpp @@ -362,8 +362,9 @@ class BlockedActivityTimer } }; #else -struct ActivityTimer +class ActivityTimer { +public: inline ActivityTimer(ActivityTimeAccumulator &_accumulator, const bool _enabled) { } }; struct SimpleActivityTimer diff --git a/ecl/hql/hqlfold.cpp b/ecl/hql/hqlfold.cpp index 4a0226cbcfd..b4add92d82e 100644 --- a/ecl/hql/hqlfold.cpp +++ b/ecl/hql/hqlfold.cpp @@ -1187,7 +1187,7 @@ IValue * doFoldExternalCall(IHqlExpression* expr, unsigned foldOptions, const ch ); } #endif - assertex((len & 7) == 4); // We need to make sure we add an ODD number of 
words to stack, so that it gets 8-byte aligned once pc is pushed by the call + assertex((len & 7) == 4); // We need to make sure we add an ODD number of words to stack, so that it gets 8-byte aligned once pc is pushed by the call register unsigned _intresult asm("r0"); // Specific register for result register unsigned _intresulthigh asm("r1"); // Specific register for result register unsigned _poplen asm("r4") = len-REGPARAMS*REGSIZE; // Needs to survive the call @@ -1248,7 +1248,142 @@ IValue * doFoldExternalCall(IHqlExpression* expr, unsigned foldOptions, const ch } #elif defined(_ARCH_ARM64_) // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf - UNIMPLEMENTED; + #ifdef MAXFPREGS + void * floatstack = fstack.getFloatMem(); + if (floatstack) { + unsigned * floatSizes = fstack.getFloatSizes(); + __asm__ __volatile__ ( + ".doparm0: \n\t" + "ldr w0,[%[sizes],#0] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d0,[%[vals], #0] \n\t" + "b 1f \n\t" + "0: \n\t" + "ldr s0,[%[vals], #0] \n\t" + "1: \n\t" + "ldr w0,[%[sizes],#4] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d1,[%[vals], #8] \n\t" + "b 1f \n\t" + "0: \n\t" + "ldr s1,[%[vals], #8] \n\t" + "1: \n\t" + "ldr w0,[%[sizes],#8] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d2,[%[vals], #16] \n\t" + "b 1f \n\t" + "0: \n\t" + "ldr s2,[%[vals], #16] \n\t" + "1: \n\t" + "ldr w0,[%[sizes],#12] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d3,[%[vals], #24] \n\t" + "b 1f \n\t" + "0: \n\t" + "ldr s3,[%[vals], #24] \n\t" + "1: \n\t" + "ldr w0,[%[sizes],#16] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d4,[%[vals], #32] \n\t" + "b 1f \n\t" + "0: \n\t" + "ldr s4,[%[vals], #32] \n\t" + "1: \n\t" + "ldr w0,[%[sizes],#20] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d5,[%[vals], #40] \n\t" + "b 1f \n\t" + "0: \n\t" + "ldr s5, [%[vals], #40] \n\t" + "1: \n\t" + "ldr 
w0,[%[sizes],#24] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d6,[%[vals], #48] \n\t" + "b 1f \n\t" + "0: \n\t" + "ldr s6,[%[vals], #48] \n\t" + "1: \n\t" + "ldr w0,[%[sizes],#28] \n\t" + "cmp w0, #4 \n\t" + "blt 9f \n\t" + "beq 0f \n\t" + "ldr d7,[%[vals], #56] \n\t" + "b 9f \n\t" + "0: \n\t" + "ldr s7,[%[vals], #56] \n\t" + "9: \n\t" + "nop \n\t" + : + : [vals] "r"(floatstack), [sizes] "r"(floatSizes) + : "r0" + ); + } + #endif + assertex((len & 15) == 0); // Stack must always be 16-byte aligned + register unsigned __int64 _int64result asm("x0"); // Specific register for result + register unsigned __int64 _len asm("x1") = len; + register unsigned __int64 _poplen asm("x19") = len-REGPARAMS*REGSIZE; // Needs to survive the call + register void *_fh asm("x8") = fh; // Needs to survive until the call + __asm__ __volatile__ ( + "sub sp, sp, %[_len] \n\t" // Make space on stack + "mov x2, sp \n\t" // x2 = destination for loop + ".repLoop: \n\t" + "ldrb w3, [%[strbuf]], #1 \n\t" // copy a byte from src array to w3 + "strb w3, [x2], #1 \n\t" // and then from w3 onto stack + "subs %[_len], %[_len], #1 \n\t" // decrement and repeat + "bne .repLoop \n\t" + "ldp x0, x1, [sp] \n\t" + "ldp x2, x3, [sp, #16] \n\t" + "ldp x4, x5, [sp, #32] \n\t" + "ldp x6, x7, [sp, #48] \n\t" + "add sp, sp, #64 \n\t" // first 8 parameters go in registers + "blr %[fh] \n\t" // make the call + "add sp, sp, %[_poplen] \n\t" // Restore stack pointer (note, have already popped 8 registers, so poplen is len - 64) + : "=r"(_int64result) + : [_len] "r"(_len), [_poplen] "r"(_poplen), [strbuf] "r"(strbuf), [fh] "r"(_fh) + : "x2","x3","x4","x5","x6","x7","lr" // function we call may corrupt lr + ); + int64result = _int64result; + if (isRealvalue) + { + #ifdef MAXFPREGS + if(resultsize <= 4) + { + __asm__ __volatile__( + "str s0,[%[fresult]] \n\t" + :"=m"(floatresult) + : [fresult] "r"(&(floatresult)) + ); + } + else + { + __asm__ __volatile__( + "str d0,[%[fresult]] \n\t" + 
:"=m"(doubleresult) + : [fresult] "r"(&(doubleresult)) + ); + } + #else + if(resultsize <= 4) + floatresult = *(float*)&intresult; + else + doubleresult = *(double*)&intresult; + #endif + } #else // Unknown architecture UNIMPLEMENTED; diff --git a/ecl/hql/hqlstack.hpp b/ecl/hql/hqlstack.hpp index 5b95ac3e7df..54066b3dc9c 100644 --- a/ecl/hql/hqlstack.hpp +++ b/ecl/hql/hqlstack.hpp @@ -58,14 +58,8 @@ #define ALIGN_USES_ELEMENTSIZE #define REGSIZE 8 #define REGPARAMS 8 - #define ODD_STACK_ALIGNMENT - #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) \ - && defined(__ARM_EABI__) && !defined(__ARM_PCS_VFP) && !defined(__ARM_PCS) - #error "Can't identify floating point calling conventions.\nPlease ensure that your toolchain defines __ARM_PCS or __ARM_PCS_VFP." - #endif - #if defined(__ARM_PCS_VFP) - #define MAXFPREGS 8 // d0-d7 - #endif + #define MAXFPREGS 8 + #define EVEN_STACK_ALIGNMENT #elif defined (_ARCH_ARM32_) #define ALIGNMENT 4 #define ALIGN_USES_ELEMENTSIZE diff --git a/vcpkg-osx.code-workspace b/vcpkg-osx.code-workspace index d25c0390fe3..38ee697b857 100644 --- a/vcpkg-osx.code-workspace +++ b/vcpkg-osx.code-workspace @@ -7,8 +7,9 @@ "settings": { "cmake.configureArgs": [ "-DCONTAINERIZED=OFF", + "-DCMAKE_OSX_ARCHITECTURES=arm64", "-DUSE_OPTIONAL=OFF", - "-DINCLUDE_PLUGINS=ON", + "-DINCLUDE_PLUGINS=OFF", "-DSUPPRESS_V8EMBED=ON", "-DSUPPRESS_REMBED=ON", "-DSUPPRESS_MEMCACHED=ON", @@ -18,6 +19,9 @@ "-DUSE_CPPUNIT=ON", "-DUSE_JAVA=OFF" ], + "files.associations": { + "*.ipp": "cpp" + }, }, "extensions": { "recommendations": [ @@ -27,4 +31,4 @@ "twxs.cmake", ] } -} \ No newline at end of file +}