ARM64 constant-folding support

Signed-off-by: Richard Chapman <[email protected]>
hpcc-systems · Jul 24, 2024 · 867d69a · 867d69a
1 parent 4b3d714
commit 867d69a
Show file tree

Hide file tree

Showing 5 changed files with 149 additions and 15 deletions.
diff --git a/cmake_modules/vcpkg.cmake b/cmake_modules/vcpkg.cmake
@@ -12,8 +12,8 @@ if(WIN32)
     set(VCPKG_HOST_TRIPLET "x64-windows" CACHE STRING "host triplet")
     set(VCPKG_TARGET_TRIPLET "x64-windows" CACHE STRING "target triplet")
 elseif(APPLE)
-    set(VCPKG_HOST_TRIPLET "x64-osx" CACHE STRING "host triplet")
-    set(VCPKG_TARGET_TRIPLET "x64-osx" CACHE STRING "target triplet")
+    set(VCPKG_HOST_TRIPLET "arm64-osx" CACHE STRING "host triplet")
+    set(VCPKG_TARGET_TRIPLET "arm64-osx" CACHE STRING "target triplet")
 elseif(UNIX)
     set(VCPKG_HOST_TRIPLET "x64-linux-dynamic" CACHE STRING "host triplet")
     set(VCPKG_TARGET_TRIPLET "x64-linux-dynamic" CACHE STRING "target triplet")

diff --git a/common/thorhelper/thorcommon.hpp b/common/thorhelper/thorcommon.hpp
@@ -362,8 +362,9 @@ class BlockedActivityTimer
     }
 };
 #else
-struct ActivityTimer
+class ActivityTimer
 {
+public:
     inline ActivityTimer(ActivityTimeAccumulator &_accumulator, const bool _enabled) { }
 };
 struct SimpleActivityTimer

diff --git a/ecl/hql/hqlfold.cpp b/ecl/hql/hqlfold.cpp
@@ -1187,7 +1187,7 @@ IValue * doFoldExternalCall(IHqlExpression* expr, unsigned foldOptions, const ch
            );
         }
   #endif
-        assertex((len & 7) == 4);  // We need to make sure we add an ODD number of words to stack, so that it gets 8-byte aligned once pc is pushed by the call
+        assertex((len & 7) == 4);  // We need to make sure we add an ODD number of words to stack, so that it gets 8-byte aligned once pc is pushed by the call
         register unsigned _intresult asm("r0");                       // Specific register for result
         register unsigned _intresulthigh asm("r1");                   // Specific register for result
         register unsigned _poplen asm("r4") = len-REGPARAMS*REGSIZE;  // Needs to survive the call
@@ -1248,7 +1248,142 @@ IValue * doFoldExternalCall(IHqlExpression* expr, unsigned foldOptions, const ch
         }
  #elif defined(_ARCH_ARM64_)
         // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
-        UNIMPLEMENTED;
+  #ifdef MAXFPREGS
+        void * floatstack = fstack.getFloatMem();
+        if (floatstack) {
+            unsigned * floatSizes = fstack.getFloatSizes();
+           __asm__ __volatile__ (
+           ".doparm0: \n\t"
+               "ldr  w0,[%[sizes],#0] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d0,[%[vals], #0] \n\t"
+               "b 1f \n\t"
+           "0: \n\t"
+               "ldr  s0,[%[vals], #0] \n\t"
+           "1: \n\t"
+               "ldr  w0,[%[sizes],#4] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d1,[%[vals], #8] \n\t"
+               "b 1f \n\t"
+           "0: \n\t"
+               "ldr  s1,[%[vals], #8] \n\t"
+           "1: \n\t"
+               "ldr  w0,[%[sizes],#8] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d2,[%[vals], #16] \n\t"
+               "b 1f \n\t"
+           "0: \n\t"
+               "ldr  s2,[%[vals], #16] \n\t"
+           "1: \n\t"
+               "ldr  w0,[%[sizes],#12] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d3,[%[vals], #24] \n\t"
+               "b 1f \n\t"
+           "0: \n\t"
+               "ldr  s3,[%[vals], #24] \n\t"
+           "1: \n\t"
+               "ldr  w0,[%[sizes],#16] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d4,[%[vals], #32] \n\t"
+               "b 1f \n\t"
+           "0: \n\t"
+               "ldr  s4,[%[vals], #32] \n\t"
+           "1: \n\t"
+               "ldr  w0,[%[sizes],#20] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d5,[%[vals], #40] \n\t"
+               "b 1f \n\t"
+           "0: \n\t"
+               "ldr  s5, [%[vals], #40] \n\t"
+           "1: \n\t"
+               "ldr  w0,[%[sizes],#24] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d6,[%[vals], #48] \n\t"
+               "b 1f \n\t"
+           "0: \n\t"
+               "ldr  s6,[%[vals], #48] \n\t"
+           "1: \n\t"
+               "ldr  w0,[%[sizes],#28] \n\t"
+               "cmp  w0, #4 \n\t"
+               "blt  9f \n\t"
+               "beq  0f \n\t"
+               "ldr  d7,[%[vals], #56] \n\t"
+               "b 9f \n\t"
+           "0: \n\t"
+               "ldr  s7,[%[vals], #56] \n\t"
+           "9: \n\t"
+               "nop \n\t"
+            :
+            : [vals] "r"(floatstack), [sizes] "r"(floatSizes)
+            : "r0"
+           );
+        }
+  #endif
+        assertex((len & 15) == 0);                                              // Stack must always be 16-byte aligned
+        register unsigned __int64 _int64result asm("x0");                       // Specific register for result
+        register unsigned __int64 _len asm("x1") = len;
+        register unsigned __int64 _poplen asm("x19") = len-REGPARAMS*REGSIZE;   // Needs to survive the call
+        register void *_fh asm("x8") = fh;                                      // Needs to survive until the call
+        __asm__ __volatile__ (
+            "sub sp, sp, %[_len] \n\t"        // Make space on stack
+            "mov x2, sp \n\t"                 // x2 = destination for loop
+            ".repLoop: \n\t"
+            "ldrb w3, [%[strbuf]], #1 \n\t"   // copy a byte from src array to w3
+            "strb w3, [x2], #1 \n\t"          // and then from w3 onto stack
+            "subs %[_len], %[_len], #1 \n\t"  // decrement and repeat
+            "bne .repLoop \n\t"
+            "ldp x0, x1, [sp] \n\t"
+            "ldp x2, x3, [sp, #16] \n\t"
+            "ldp x4, x5, [sp, #32] \n\t"
+            "ldp x6, x7, [sp, #48] \n\t"
+            "add sp, sp, #64 \n\t"            // first 8 parameters go in registers
+            "blr %[fh] \n\t"                  // make the call
+            "add sp, sp, %[_poplen] \n\t"     // Restore stack pointer (note, have already popped 8 registers, so poplen is len - 64)
+            : "=r"(_int64result)
+            : [_len] "r"(_len), [_poplen] "r"(_poplen), [strbuf] "r"(strbuf), [fh] "r"(_fh)
+            : "x2","x3","x4","x5","x6","x7","lr"                  // function we call may corrupt lr
+            );
+        int64result = _int64result;
+        if (isRealvalue)
+        {
+  #ifdef MAXFPREGS
+            if(resultsize <= 4)
+            {
+                __asm__  __volatile__(
+                    "str  s0,[%[fresult]] \n\t"
+                    :"=m"(floatresult)
+                    : [fresult] "r"(&(floatresult))
+                );
+            }
+            else
+            {
+                __asm__  __volatile__(
+                    "str  d0,[%[fresult]] \n\t"
+                    :"=m"(doubleresult)
+                    : [fresult] "r"(&(doubleresult))
+                );
+            }
+  #else
+            if(resultsize <= 4)
+                floatresult = *(float*)&intresult;
+            else
+                doubleresult = *(double*)&intresult;
+  #endif
+        }
  #else
         // Unknown architecture
         UNIMPLEMENTED;

diff --git a/ecl/hql/hqlstack.hpp b/ecl/hql/hqlstack.hpp
@@ -58,14 +58,8 @@
  #define ALIGN_USES_ELEMENTSIZE
  #define REGSIZE 8
  #define REGPARAMS 8
- #define ODD_STACK_ALIGNMENT
- #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) \
-     && defined(__ARM_EABI__) && !defined(__ARM_PCS_VFP) && !defined(__ARM_PCS)
-  #error "Can't identify floating point calling conventions.\nPlease ensure that your toolchain defines __ARM_PCS or __ARM_PCS_VFP."
- #endif
- #if defined(__ARM_PCS_VFP)
-  #define MAXFPREGS 8 // d0-d7
- #endif
+ #define MAXFPREGS 8
+ #define EVEN_STACK_ALIGNMENT
 #elif defined (_ARCH_ARM32_)
  #define ALIGNMENT 4
  #define ALIGN_USES_ELEMENTSIZE

diff --git a/vcpkg-osx.code-workspace b/vcpkg-osx.code-workspace
@@ -7,8 +7,9 @@
 	"settings": {
 		"cmake.configureArgs": [
 			"-DCONTAINERIZED=OFF",
+            "-DCMAKE_OSX_ARCHITECTURES=arm64",
 			"-DUSE_OPTIONAL=OFF",
-			"-DINCLUDE_PLUGINS=ON",
+			"-DINCLUDE_PLUGINS=OFF",
 			"-DSUPPRESS_V8EMBED=ON",
 			"-DSUPPRESS_REMBED=ON",
 			"-DSUPPRESS_MEMCACHED=ON",
@@ -18,6 +19,9 @@
 			"-DUSE_CPPUNIT=ON",
 			"-DUSE_JAVA=OFF"
 		],
+		"files.associations": {
+			"*.ipp": "cpp"
+		},
 	},
 	"extensions": {
 		"recommendations": [
@@ -27,4 +31,4 @@
 			"twxs.cmake",
 		]
 	}
-}
+}