From 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Mon, 10 Apr 2023 14:35:37 +0200
Subject: [PATCH] Fix Kepler/Maxwell support for old kernel.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Torbjörn Lönnemark <tobbez@ryara.net>
Co-authored-by: jllllll <jacob201ac@yahoo.com>
---
 quant_cuda_old_kernel.cu | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
diff --git a/quant_cuda_old_kernel.cu b/quant_cuda_old_kernel.cu
index e1c302ec..23232187 100644
--- a/quant_cuda_old_kernel.cu
+++ b/quant_cuda_old_kernel.cu
@@ -3,6 +3,32 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+// atomicAdd for double-precision floating-point numbers on hardware with
+// compute capability < 6.0 from:
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+__device__ double atomicAdd(
+    double* address,
+    double val
+) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(
+      address_as_ull,
+      assumed,
+      __double_as_longlong(val + __longlong_as_double(assumed))
+    );
+
+  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+#endif
+
 template <typename scalar_t>
 __global__ void VecQuant2MatMulKernel(
     const  scalar_t* __restrict__ vec,