cudaCompat.h (forked from cms-patatrack/pixeltrack-standalone)
#ifndef HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
#define HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h

/*
 * Everything you need to run CUDA code in plain sequential C++ code
 */

#ifndef __CUDACC__

#include <algorithm>
#include <cstdint>
#include <cstring>

#include <cuda_runtime.h>

namespace cudaCompat {

#ifndef __CUDA_RUNTIME_H__
  struct dim3 {
    uint32_t x, y, z;
  };
#endif
  const dim3 threadIdx = {0, 0, 0};
  const dim3 blockDim = {1, 1, 1};

  extern thread_local dim3 blockIdx;
  extern thread_local dim3 gridDim;

  // single-threaded stand-ins for the CUDA atomic intrinsics: plain read-modify-write
  template <typename T1, typename T2>
  T1 atomicInc(T1* a, T2 b) {
    auto ret = *a;
    if ((*a) < T1(b))
      (*a)++;
    return ret;
  }

  template <typename T1, typename T2>
  T1 atomicAdd(T1* a, T2 b) {
    auto ret = *a;
    (*a) += b;
    return ret;
  }

  template <typename T1, typename T2>
  T1 atomicSub(T1* a, T2 b) {
    auto ret = *a;
    (*a) -= b;
    return ret;
  }

  template <typename T1, typename T2>
  T1 atomicMin(T1* a, T2 b) {
    auto ret = *a;
    *a = std::min(*a, T1(b));
    return ret;
  }

  template <typename T1, typename T2>
  T1 atomicMax(T1* a, T2 b) {
    auto ret = *a;
    *a = std::max(*a, T1(b));
    return ret;
  }

  // synchronisation and cache hints become no-ops in sequential code
  inline void __syncthreads() {}
  inline void __threadfence() {}
  inline bool __syncthreads_or(bool x) { return x; }
  inline bool __syncthreads_and(bool x) { return x; }
  template <typename T>
  inline T __ldg(T const* x) {
    return *x;
  }

  // reset the per-thread grid state before the next sequential "launch"
  inline void resetGrid() {
    blockIdx = {0, 0, 0};
    gridDim = {1, 1, 1};
  }

}  // namespace cudaCompat

// some of these are not needed when the CUDA runtime headers already define them
#ifndef __CUDA_RUNTIME_H__
#define __host__
#define __device__
#define __global__
#define __shared__
#define __forceinline__
#endif

// make sure the functions are inlined to avoid multiple definitions
#ifndef __CUDA_ARCH__
#undef __global__
#define __global__ inline __attribute__((always_inline))
#undef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))
#endif

#ifndef __CUDA_ARCH__
using namespace cudaCompat;
#endif

#endif

#endif  // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
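
Below is a minimal usage sketch, not part of the header or the repository. It assumes the CUDA toolkit headers are on the include path (the sequential branch above still includes <cuda_runtime.h>) and that some translation unit defines the extern thread_local blockIdx and gridDim declared above; the repository provides those definitions in a separate source file, so they are repeated here only to keep the sketch self-contained. The file name sketch.cc and the function countBelowThreshold are illustrative, not taken from the repository. Built with a plain host compiler, the "kernel" runs as an ordinary sequential function.

// sketch.cc (illustrative only): build as plain C++, e.g. g++ -std=c++17 sketch.cc
#include <cstdint>
#include <cstdio>

#include "cudaCompat.h"

// the header only declares these; a definition must live in some translation unit
namespace cudaCompat {
  thread_local dim3 blockIdx = {0, 0, 0};
  thread_local dim3 gridDim = {1, 1, 1};
}  // namespace cudaCompat

// a CUDA-style "kernel": without __CUDA_ARCH__, __global__ expands to
// inline __attribute__((always_inline)) and the cudaCompat stand-ins for
// threadIdx, blockDim and atomicAdd are found via `using namespace cudaCompat`
__global__ void countBelowThreshold(const int* in, uint32_t n, int threshold, uint32_t* count) {
  uint32_t first = blockIdx.x * blockDim.x + threadIdx.x;          // always 0 on the host
  for (uint32_t i = first; i < n; i += gridDim.x * blockDim.x) {   // stride 1: one sequential pass
    if (in[i] < threshold)
      atomicAdd(count, 1u);  // plain read-modify-write in the sequential build
  }
}

int main() {
  int data[8] = {3, 7, 1, 9, 4, 2, 8, 5};
  uint32_t count = 0;
  cudaCompat::resetGrid();                    // blockIdx = {0,0,0}, gridDim = {1,1,1}
  countBelowThreshold(data, 8, 5, &count);    // a "launch" is just a function call here
  std::printf("%u values below 5\n", count);  // prints: 4 values below 5
}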