-
Notifications
You must be signed in to change notification settings - Fork 1
/
dwt_gpu.h
137 lines (111 loc) · 4.94 KB
/
dwt_gpu.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#ifndef __DWT_GPU_H__
#define __DWT_GPU_H__
#include "cuda.h"
#include "cuda_timer.h"
// Debug output helpers: print a width x height x depth array of floats or
// ints, optionally preceded by a 'name' label (pass NULL for no label).
// The printArray overloads take a host pointer; the printDeviceArray
// overloads take a device pointer (presumably they copy the data to the
// host before printing -- TODO confirm against the .cu implementation).
// The scu_wavelet::int3 overloads are convenience forms that bundle the
// three dimensions into one value.
void printArray(const float *array, int width, int height, int depth,
const char *name = NULL);
void printArray(const int *array, int width, int height, int depth,
const char *name = NULL);
void printDeviceArray(const float *array_dev, int width, int height, int depth,
const char *name = NULL);
void printDeviceArray(const float *array_dev, scu_wavelet::int3 size,
const char *name = NULL);
void printDeviceArray(const int *array_dev, int width, int height, int depth,
const char *name = NULL);
void printDeviceArray(const int *array_dev, scu_wavelet::int3 size,
const char *name = NULL);
/*
CUDA implementation of Haar discrete wavelet transform.
Ed Karrels, [email protected], June 2014
*/
// Atomic min/max for floats, which CUDA does not provide natively.
// Both operations are emulated with an atomicCAS retry loop on the raw
// 32-bit pattern of the float. Note the CAS compares bit patterns, not
// float values, so exotic encodings (NaN payloads, -0.0f) follow the
// bit-level semantics of atomicCAS.
// (CAS recipe borrowed from CUDA/lloyds/cudalloyds.cu -- thanks, David :-)
class WaveletAtomic {
public:
  // Atomically set *address = fmaxf(*address, val).
  // Returns the value *address held immediately before this update.
  __device__ static float max(float* address, float val) {
    int *bits = (int*) address;
    int seen = *bits;
    while (true) {
      // Retry until no other thread changed *address between our read
      // and our compare-and-swap.
      int expected = seen;
      seen = ::atomicCAS(bits, expected,
                         __float_as_int(::fmaxf(val, __int_as_float(expected))));
      if (seen == expected) break;
    }
    return __int_as_float(seen);
  }
  // Atomically set *address = fminf(*address, val); same CAS-retry scheme
  // as max(). Returns the value *address held immediately before the update.
  __device__ static float min(float* address, float val) {
    int *bits = (int*) address;
    int seen = *bits;
    while (true) {
      int expected = seen;
      seen = ::atomicCAS(bits, expected,
                         __float_as_int(::fminf(val, __int_as_float(expected))));
      if (seen == expected) break;
    }
    return __int_as_float(seen);
  }
};
/*
  Forward transform on a square 2-d array.
  arrayWidth is the length of a row in the data.
  transformLength is the size of the upper-left part of the array
  that will be transformed. For example, in the third pass on a
  1024x1024 array, arrayWidth would be 1024 and transformLength would
  be 256, and array elements [0..255][0..255] would be modified.
  'temp' is caller-supplied scratch space; grid/block layout is chosen
  by the host-side wrappers declared below.
*/
template<typename NUM>
__global__ void haar_2d_kernel
(int arrayWidth, int transformLength, NUM *data, NUM *temp);
// Same as haar_2d_kernel, but computes the inverse transform.
template<typename NUM>
__global__ void haar_inv_2d_kernel
(int arrayWidth, int transformLength, NUM *data, NUM *temp);
// Wrapper functions that call the CUDA kernels above.
// Even though the kernels above can be instantiated with NUM as any type,
// use these wrappers to make it difficult to use them for anything other
// than floats or doubles, since those are the only types that have
// been tested.
// Returns elapsed time (presumably in milliseconds -- confirm in the .cu).
// stepCount = -1 appears to mean "use the default number of transform
// steps"; verify against the implementation.
float haar_2d_cuda
(int size, float *data, bool inverse = false, int stepCount = -1,
int threadBlockSize = 128, bool useCombinedTranspose = true);
// Double precision requires compute capability 1.3+, so hide this overload
// when compiling device code for older architectures. (__CUDA_ARCH__ is
// undefined during the host compilation pass, where it is always declared.)
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 130)
float haar_2d_cuda
(int size, double *data, bool inverse = false, int stepCount = -1,
int threadBlockSize = 128, bool useCombinedTranspose = true);
#endif
// Haar transform fused with a transpose: reads from 'data' and writes the
// transposed result into 'result'. 'tileSize' is the edge length of the
// transpose tile (presumably the shared-memory tile dimension -- confirm
// in the .cu implementation).
template<typename NUM>
__global__ void haar_transpose_2d_kernel
(int arrayWidth, int transformLength, NUM *data, NUM *result,
int tileSize);
// Inverse of haar_transpose_2d_kernel: inverse transform fused with a
// transpose, writing into 'result'.
template<typename NUM>
__global__ void haar_inv_transpose_2d_kernel
(int arrayWidth, int transformLength, NUM *data, NUM *result, int tileSize);
// 3-d Haar transform. Transforms the data in place (using 'tmpData' as
// scratch) and updates 'size', since the dimensions will rotate.
// stepCount gives the number of transform steps per axis. The optional
// CudaTimer pointers, when non-NULL, accumulate transform and transpose
// timings.
void haar_3d_cuda(float *data, float *tmpData,
scu_wavelet::int3 &size, scu_wavelet::int3 stepCount,
bool inverse = false,
CudaTimer *transformTimer = NULL,
CudaTimer *transposeTimer = NULL);
// 3-d CDF 9/7 wavelet transform; same argument contract as haar_3d_cuda.
void cdf97_3d_cuda(float *data, float *tmpData,
scu_wavelet::int3 &size, scu_wavelet::int3 stepCount,
bool inverse = false,
CudaTimer *transformTimer = NULL,
CudaTimer *transposeTimer = NULL);
// Returns the tile size that performs best for the Haar GPU kernels
// (presumably probed or tuned per-device -- confirm in the .cu).
int bestHaarGPUTileSize();
// v2 Haar forward transform over 'level_count' levels; note 'size' is
// passed by value here, unlike haar_3d_cuda.
void haar_v2(float *data_in, float *data_tmp,
scu_wavelet::int3 size, int level_count,
CudaTimer *transformTimer = NULL);
// v2 of haar_3d_cuda; same contract, but all arguments are required.
void haar_3d_cuda_v2(float *data, float *data_tmp, scu_wavelet::int3 &size,
scu_wavelet::int3 stepCount, bool inverse,
CudaTimer *transformTimer, CudaTimer *transposeTimer);
// v2 CDF 9/7 forward transform over 'level_count' levels ('size' by value).
void cdf97_v2(float *data, float *data_tmp,
scu_wavelet::int3 size, int level_count,
CudaTimer *transformTimer);
// v2 of cdf97_3d_cuda; same contract, but all arguments are required.
void cdf97_3d_cuda_v2(float *data, float *tmpData,
scu_wavelet::int3 &size, scu_wavelet::int3 stepCount,
bool inverse,
CudaTimer *transformTimer, CudaTimer *transposeTimer);
// v3 of cdf97_3d_cuda; same interface as v2 (implementation differences
// live in the .cu file).
void cdf97_3d_cuda_v3(float *data, float *tmpData,
scu_wavelet::int3 &size, scu_wavelet::int3 stepCount,
bool inverse,
CudaTimer *transformTimer, CudaTimer *transposeTimer);
#endif // __DWT_GPU_H__