-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCudaFunctions.cu
249 lines (220 loc) · 9.52 KB
/
CudaFunctions.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include "math.h"
#include <assert.h>
#include "main.h"
#include "sm_60_atomic_functions.h"
#include <stdio.h>
// ======================================= Service methods =======================================
//int offset = (blockIdx.y * blockDim.y + threadIdx.y) * (N - W + 1) + blockIdx.x * blockDim.x + threadIdx.x;
//int offset = blockIdx.y * blockDim.y * N + blockIdx.x * blockDim.x + threadIdx.y * N + threadIdx.x;
__host__ void allocateMatOnGPU(Matrix image, int** deviceImage)
{
int colorsInImage = image.size * image.size;
cudaError_t error = cudaSuccess;
// Allocates and copies the object to GPU
error = cudaMalloc(deviceImage, colorsInImage * sizeof(int));
if (error != cudaSuccess)
{
printf("Cannot allocate GPU memory for image: %s (%d)\n", cudaGetErrorString(error), error);
exit(0);
}
error = cudaMemcpy(*deviceImage, image.data, colorsInImage * sizeof(int), cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("Cannot copy image to GPU: %s (%d)\n", cudaGetErrorString(error), error);
exit(0);
}
}
__host__ void freeMatFromGPU(int** deviceMat)
{
cudaError_t error = cudaSuccess;
// Free the picture from GPU memory
error = cudaFree(*deviceMat);
if (error != cudaSuccess)
{
printf("Cannot free Mat from GPU: %s (%d)\n", cudaGetErrorString(error), error);
exit(0);
}
}
__device__ int getPositionsPerDimension(int pictureDim, int objectDim) // called from both CPU and GPU
{
return (pictureDim - objectDim) + 1;
}
__device__ double difference(int p, int o)
{
// printf("p %d o %d 0:%f 1: %f 2:%d\n", p, o, (double)(p - o) / p, ceil((double)(p - o) / p), abs(ceil((double)(p - o) / p)));
//printf("p-o %lf calculate:%lf final %lf\n", (double)(p - o), (double)(p - o) / p, fabs((double)(p - o) / p));
return fabs((double)(p - o) / p);
}
// __global__ void findMatchingSubmatrix(int* deviceSubmatrix, int* deviceBigMatrix, int subMatrixDim, int bigMatrixDim, double* matching, int* positionFlags, double matchingV)
// {
// int tid = threadIdx.x + blockIdx.x * blockDim.x;
// int checkDim = pow(getPositionsPerDimension(bigMatrixDim, subMatrixDim), 2); //max size of matrix dim being checked
// if (tid < checkDim * checkDim) // make sure we access the required memory of elements
// {
// int i = tid / checkDim; // picture matrix row
// int j = tid % checkDim; // picture matrix col
// double sum = 0.0;
// for (int subMatrixOffset = 0; subMatrixOffset < subMatrixDim * subMatrixDim; subMatrixOffset++)
// {
// int r = subMatrixOffset / subMatrixDim; // object matrix row
// int c = subMatrixOffset % subMatrixDim; // object matrix col
// int idx = (i + r) * bigMatrixDim + j + c; // offset of the picture matrix
// assert(idx < bigMatrixDim * bigMatrixDim);
// sum += difference(deviceBigMatrix[idx], deviceSubmatrix[subMatrixOffset]);
// }
// matching[i * checkDim + j] = sum / (subMatrixDim * subMatrixDim);
// positionFlags[i * checkDim + j] = (matching[i * checkDim + j] < matchingV) ? 1 : 0;
// }
// }
__global__ void sliding_window_kernel(int* M, int M_size, int* N, int N_size, double* matching, int* positionFlags, double matchingV) {
// // Compute the thread index
// int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
// int sizeResults = getPositionsPerDimension(M_size, N_size);
// printf("\n\n\nthread id %d", thread_id);
// // Compute the row and column indices of the window
// int window_row = thread_id / sizeResults;
// int window_col = thread_id % sizeResults;
// printf("\n\n\nwindow row %d window col %d", window_row, window_col);
// // Check if the window is within the bounds of the larger matrix
// if ((window_row + N_size) < M_size && (window_col + N_size) < M_size) {
// printf("\n\n\nthread id passed %d", thread_id);
// // Check if the current window matches the smaller matrix
// int matchingCalc = 0;
// for (int i = 0; i < N_size; i++) {
// for (int j = 0; j < N_size; j++) {
// int M_index = (window_row + i) * M_size + (window_col + j);
// int N_index = i * N_size + j;
// printf("\n\n\nM index %d row %d col %d N Index %d row %d col %d picture size %d object size %d", M_index, M_index / M_size, M_index % M_size, N_index, N_index / N_size, N_index % N_size, M_size, N_size);
// assert(M_index < M_size * M_size);
// assert(N_index < N_size * N_size);
// matchingCalc += difference(M[M_index], N[N_index]);
// }
// }
// // Store the result of the match in the output array
// int index = (window_row)*M_size + (window_col);
// assert(index < sizeResults * sizeResults);
// matching[index] = matchingCalc / pow(sizeResults, 2);
// if (matching[index] < matchingV)
// positionFlags[index] = 1;
// }
// Compute the thread index
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int sizeResults = getPositionsPerDimension(M_size, N_size);
// for (int i = 0;i < sizeResults * sizeResults;i++)
// assert(positionFlags[i] == 0);
// Compute the row and column indices of the window
int window_row = thread_id / sizeResults;
int window_col = thread_id % sizeResults;
// Check if the window is within the bounds of the larger matrix
if ((window_row + N_size) <= M_size && (window_col + N_size) <= M_size) {
// Check if the current window matches the smaller matrix
double matchingCalc = 0.0;
for (int i = 0; i < N_size; i++) {
for (int j = 0; j < N_size; j++) {
int M_index = (window_row + i) * M_size + (window_col + j);
int N_index = i * N_size + j;
assert(M_index < M_size * M_size);
assert(N_index < N_size * N_size);
matchingCalc += difference(M[M_index], N[N_index]);
}
}
// Store the result of the match in the output array
int index = window_row * sizeResults + window_col;
assert(index < sizeResults * sizeResults);
assert(positionFlags[index] == 0);
matching[index] = matchingCalc / (N_size * N_size);
if (matching[index] < matchingV) {
positionFlags[index] = 1;
}
}
}
// ======================================= Entry Point =======================================
__host__ int* searchOnGPU(int pictureDim, int* devicePictureMatrix, Matrix object, double matchingV)
{
int positionsPerDim = pictureDim - object.size + 1, positionsCount = pow(positionsPerDim, 2);
// printf("\npositionsCount %d\n", positionsCount);
int blocksPerGrid = (positionsCount + BLOCK_SIZE - 1) / BLOCK_SIZE;
int* hostPositionFlagsArray, * devicePositionFlagsArray, * deviceObjectMatrix;
double* deviceMatchingsArray;
cudaError_t error = cudaSuccess;
// Allocates memory for the position flags array
hostPositionFlagsArray = (int*)malloc(positionsCount * sizeof(int));
if (hostPositionFlagsArray == NULL)
{
printf("Cannot allocate memory for position flags array\n");
return NULL;
}
// Allocates and initializes required variables on the GPU
allocateMatOnGPU(object, &deviceObjectMatrix);
error = cudaMalloc((void**)&devicePositionFlagsArray, positionsCount * sizeof(int));
if (error != cudaSuccess)
{
printf("Cannot allocate GPU memory for position flags array: %s (%d)\n", cudaGetErrorString(error), error);
return NULL;
}
error = cudaMalloc((void**)&deviceMatchingsArray, positionsCount * sizeof(double));
// printf("\n\npositionsCount %d\n\n", positionsCount);
if (error != cudaSuccess)
{
printf("Cannot allocate GPU memory for matchings array: %s (%d)\n", cudaGetErrorString(error), error);
return NULL;
}
error = cudaMemset(devicePositionFlagsArray, 0, positionsCount * sizeof(int));
if (error != cudaSuccess)
{
printf("Cannot initialize position flags array on GPU: %s (%d)\n", cudaGetErrorString(error), error);
return NULL;
}
// if (blocksPerGrid > max_blocks_per_grid) {
// // Too many blocks!
// printf("\nToo many blocks!\n");
// exit(EXIT_FAILURE);
// }
// if (threadsPerBlock > max_threads_per_block) {
// // Too many threads per block!
// printf("\nToo many threads per block!\n");
// exit(EXIT_FAILURE);
// }
printf("\n\n\n m size %d n size %d line %d\n\n", pictureDim, object.size, __LINE__);
cudaDeviceSynchronize();
error = cudaGetLastError();
if (error != cudaSuccess)
{
printf("CUDA error: %s line %d\n", cudaGetErrorString(error), __LINE__);
}
// Searches the object in the picture using CUDA - each block searches 256 positions in the picture
sliding_window_kernel << <blocksPerGrid, BLOCK_SIZE >> > (devicePictureMatrix, pictureDim, deviceObjectMatrix, object.size, deviceMatchingsArray, devicePositionFlagsArray, matchingV);
// findMatchingSubmatrix << <blocksPerGrid, threadsPerBlock >> > (deviceObjectMatrix, devicePictureMatrix, object.size, pictureDim, deviceMatchingsArray, devicePositionFlagsArray, matchingV);
// Check for errors after the kernel call
cudaDeviceSynchronize();
error = cudaGetLastError();
if (error != cudaSuccess)
{
printf("CUDA error: %s\n", cudaGetErrorString(error));
return NULL;
}
// Copies the position flags array from GPU to host
error = cudaMemcpy(hostPositionFlagsArray, devicePositionFlagsArray, positionsCount * sizeof(int), cudaMemcpyDeviceToHost);
if (error != cudaSuccess)
{
printf("Cannot copy position flags from GPU to host: %s (%d)\n", cudaGetErrorString(error), error);
return NULL;
}
// Frees allocated variables from the GPU
error = cudaFree(deviceMatchingsArray);
if (error != cudaSuccess)
{
printf("Cannot free matchings array from GPU: %s (%d)\n", cudaGetErrorString(error), error);
return NULL;
}
error = cudaFree(devicePositionFlagsArray);
if (error != cudaSuccess)
{
printf("Cannot free position flags array from GPU: %s (%d)\n", cudaGetErrorString(error), error);
return NULL;
}
freeMatFromGPU(&deviceObjectMatrix);
return hostPositionFlagsArray;
}