-
Notifications
You must be signed in to change notification settings - Fork 0
/
shared.cu
264 lines (228 loc) · 9.36 KB
/
shared.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
/*
----------------------------------------------------------
| Title: Mean Shift Clustering with CUDA (Shared) |
| Author: Giannis Meleziadis |
----------------------------------------------------------
| Description: |
| CUDA implementation of Mean Shift clustering |
| on 600 2D points using shared memory. |
| Reads input, clusters using GPU, writes output, |
| and verifies against a reference. |
----------------------------------------------------------
| Compilation: |
| nvcc shared.cu -o shared -O3 -lm |
----------------------------------------------------------
| Execution: |
| ./shared 0.5 |
----------------------------------------------------------
| Files: |
| input.txt - Input points |
| output.txt - Clustered output |
| output_reference.txt - Reference for validation |
----------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <sys/time.h>
#include <float.h>
#include <assert.h>
float k = 20; // Factor to scale tolerance, compensating for numerical errors
#define NUM_OF_POINTS 600
#define DIMENSIONS 2
#define BLOCK_SIZE 512 // Adjust based on the maximum threads per block your GPU supports
inline cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
return result;
}
// CUDA kernel to calculate Euclidean distance between two points
__device__ float distanceFunction(const float *point1, const float *point2) {
float distance = 0;
for (int i = 0; i < DIMENSIONS; i++) {
distance += (point1[i] - point2[i]) * (point1[i] - point2[i]);
}
return sqrt(distance);
}
// CUDA kernel to calculate the weight using Gaussian kernel function
__device__ float kernelFunction(const float *pointYk, const float *arrayXi) {
float s = 1;
float distance = distanceFunction(pointYk, arrayXi);
distance *= distance;
return exp(-((distance) / (2 * (s * s))));
}
// CUDA kernel to calculate the magnitude of movement vector
__device__ float movedDistance(const float *moved) {
float distance = 0;
for (int i = 0; i < DIMENSIONS; i++) {
distance += (moved[i]) * (moved[i]);
}
return sqrt(distance);
}
// Main kernel function that performs the MeanShift algorithm on given data points using shared memory
__global__ void shiftingFunction(float *Ykplus1, float *Yk, const float *X, float e) {
float s = 1;
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= NUM_OF_POINTS) return; // Ensure thread index is within bounds
__shared__ float sharedX[NUM_OF_POINTS * DIMENSIONS];
// Load data into shared memory: each thread loads its part
int numThreads = blockDim.x * gridDim.x;
int loadIdx = threadIdx.x + blockIdx.x * blockDim.x;
while (loadIdx < NUM_OF_POINTS) {
for (int dim = 0; dim < DIMENSIONS; dim++) {
sharedX[loadIdx * DIMENSIONS + dim] = X[loadIdx * DIMENSIONS + dim];
}
loadIdx += numThreads;
}
__syncthreads(); // Ensure all data is loaded
float Ypoint[DIMENSIONS], Xpoint[DIMENSIONS], moved[DIMENSIONS];
float numerator[DIMENSIONS] = {0}, denominator = 0;
// Initialize moved to force at least one iteration
for (int dim = 0; dim < DIMENSIONS; dim++) {
moved[dim] = FLT_MAX;
}
float weightFromGaussian = 0;
// Iteratively shift point until the movement is below the threshold 'e'
while (movedDistance(moved) >= e) {
// Load current point coordinates
for (int i = 0; i < DIMENSIONS; i++) {
Ypoint[i] = Yk[index * DIMENSIONS + i];
}
// Reset sums for the new iteration
for (int i = 0; i < DIMENSIONS; i++) {
numerator[i] = 0;
}
denominator = 0;
// Evaluate each point in X
for (int i = 0; i < NUM_OF_POINTS; i++) {
for (int j = 0; j < DIMENSIONS; j++) {
Xpoint[j] = sharedX[i * DIMENSIONS + j];
}
float check = distanceFunction(Ypoint, Xpoint);
if (check <= s * s && check > 0) {
weightFromGaussian = kernelFunction(Ypoint, Xpoint);
for (int j = 0; j < DIMENSIONS; j++) {
numerator[j] += weightFromGaussian * sharedX[i * DIMENSIONS + j];
}
denominator += weightFromGaussian;
}
}
// Update the position of the current point using numerator, denominator
for (int j = 0; j < DIMENSIONS; j++) {
float newY = numerator[j] / denominator;
moved[j] = newY - Yk[index * DIMENSIONS + j];
Yk[index * DIMENSIONS + j] = newY;
Ykplus1[index * DIMENSIONS + j] = newY;
}
}
}
// Function to verify results
int verifyResults(float *results, const char *refFileName, float tolerance) {
FILE *fRef = fopen(refFileName, "r");
if (fRef == NULL) {
fprintf(stderr, "Failed to open %s for validation.\n", refFileName);
return -1; // File opening error
}
float refValue;
int errors = 0;
for (int i = 0; i < NUM_OF_POINTS; i++) {
for (int j = 0; j < DIMENSIONS; j++) {
if (fscanf(fRef, "%f%*c", &refValue) != 1) {
fprintf(stderr, "Error reading validation file at point %d, dimension %d.\n", i, j);
fclose(fRef);
return -2; // Reading error
}
if (fabs(refValue - results[i * DIMENSIONS + j]) > tolerance) {
errors++;
}
}
}
fclose(fRef);
return errors;
}
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "Parameter needed\nExample: ./shared 0.5\n");
return 1;
}
float e = atof(argv[1]); // Convergence threshold from command line
float *arrayStatic, *arrayYk, *arrayYkplus1;
float *deviceArrayStatic, *deviceArrayYk, *deviceArrayYkplus1;
// Allocate memory for host arrays
size_t nBytes = NUM_OF_POINTS * DIMENSIONS * sizeof(float);
arrayStatic = (float *)malloc(nBytes);
arrayYk = (float *)malloc(nBytes);
arrayYkplus1 = (float *)malloc(nBytes);
// Allocate memory for device arrays
checkCuda(cudaMalloc((void **)&deviceArrayStatic, nBytes));
checkCuda(cudaMalloc((void **)&deviceArrayYk, nBytes));
checkCuda(cudaMalloc((void **)&deviceArrayYkplus1, nBytes));
// Open input file
FILE *myFile = fopen("input.txt", "r");
if (myFile == NULL) {
fprintf(stderr, "Failed to open input.txt.\n");
return 1;
}
// Read data from file
for (int i = 0; i < NUM_OF_POINTS; i++) {
for (int j = 0; j < DIMENSIONS; j++) {
if (fscanf(myFile, "%f%*c", &arrayStatic[i * DIMENSIONS + j]) != 1) {
fprintf(stderr, "Error reading file at point %d, dimension %d.\n", i, j);
fclose(myFile);
return 1;
}
arrayYk[i * DIMENSIONS + j] = arrayStatic[i * DIMENSIONS + j];
}
}
fclose(myFile);
// Transfer data from host to device
checkCuda(cudaMemcpy(deviceArrayStatic, arrayStatic, nBytes, cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(deviceArrayYk, arrayYk, nBytes, cudaMemcpyHostToDevice));
// GPU information
int deviceId;
cudaGetDevice(&deviceId);
cudaDeviceProp props;
checkCuda(cudaGetDeviceProperties(&props, deviceId));
int multiProcessorCount = props.multiProcessorCount;
int threadsPerBlock = BLOCK_SIZE;
int numberOfBlocks = 32 * multiProcessorCount;
struct timeval startwtime, endwtime;
gettimeofday(&startwtime, NULL);
// Launch kernel
shiftingFunction<<<numberOfBlocks, threadsPerBlock>>>(deviceArrayYkplus1, deviceArrayYk, deviceArrayStatic, e);
checkCuda(cudaDeviceSynchronize());
gettimeofday(&endwtime, NULL);
float seq_time = (float)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
printf("Wall clock time (using cuda - shared memory utilization) = %f ms\n", 1000 * seq_time);
// Copy results back to host
checkCuda(cudaMemcpy(arrayYk, deviceArrayYkplus1, nBytes, cudaMemcpyDeviceToHost));
// Write results to output.txt
FILE *f = fopen("output.txt", "w");
if (f == NULL) {
fprintf(stderr, "Error opening output file!\n");
return 1;
}
for (int i = 0; i < NUM_OF_POINTS; i++) {
for (int j = 0; j < DIMENSIONS; j++) {
fprintf(f, "%f ", arrayYk[i * DIMENSIONS + j]);
}
fprintf(f, "\n");
}
fclose(f);
// Validate the results by comparing with a reference file
int errors = verifyResults(arrayYk, "output_reference.txt", k * e);
printf("The number of errors is %d\n", errors);
free(arrayStatic);
free(arrayYk);
free(arrayYkplus1);
checkCuda(cudaFree(deviceArrayStatic));
checkCuda(cudaFree(deviceArrayYk));
checkCuda(cudaFree(deviceArrayYkplus1));
return 0;
}