cuda.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "hw8.h"
#include <stdio.h>
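
/*
 * Histogram computation on the GPU in two passes: addCalculateKernel builds
 * one private partial histogram per thread, and addMergeKernel then reduces
 * them into the final RANGE_SIZE-bin histogram. THREADS, RANGE_SIZE and
 * myIntArrCalloc are assumed to be declared in hw8.h.
 */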
__global__ void addCalculateKernel(const int *image, int *dest,
        unsigned int part_size, int arr_size, int mod) {
    int tid = threadIdx.x;
    // Each thread handles a contiguous slice of the image; the last thread
    // also takes the leftover elements (mod) that do not divide evenly.
    int image_start = part_size * tid;
    int image_end = image_start + part_size + ((tid == THREADS - 1) ? mod : 0);
    // if (image_end > arr_size)
    //     image_end = arr_size;
    int dest_start = arr_size * tid;
    int dest_end = dest_start + arr_size;
    // Zero this thread's starting counters.
    for (int j = dest_start; j < dest_end; j++) {
        dest[j] = 0;
    }
    // Count each image value into this thread's private partial histogram.
    for (int j = image_start; j < image_end; j++)
        dest[dest_start + image[j]]++;
}
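
/*
 * addMergeKernel reduces the per-thread partial histograms into the final
 * histogram: thread tid sums output bin tid across all partial arrays, so
 * every bin is only covered if THREADS is at least RANGE_SIZE (both
 * constants are expected to come from hw8.h).
 */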
__global__ void addMergeKernel(int *histogram, int *temp_arrays, int arr_size) {
    int tid = threadIdx.x;
    // Zero the shared output counters.
    for (int i = 0; i < RANGE_SIZE; i++) {
        histogram[i] = 0;
    }
    // Wait until every bin has been zeroed before accumulating, so a slow
    // zeroing thread cannot wipe out another thread's partial sum.
    __syncthreads();
    // Merge the results: each thread sums one cell across all temp arrays.
    //for (int i = 0; i < RANGE_SIZE; i++)
    for (int i = 0; i < THREADS; i++)
        histogram[tid] += temp_arrays[arr_size * i + tid];
}
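
/*
 * Host-side driver: copies the image to the device, runs the two kernels,
 * and copies the RANGE_SIZE-bin result back into a freshly allocated host
 * array (allocated and zeroed through myIntArrCalloc from hw8.h).
 */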
int* calculateHistogramm(int *image, unsigned int size, int arr_size) {
    int *dev_image = 0;
    int *dev_dest_hist = 0;
    int *dev_histogram = 0;
    int *histogram;
    myIntArrCalloc(&histogram, RANGE_SIZE);
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);

    // Allocate GPU buffers for the input image, the final histogram,
    // and one arr_size-sized partial histogram per thread.
    cudaStatus = cudaMalloc((void**) &dev_image, size * sizeof(int));
    cudaStatus = cudaMalloc((void**) &dev_histogram, RANGE_SIZE * sizeof(int));
    cudaStatus = cudaMalloc((void**) &dev_dest_hist,
            arr_size * sizeof(int) * THREADS);

    // Copy the input image from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_image, image, size * sizeof(int),
            cudaMemcpyHostToDevice);

    // Launch the counting kernel: THREADS threads in one block, each
    // counting its own slice of the image into a private partial histogram.
    addCalculateKernel<<<1, THREADS>>>(dev_image, dev_dest_hist,
            size / THREADS, arr_size, size % THREADS);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addCalculateKernel launch failed: %s\n",
                cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaDeviceSynchronize returned error code %d after launching addCalculateKernel!\n",
                cudaStatus);
        goto Error;
    }

    // Launch the merge kernel to reduce the partial histograms.
    addMergeKernel<<<1, THREADS>>>(dev_histogram, dev_dest_hist, arr_size);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addMergeKernel launch failed: %s\n",
                cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaDeviceSynchronize returned error code %d after launching addMergeKernel!\n",
                cudaStatus);
        goto Error;
    }

    // Copy the finished histogram from GPU memory back to the host.
    cudaStatus = cudaMemcpy(histogram, dev_histogram, RANGE_SIZE * sizeof(int),
            cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    cudaFree(dev_image);
    cudaFree(dev_histogram);
    cudaFree(dev_dest_hist);
    return histogram;
}
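
/*
 * Example usage (a sketch, not part of the assignment code; it assumes the
 * image values lie in [0, RANGE_SIZE) and that the array returned by
 * calculateHistogramm, allocated via myIntArrCalloc, can be released with
 * free()):
 *
 *     int image[1024];
 *     // ... fill image with values in [0, RANGE_SIZE) ...
 *     int *hist = calculateHistogramm(image, 1024, RANGE_SIZE);
 *     for (int i = 0; i < RANGE_SIZE; i++)
 *         printf("bin %d: %d\n", i, hist[i]);
 *     free(hist);
 */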