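// mp5.cu: MP5 list scan (parallel prefix sum).
// Hierarchical, work-efficient (Brent-Kung style) scan in three kernel
// launches: (1) scan each tile of 2 * BLOCK_SIZE input elements and record
// its total, (2) scan the per-tile totals with a single block, then
// (3) add the scanned totals back to the elements of every later tile.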
#include <wb.h>

#define wbCheck(stmt)                                       \
  do {                                                      \
    cudaError_t err = stmt;                                 \
    if (err != cudaSuccess) {                               \
      wbLog(ERROR, "Failed to run stmt ", #stmt);           \
      return -1;                                            \
    }                                                       \
  } while (0)

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 256
#endif

// Width of the tile scanned by one block (each thread handles two elements).
// Note: despite its name, HALF_BLOCK_SIZE is twice the block size.
#define HALF_BLOCK_SIZE (BLOCK_SIZE << 1)

// Size in bytes of `size` floats.
#define MEM_SIZE(size) ((size) * sizeof(float))
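// Second-level correction: adds the running total of all preceding tiles
// (add[blockIdx.x - 1]) to every element of this block's tile. Block 0 is
// skipped because the first tile needs no offset.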
__global__ void post_scan(float* in, float* add, int len) {
  unsigned int t = threadIdx.x;
  unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
  if (blockIdx.x) {
    if (start + t < len) in[start + t] += add[blockIdx.x - 1];
    if (start + BLOCK_SIZE + t < len) in[start + BLOCK_SIZE + t] += add[blockIdx.x - 1];
  }
}
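// Inclusive scan of one tile of 2 * BLOCK_SIZE elements in shared memory,
// using the work-efficient reduction/post-reduction pattern. When `post` is
// non-NULL, thread 0 writes the tile total there for the second-level scan.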
__global__ void scan(float* in, float* out, float* post, int len) {
  __shared__ float scan_array[HALF_BLOCK_SIZE];
  unsigned int t = threadIdx.x;
  unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
  int index;
  if (start + t < len) scan_array[t] = in[start + t];
  else scan_array[t] = 0;
  if (start + BLOCK_SIZE + t < len) scan_array[BLOCK_SIZE + t] = in[start + BLOCK_SIZE + t];
  else scan_array[BLOCK_SIZE + t] = 0;
  __syncthreads();
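  // Reduction step (up-sweep): build a tree of partial sums in place.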
  for (unsigned int stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
    index = (t + 1) * stride * 2 - 1;
    if (index < 2 * BLOCK_SIZE) scan_array[index] += scan_array[index - stride];
    __syncthreads();
  }
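  // Post-reduction step: distribute the partial sums back down the tree to
  // complete the inclusive scan.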
  for (unsigned int stride = BLOCK_SIZE >> 1; stride; stride >>= 1) {
    index = (t + 1) * stride * 2 - 1;
    if (index + stride < 2 * BLOCK_SIZE) scan_array[index + stride] += scan_array[index];
    __syncthreads();
  }
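  // Write the scanned tile back; thread 0 also records the tile total (the
  // last scanned element) for the second-level scan when `post` is provided.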
  if (start + t < len) out[start + t] = scan_array[t];
  if (start + BLOCK_SIZE + t < len) out[start + BLOCK_SIZE + t] = scan_array[BLOCK_SIZE + t];
  if (post && t == 0) post[blockIdx.x] = scan_array[2 * BLOCK_SIZE - 1];
}
int main(int argc, char ** argv) {
  float* hostInput;
  float* hostOutput;
  float* deviceInput;
  float* deviceOutput;
  float* deviceScanFirstPass;
  float* deviceScanSecondPass;
  int numElements, numBlocks;
  size_t numElementsMemSize;
  int halfBlockSize = BLOCK_SIZE << 1;

  wbArg_t args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  numElementsMemSize = MEM_SIZE(numElements);
  numBlocks = ceil((float)numElements / halfBlockSize);
  wbCheck(cudaHostAlloc(&hostOutput, numElementsMemSize, cudaHostAllocDefault));
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The number of input elements in the input is ", numElements);
  wbLog(TRACE, "The number of blocks is ", numBlocks);
wbTime_start(GPU, "Allocating GPU memory.");
wbCheck(cudaMalloc(&deviceInput, numElementsMemSize));
wbCheck(cudaMalloc(&deviceOutput, numElementsMemSize));
wbCheck(cudaMalloc(&deviceScanFirstPass, MEM_SIZE(halfBlockSize)));
wbCheck(cudaMalloc(&deviceScanSecondPass, MEM_SIZE(halfBlockSize)));
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Clearing output memory.");
wbCheck(cudaMemset(deviceOutput, 0, numElementsMemSize));
wbTime_stop(GPU, "Clearing output memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
wbCheck(cudaMemcpy(deviceInput, hostInput, numElementsMemSize, cudaMemcpyHostToDevice));
wbTime_stop(GPU, "Copying input memory to the GPU.");
  dim3 grid(numBlocks);
  dim3 threads(BLOCK_SIZE);

  wbTime_start(Compute, "Performing CUDA computation");
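  // First pass: scan each tile and collect the per-tile totals.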
  scan<<<grid, threads>>>(deviceInput, deviceOutput, deviceScanFirstPass, numElements);
  cudaDeviceSynchronize();
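  // Second pass: scan the tile totals with a single block (no further totals needed).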
  scan<<<1, threads>>>(deviceScanFirstPass, deviceScanSecondPass, NULL, halfBlockSize);
  cudaDeviceSynchronize();
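  // Third pass: add the scanned totals to every element of the later tiles.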
  post_scan<<<grid, threads>>>(deviceOutput, deviceScanSecondPass, numElements);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Performing CUDA computation");
  wbTime_start(Copy, "Copying output memory to the CPU");
  wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElementsMemSize, cudaMemcpyDeviceToHost));
  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  wbCheck(cudaFree(deviceInput));
  wbCheck(cudaFree(deviceOutput));
  wbCheck(cudaFree(deviceScanFirstPass));
  wbCheck(cudaFree(deviceScanSecondPass));
  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostOutput, numElements);

  free(hostInput);
  wbCheck(cudaFreeHost(hostOutput));

  return 0;
}