conv4D_impl_GPU.cu
#include "conv4D_data_structures.h"
#include "device_launch_parameters.h"
extern "C" {
#include "conv4D_impl.h"
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
}
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(1); \
} \
}
__device__ input_feature_map_t gpu_input;
__device__ output_feature_map_t gpu_output;
__device__ conv4d_layer_t gpu_layer;
// Grid-stride kernel that adds the elements of two arrays: y[i] = x[i] + y[i].
__global__
void add(int n, float* x, float* y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}
// Convolve the input with the layer weights on the GPU.
// Each thread covers a (q, p, m) subset via grid-stride loops:
// x -> output height (q), y -> output width (p), z -> output channels (m).
__global__
void conv4d_gpu_convolve()
{
    size_t q_init = blockIdx.x*blockDim.x+threadIdx.x;
    size_t p_init = blockIdx.y*blockDim.y+threadIdx.y;
    size_t m_init = blockIdx.z*blockDim.z+threadIdx.z;
    size_t q_stride = gridDim.x * blockDim.x;
    size_t p_stride = gridDim.y * blockDim.y;
    size_t m_stride = gridDim.z * blockDim.z;

    //Begin convolution
    for (size_t n = 0; n < OUTPUT_BATCHES; n++)
        for (size_t q = q_init; q < OUTPUT_HEIGHT; q+=q_stride)
            for (size_t p = p_init; p < OUTPUT_WIDTH; p+=p_stride){
                //Accumulate the weighted sum over the filter window and input channels
                for (size_t s = 0; s < LAYER_HEIGHT; s++)
                    for (size_t r = 0; r < LAYER_WIDTH; r++)
                        for (size_t c = 0; c < INPUT_CHANNELS; c++)
                            for (size_t m = m_init; m < OUTPUT_CHANNELS; m+=m_stride){
                                gpu_output.data[n][q][p][m] += gpu_input.data[n][q*LAYER_STRIDE+s][p*LAYER_STRIDE+r][c] * gpu_layer.weights[s][r][c][m];
                            }
                //Add the bias and apply ReLU
                for (size_t m = m_init; m < OUTPUT_CHANNELS; m+=m_stride){
                    gpu_output.data[n][q][p][m] += gpu_layer.bias[m];
                    if(gpu_output.data[n][q][p][m] < 0) gpu_output.data[n][q][p][m] = 0;
                }
            }
}
/**
 * @brief Updates the GPU versions of the input and layer from their corresponding CPU versions
 */
void cuda_var_update(){
    cudaMemcpyToSymbol(gpu_input, &input, sizeof(input_feature_map_t), 0, cudaMemcpyHostToDevice);
    cudaCheckError();
    cudaMemcpyToSymbol(gpu_layer, &layer, sizeof(conv4d_layer_t), 0, cudaMemcpyHostToDevice);
    cudaCheckError();
}
/**
 * @brief Updates the CPU version of the output from its corresponding GPU version
 */
void host_var_update(){
    cudaMemcpyFromSymbol(&output, gpu_output, sizeof(output_feature_map_t), 0, cudaMemcpyDeviceToHost);
    cudaCheckError();
}
/**
 * @brief Runs the convolution kernel on the GPU with the given launch
 *        configuration, then copies the result back to the host output.
 *
 * @param block_size Threads per block along each of the three block dimensions.
 * @param grid_size Blocks along each of the three grid dimensions.
 */
void conv4d_convolve_CUDA_discrete(int block_size, int grid_size){
    if(block_size<=0) CONV4D_DATA_STRUCTURE_RUNTIME_ERROR("GPU block size expected to be larger than 0. Got %d\n", block_size);
    if(grid_size<=0) CONV4D_DATA_STRUCTURE_RUNTIME_ERROR("GPU grid size expected to be larger than 0. Got %d\n", grid_size);

    //Zero the device-side output accumulator before launching the kernel
    void* gpu_output_addr;
    cudaGetSymbolAddress<output_feature_map_t>(&gpu_output_addr, gpu_output);
    cudaCheckError();
    cudaMemset(gpu_output_addr, 0, sizeof(output_feature_map_t));
    cudaCheckError();

    //Launch the convolution kernel with a cubic block and grid
    dim3 dimBlock(block_size, block_size, block_size);
    dim3 dimGrid(grid_size, grid_size, grid_size);
    conv4d_gpu_convolve<<<dimGrid,dimBlock>>>();
    cudaCheckError();

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();
    cudaCheckError();
    host_var_update();

    //Commented-out CPU reference version of the same convolution
    //(resets the host output, then recomputes it on the CPU):
// memset(&output, 0, sizeof(output));
// for(int bx = 0; bx < grid_size; bx++){
// for(int by = 0; by < grid_size; by++){
// for(int bz = 0; bz < grid_size; bz++){
// for(int tx = 0; tx < block_size; tx++){
// for(int ty = 0; ty < block_size; ty++){
// for(int tz = 0; tz < block_size; tz++){
// size_t q_init = bx*block_size+tx;
// size_t p_init = by*block_size+ty;
// size_t m_init = bz*block_size+tz;
// size_t q_stride = block_size * grid_size;
// size_t p_stride = block_size * grid_size;
// size_t m_stride = block_size * grid_size;
// //Begin convolution
// for (size_t n = 0; n < OUTPUT_BATCHES; n++)
// for (size_t q = q_init; q < OUTPUT_HEIGHT; q+=q_stride)
// for (size_t p = p_init; p < OUTPUT_WIDTH; p+=p_stride){
// for (size_t s = 0; s < LAYER_HEIGHT; s++)
// for (size_t r = 0; r < LAYER_WIDTH; r++)
// for (size_t c = 0; c < INPUT_CHANNELS; c++)
// for (size_t m = m_init; m < OUTPUT_CHANNELS; m+=m_stride){
// output.data[n][q][p][m] += input.data[n][q*LAYER_STRIDE+s][p*LAYER_STRIDE+r][c] * layer.weights[s][r][c][m];
// }
// for (size_t m = m_init; m < OUTPUT_CHANNELS; m+=m_stride){
// output.data[n][q][p][m] += layer.bias[m];
// if(output.data[n][q][p][m] < 0) output.data[n][q][p][m] = 0;
// }
// }
// }
// }
// }
// }
// }
// }
}
/**
 * @brief Copies the current input and layer to the GPU, then runs the discrete convolution.
 */
void conv4d_convolve_CUDA_discrete_rewrite_gpu_data(int block_size, int grid_size){
    cudaCheckError();
    cuda_var_update();
    cudaCheckError();
    conv4d_convolve_CUDA_discrete(block_size, grid_size);
    cudaCheckError();
}
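
/*
 * Usage sketch (not part of the original file): one way a host-side caller
 * might drive this translation unit. It assumes the global `input`, `layer`,
 * and `output` objects referenced above are declared in conv4D_impl.h /
 * conv4D_data_structures.h and populated elsewhere; the launch dimensions
 * below are illustrative only, not tuned values. Guarded with #if 0 so it
 * does not affect the build.
 */
#if 0
int main(void) {
    // Populate the host-side input and layer (details depend on the surrounding project).
    // ... fill input.data, layer.weights, and layer.bias here ...

    // Copy input/layer to the GPU and run the convolution with 4x4x4 thread
    // blocks on an 8x8x8 grid; the result lands in the host `output` variable.
    conv4d_convolve_CUDA_discrete_rewrite_gpu_data(4, 8);

    // Inspect one output element.
    printf("output[0][0][0][0] = %f\n", output.data[0][0][0][0]);
    return 0;
}
#endif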