Skip to content

Commit

Permalink
basic gpu code
Browse files Browse the repository at this point in the history
  • Loading branch information
GongCode committed Apr 17, 2019
1 parent 36a3905 commit cf89e4d
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 28 deletions.
79 changes: 63 additions & 16 deletions ece408_src/new-forward.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@

#include <mxnet/base.h>

#define TILE_WIDTH 16
#define CONSTANT_MASK_SIZE 3000

namespace mxnet
{
namespace op
{

__global__ void forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const int H, const int W, const int K)
__global__ void
forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const int H, const int W, const int K)
{

/*
Expand All @@ -18,20 +22,47 @@ __global__ void forward_kernel(float *y, const float *x, const float *k, const i
The goal here is to be correct AND fast.
We have some nice #defs for you below to simplify indexing. Feel free to use them, or create your own.
*/
int n, m, c, h, w, p, q;

const int H_out = H - K + 1;
const int W_out = W - K + 1;
(void)H_out; // silence declared but never referenced warning. remove this line when you start working
(void)W_out; // silence declared but never referenced warning. remove this line when you start working

// An example use of these macros:
// float a = y4d(0,0,0,0)
// y4d(0,0,0,0) = a
//helps us index the pointers
#define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
#define x4d(i3, i2, i1, i0) x[(i3) * (C * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
#define k4d(i3, i2, i1, i0) k[(i3) * (C * K * K) + (i2) * (K * K) + (i1) * (K) + i0]


n = blockIdx.x; //idx of images
m = blockIdx.y; //idx of features
const int H_Grid = ceil(H_out / (float)TILE_WIDTH);
const int W_Grid = ceil(W_out / (float)TILE_WIDTH); //round up
h = blockIdx.z / (H_Grid)*TILE_WIDTH + threadIdx.y; //y idx of output tile
w = blockIdx.z % (W_Grid)*TILE_WIDTH + threadIdx.x; //x idx of output tile

if (h >= H_out || w >= W_out)
return;

float total = 0.0;

//num of input feature maps
for (c = 0; c < C; c++)
{
//height of filter
for (p = 0; p < K; p++)
{
//width idx of filter
for (q = 0; q < K; q++)
{
total += x4d(n, c, h + p, w + q) * k4d(m, c, p, q);
}
}
}

y4d(n, m, h, w) = total;

// An example use of these macros:
// float a = y4d(0,0,0,0)
// y4d(0,0,0,0) = a

#undef y4d
#undef x4d
Expand All @@ -49,21 +80,37 @@ void forward<gpu, float>(mshadow::Tensor<gpu, 4, float> &y, const mshadow::Tenso

// Use mxnet's CHECK_EQ to do assertions.
// Remove this assertion when you do your implementation!
CHECK_EQ(0, 1) << "Remove this line and replace with your implementation";
//CHECK_EQ(0, 1) << "Remove this line and replace with your implementation";

// Extract the tensor dimensions into B,M,C,H,W,K
// ...

const int B = x.shape_[0]; //number of output images
const int M = y.shape_[1]; //number of output feature maps
const int C = x.shape_[1]; //number of input feature maps
const int H = x.shape_[2]; //height of output elements
const int W = x.shape_[3]; //width of output element
const int K = w.shape_[3]; //dimension of the filters, width and height

// Set the kernel dimensions
// dim3 gridDim(0);
// dim3 blockDim(0);
const int H_out = H - K + 1; // the output after removing the edges
const int W_out = W - K + 1;

int W_grid = ceil(W_out / (float)TILE_WIDTH); // number of horizontal tiles per output map
int H_grid = ceil(H_out / (float)TILE_WIDTH); // number of vertical tiles per output map
int Z = H_grid * W_grid;

printf("Num Output Feature Maps: %d ", M);
printf(" Num Input Feature Maps: %d ", C);
printf(" Filter Size: %d ", K);

dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
dim3 gridDim(B, M, Z); //num of output images, number of output feature maps, total tiles

// Call the kernel
// forward_kernel<<<gridDim, blockDim, 0, s>>>(y.dptr_,x.dptr_,w.dptr_, B,M,C,H,W,K);
forward_kernel<<<gridDim, blockDim>>>(y.dptr_, x.dptr_, w.dptr_, B, M, C, H, W, K);

// Use MSHADOW_CUDA_CALL to check for CUDA runtime errors.
MSHADOW_CUDA_CALL(cudaDeviceSynchronize());

}

/*
Expand All @@ -73,9 +120,9 @@ void forward<gpu, float>(mshadow::Tensor<gpu, 4, float> &y, const mshadow::Tenso
template <typename gpu, typename DType>
void forward(mshadow::Tensor<gpu, 4, DType> &y, const mshadow::Tensor<gpu, 4, DType> &x, const mshadow::Tensor<gpu, 4, DType> &w)
{
CHECK_EQ(0,1) << "Remove this line and replace it with your implementation.";
}
}
CHECK_EQ(0, 1) << "Remove this line and replace it with your implementation.";
}
} // namespace op
} // namespace mxnet

#endif
43 changes: 32 additions & 11 deletions ece408_src/new-forward.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ namespace mxnet
namespace op
{


template <typename cpu, typename DType>
void forward(mshadow::Tensor<cpu, 4, DType> &y, const mshadow::Tensor<cpu, 4, DType> &x, const mshadow::Tensor<cpu, 4, DType> &k)
{
Expand All @@ -21,23 +20,45 @@ void forward(mshadow::Tensor<cpu, 4, DType> &y, const mshadow::Tensor<cpu, 4, DT
*/

const int B = x.shape_[0];
// const int M = y.shape_[1];
// const int C = x.shape_[1];
// const int H = x.shape_[2];
// const int W = x.shape_[3];
// const int K = k.shape_[3];
const int M = y.shape_[1];
const int C = x.shape_[1];
const int H = x.shape_[2];
const int W = x.shape_[3];
const int K = k.shape_[3];

for (int b = 0; b < B; ++b) {
for (int b = 0; b < B; ++b) //number of images
{

CHECK_EQ(0,1) << "Remove this line and replace it with your implementation.";
// CHECK_EQ(0,1) << "Remove this line and replace it with your implementation.";

/* ... a bunch of nested loops later...
y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q];
*/
for (int m = 0; m < M; ++m) //number of output feature maps
{
for (int h = 0; h < H; ++h) //height of output elements
{
for (int w = 0; w < W; ++w)//width of output element
{
y[b][m][h][w] = 0; //sets output to be zero
for (int c = 0; c < C; ++c) //num of input feature maps
{
for (int p = 0; p < K; ++p) //height of filter
{
for (int q = 0; q < K; ++q) //width of filter
{
y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q];
}
}
}
}
}
}
}

}
}
}
} // namespace op
} // namespace mxnet

#endif


3 changes: 2 additions & 1 deletion rai_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@ commands:
- nice -n20 make -C /mxnet # build mxnet
- pip2 install --user -e /mxnet/python # install python bindings
# <----Don't change the code above---->
- /usr/bin/time python m1.1.py # execute code
#- /usr/bin/time python m1.1.py # execute code
# - /usr/bin/time python m1.2.py # execute code
# - /usr/bin/time python m2.1.py # execute code
# - /usr/bin/time python m3.1.py # execute code
# - /usr/bin/time python m4.1.py # execute code
# - nvprof -f -o timeline.nvvp python m1.2.py # execute code
# - nvprof -f --analysis-metrics -o analysis.nvvp python m1.2.py # execute code
- nvprof -o timeline.nvprof python m3.1.py

0 comments on commit cf89e4d

Please sign in to comment.