basic gpu code

GongCode · Apr 17, 2019 · cf89e4d · cf89e4d
1 parent 36a3905
commit cf89e4d
Show file tree

Hide file tree

Showing 3 changed files with 97 additions and 28 deletions.
diff --git a/ece408_src/new-forward.cuh b/ece408_src/new-forward.cuh
@@ -4,12 +4,16 @@
 
 #include <mxnet/base.h>
 
+#define TILE_WIDTH 16
+#define CONSTANT_MASK_SIZE 3000
+
 namespace mxnet
 {
 namespace op
 {
 
-__global__ void forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const int H, const int W, const int K)
+__global__ void
+forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const int H, const int W, const int K)
 {
 
     /*
@@ -18,20 +22,47 @@ __global__ void forward_kernel(float *y, const float *x, const float *k, const i
     The goal here is to be correct AND fast.
     We have some nice #defs for you below to simplify indexing. Feel free to use them, or create your own.
     */
+    int n, m, c, h, w, p, q;
 
     const int H_out = H - K + 1;
     const int W_out = W - K + 1;
-    (void)H_out; // silence declared but never referenced warning. remove this line when you start working
-    (void)W_out; // silence declared but never referenced warning. remove this line when you start working
 
-// An example use of these macros:
-// float a = y4d(0,0,0,0)
-// y4d(0,0,0,0) = a
+    //helps us index the pointers
 #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
 #define x4d(i3, i2, i1, i0) x[(i3) * (C * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 #define k4d(i3, i2, i1, i0) k[(i3) * (C * K * K) + (i2) * (K * K) + (i1) * (K) + i0]
 
-
+    n = blockIdx.x; //idx of images
+    m = blockIdx.y; //idx of features
+    const int H_Grid = ceil(H_out / (float)TILE_WIDTH);
+    const int W_Grid = ceil(W_out / (float)TILE_WIDTH); //round up
+    h = blockIdx.z / (H_Grid)*TILE_WIDTH + threadIdx.y; //y idx of output tile
+    w = blockIdx.z % (W_Grid)*TILE_WIDTH + threadIdx.x; //x idx of output tile
+
+    if (h >= H_out || w >= W_out)
+        return;
+
+    float total = 0.0;
+
+    //num of input feature maps
+    for (c = 0; c < C; c++)
+    {
+        //height of filter
+        for (p = 0; p < K; p++)
+        {
+            //width idx of filter
+            for (q = 0; q < K; q++)
+            {
+                total += x4d(n, c, h + p, w + q) * k4d(m, c, p, q);
+            }
+        }
+    }
+
+    y4d(n, m, h, w) = total;
+
+    // An example use of these macros:
+    // float a = y4d(0,0,0,0)
+    // y4d(0,0,0,0) = a
 
 #undef y4d
 #undef x4d
@@ -49,21 +80,37 @@ void forward<gpu, float>(mshadow::Tensor<gpu, 4, float> &y, const mshadow::Tenso
 
     // Use mxnet's CHECK_EQ to do assertions.
     // Remove this assertion when you do your implementation!
-    CHECK_EQ(0, 1) << "Remove this line and replace with your implementation";
+    //CHECK_EQ(0, 1) << "Remove this line and replace with your implementation";
 
     // Extract the tensor dimensions into B,M,C,H,W,K
-    // ...
+
+    const int B = x.shape_[0]; //number of output images
+    const int M = y.shape_[1]; //number of output feature maps
+    const int C = x.shape_[1]; //number of input feature maps
+    const int H = x.shape_[2]; //height of output elements
+    const int W = x.shape_[3]; //width of output element
+    const int K = w.shape_[3]; //dimension of the filters, width and height
 
     // Set the kernel dimensions
-    // dim3 gridDim(0);
-    // dim3 blockDim(0);
+    const int H_out = H - K + 1; // the output after removing the edges
+    const int W_out = W - K + 1;
+
+    int W_grid = ceil(W_out / (float)TILE_WIDTH); // number of horizontal tiles per output map
+    int H_grid = ceil(H_out / (float)TILE_WIDTH); // number of vertical tiles per output map
+    int Z = H_grid * W_grid;
+
+    printf("Num Output Feature Maps: %d ", M);
+    printf(" Num Input Feature Maps: %d ", C);
+    printf(" Filter Size: %d ", K);
+
+    dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
+    dim3 gridDim(B, M, Z); //num of output images, number of output feature maps, total tiles
 
     // Call the kernel
-    // forward_kernel<<<gridDim, blockDim, 0, s>>>(y.dptr_,x.dptr_,w.dptr_, B,M,C,H,W,K);
+    forward_kernel<<<gridDim, blockDim>>>(y.dptr_, x.dptr_, w.dptr_, B, M, C, H, W, K);
 
     // Use MSHADOW_CUDA_CALL to check for CUDA runtime errors.
     MSHADOW_CUDA_CALL(cudaDeviceSynchronize());
-
 }
 
 /* 
@@ -73,9 +120,9 @@ void forward<gpu, float>(mshadow::Tensor<gpu, 4, float> &y, const mshadow::Tenso
 template <typename gpu, typename DType>
 void forward(mshadow::Tensor<gpu, 4, DType> &y, const mshadow::Tensor<gpu, 4, DType> &x, const mshadow::Tensor<gpu, 4, DType> &w)
 {
-    CHECK_EQ(0,1) << "Remove this line and replace it with your implementation.";
-}
-}
+    CHECK_EQ(0, 1) << "Remove this line and replace it with your implementation.";
 }
+} // namespace op
+} // namespace mxnet
 
 #endif
diff --git a/ece408_src/new-forward.h b/ece408_src/new-forward.h
@@ -9,7 +9,6 @@ namespace mxnet
 namespace op
 {
 
-
 template <typename cpu, typename DType>
 void forward(mshadow::Tensor<cpu, 4, DType> &y, const mshadow::Tensor<cpu, 4, DType> &x, const mshadow::Tensor<cpu, 4, DType> &k)
 {
@@ -21,23 +20,45 @@ void forward(mshadow::Tensor<cpu, 4, DType> &y, const mshadow::Tensor<cpu, 4, DT
     */
 
     const int B = x.shape_[0];
-    // const int M = y.shape_[1];
-    // const int C = x.shape_[1];
-    // const int H = x.shape_[2];
-    // const int W = x.shape_[3];
-    // const int K = k.shape_[3];
+    const int M = y.shape_[1];
+    const int C = x.shape_[1];
+    const int H = x.shape_[2];
+    const int W = x.shape_[3];
+    const int K = k.shape_[3];
 
-    for (int b = 0; b < B; ++b) {
+    for (int b = 0; b < B; ++b) //number of images
+    {
 
-        CHECK_EQ(0,1) << "Remove this line and replace it with your implementation.";
+        // CHECK_EQ(0,1) << "Remove this line and replace it with your implementation.";
 
         /* ... a bunch of nested loops later...
             y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q];
         */
+        for (int m = 0; m < M; ++m) //number of output feature maps
+        {
+            for (int h = 0; h < H; ++h) //height of output elements
+            {
+                for (int w = 0; w < W; ++w)//width of output element
+                {
+                    y[b][m][h][w] = 0; //sets output to be zero
+                    for (int c = 0; c < C; ++c) //num of input feature maps
+                    {
+                        for (int p = 0; p < K; ++p) //height of filter
+                        {
+                            for (int q = 0; q < K; ++q) //width of filter
+                            {
+                                y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q];
+                            }
+                        }
+                    }
+                }
+            }
+        }
     }
-
-}
-}
 }
+} // namespace op
+} // namespace mxnet
 
 #endif
+
+
diff --git a/rai_build.yml b/rai_build.yml
@@ -16,10 +16,11 @@ commands:
     - nice -n20 make -C /mxnet # build mxnet
     - pip2 install --user -e /mxnet/python   # install python bindings
     # <----Don't change the code above---->
-    - /usr/bin/time python m1.1.py # execute code
+    #- /usr/bin/time python m1.1.py # execute code
     # - /usr/bin/time python m1.2.py # execute code
     # - /usr/bin/time python m2.1.py # execute code
     # - /usr/bin/time python m3.1.py # execute code
     # - /usr/bin/time python m4.1.py # execute code
     # - nvprof -f -o timeline.nvvp python m1.2.py # execute code
     # - nvprof -f --analysis-metrics -o analysis.nvvp python m1.2.py # execute code
+    -  nvprof -o timeline.nvprof python m3.1.py