diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 753f57ccc25..a4e06d4564e 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -440,6 +440,8 @@ class DLL_EXPORT Layer {
   void Lock();
   /** Unlock forward_mutex_ if this layer is shared */
   void Unlock();
+
+  void layer_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
 
   DISABLE_COPY_AND_ASSIGN(Layer);
 };  // class Layer
@@ -448,8 +450,43 @@ class DLL_EXPORT Layer {
 // gpu specific implementations instead, and should not change these
 // functions.
 template <typename Dtype>
-Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top);
+inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  // Lock during forward to ensure sequential forward
+  Lock();
+  Dtype loss = 0;
+  Reshape(bottom, top);
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    Forward_cpu(bottom, top);
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) { continue; }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->cpu_data();
+      const Dtype* loss_weights = top[top_id]->cpu_diff();
+      loss += caffe_cpu_dot(count, data, loss_weights);
+    }
+    break;
+  case Caffe::GPU:
+    Forward_gpu(bottom, top);
+#ifndef CPU_ONLY
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) { continue; }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->gpu_data();
+      const Dtype* loss_weights = top[top_id]->gpu_diff();
+      Dtype blob_loss = 0;
+      layer_gpu_dot(count, data, loss_weights, &blob_loss);
+      loss += blob_loss;
+    }
+#endif
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
+  Unlock();
+  return loss;
+}
 
 template <typename Dtype>
 inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp
index 8b88a5aa801..c5b053c259f 100644
--- a/src/caffe/layer.cpp
+++ b/src/caffe/layer.cpp
@@ -23,44 +23,11 @@ void Layer<Dtype>::Unlock() {
 }
 
 template <typename Dtype>
-Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  // Lock during forward to ensure sequential forward
-  Lock();
-  Dtype loss = 0;
-  Reshape(bottom, top);
-  switch (Caffe::mode()) {
-  case Caffe::CPU:
-    Forward_cpu(bottom, top);
-    for (int top_id = 0; top_id < top.size(); ++top_id) {
-      if (!this->loss(top_id)) { continue; }
-      const int count = top[top_id]->count();
-      const Dtype* data = top[top_id]->cpu_data();
-      const Dtype* loss_weights = top[top_id]->cpu_diff();
-      loss += caffe_cpu_dot(count, data, loss_weights);
-    }
-    break;
-  case Caffe::GPU:
-    Forward_gpu(bottom, top);
-#ifndef CPU_ONLY
-    for (int top_id = 0; top_id < top.size(); ++top_id) {
-      if (!this->loss(top_id)) { continue; }
-      const int count = top[top_id]->count();
-      const Dtype* data = top[top_id]->gpu_data();
-      const Dtype* loss_weights = top[top_id]->gpu_diff();
-      Dtype blob_loss = 0;
-      caffe_gpu_dot(count, data, loss_weights, &blob_loss);
-      loss += blob_loss;
-    }
-#endif
-    break;
-  default:
-    LOG(FATAL) << "Unknown caffe mode.";
-  }
-  Unlock();
-  return loss;
+void Layer<Dtype>::layer_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out) {
+  caffe_gpu_dot(n, x, y, out);
 }
 
+
 INSTANTIATE_CLASS(Layer);
 
 }  // namespace caffe
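
For context: the patch moves `Layer<Dtype>::Forward` back into the header as an inline function, but routes its one GPU BLAS call through a new exported member, `layer_gpu_dot`, whose body stays in `layer.cpp`. The likely reason is DLL linkage on Windows: inline header code gets compiled into client binaries, which can only resolve symbols the Caffe DLL actually exports, and `caffe_gpu_dot` is an internal free function while `Layer` itself is marked `DLL_EXPORT`. Below is a minimal, single-file sketch of that wrapper pattern; `MiniLayer`, `internal_dot`, and `WeightedLoss` are hypothetical stand-ins for `Layer`, `caffe_gpu_dot`, and the loss-accumulation loop in `Forward`, and the macro handling is simplified (a real build would switch to `__declspec(dllimport)` on the client side).

```cpp
// Hypothetical sketch of the export-wrapper pattern in the diff above.
// Compiles as an ordinary program, e.g. `g++ -std=c++11 sketch.cpp`.
#include <iostream>
#include <vector>

#if defined(_WIN32)
  #define DLL_EXPORT __declspec(dllexport)  // clients would use dllimport
#else
  #define DLL_EXPORT                        // no-op outside Windows
#endif

// Stand-in for caffe_gpu_dot: lives inside the library and is NOT exported,
// so code compiled into client binaries must not reference it directly.
template <typename Dtype>
static void internal_dot(const int n, const Dtype* x, const Dtype* y,
                         Dtype* out) {
  Dtype sum = 0;
  for (int i = 0; i < n; ++i) { sum += x[i] * y[i]; }
  *out = sum;
}

template <typename Dtype>
class DLL_EXPORT MiniLayer {
 public:
  // Exported wrapper, analogous to Layer::layer_gpu_dot: declared in the
  // header, defined out of line so the call into the non-exported helper
  // is compiled into the library rather than into client code.
  void layer_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);

  // Inline in the header, like the relocated Layer::Forward: it only calls
  // exported members, never the internal free function.
  inline Dtype WeightedLoss(const std::vector<Dtype>& data,
                            const std::vector<Dtype>& weights) {
    Dtype loss = 0;
    layer_dot(static_cast<int>(data.size()), data.data(), weights.data(),
              &loss);
    return loss;
  }
};

// In the real patch this definition sits in src/caffe/layer.cpp.
template <typename Dtype>
void MiniLayer<Dtype>::layer_dot(const int n, const Dtype* x, const Dtype* y,
                                 Dtype* out) {
  internal_dot(n, x, y, out);
}

int main() {
  MiniLayer<float> layer;
  std::vector<float> data = {1.0f, 2.0f, 3.0f};
  std::vector<float> weights = {0.5f, 0.5f, 0.5f};
  std::cout << layer.WeightedLoss(data, weights) << std::endl;  // prints 3
}
```

Note that exporting a class template this way only yields concrete symbols because `layer.cpp` explicitly instantiates the class (the `INSTANTIATE_CLASS(Layer);` visible in the diff), so the `float` and `double` versions of `layer_gpu_dot` are actually emitted into the library.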