diff --git a/include/singa/neuralnet/neuron_layer.h b/include/singa/neuralnet/neuron_layer.h
index e6f0fd5c9f..f1bd4432e4 100644
--- a/include/singa/neuralnet/neuron_layer.h
+++ b/include/singa/neuralnet/neuron_layer.h
@@ -112,6 +112,7 @@ class DropoutLayer : public NeuronLayer {
    */
   Blob<float> mask_;
 };
+
 /**
  * This layer is dummy and do no real work.
  * It is used for testing purpose only.
@@ -126,7 +127,7 @@ class DummyLayer: public NeuronLayer {
   void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
   void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  void Feed(int batchsize, vector<float>& data, vector<int>& aux_data);
+  void Feed(vector<int> shape, vector<float>* data, int op);
   Layer* ToLayer() { return this;}
 
  private:
@@ -278,7 +279,7 @@ class PoolingLayer : public NeuronLayer {
   int kernel_x_, pad_x_, stride_x_;
   int kernel_y_, pad_y_, stride_y_;
   int batchsize_, channels_, height_, width_, pooled_height_, pooled_width_;
-  PoolingProto_PoolMethod pool_;
+  PoolMethod pool_;
 };
 /**
  * Use book-keeping for BP following Caffe's pooling implementation
diff --git a/src/neuralnet/neuron_layer/dummy.cc b/src/neuralnet/neuron_layer/dummy.cc
index 9796407561..a3bec97b38 100644
--- a/src/neuralnet/neuron_layer/dummy.cc
+++ b/src/neuralnet/neuron_layer/dummy.cc
@@ -78,25 +78,53 @@ void DummyLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
     Copy(grad_, srclayers[0]->mutable_grad(this));
 }
 
-void DummyLayer::Feed(int batchsize, vector<float>& data, vector<int>& aux_data){
+/**
+ * Feeds caller-provided values into this layer.
+ *
+ * @param shape  target shape for data_; shape[0] is used as the batch
+ *               size, so at least one dimension is required.
+ * @param data   values to copy in; must not be null.
+ * @param op     selects the destination: 0 reshapes data_ to `shape`
+ *               and copies all of `data` into it (sizes must match);
+ *               any other value stores the first shape[0] entries of
+ *               `data`, cast to int, in aux_data_ (e.g. labels).
+ */
+void DummyLayer::Feed(vector<int> shape, vector<float>* data, int op){
 
-    batchsize_ = batchsize;
-    // input data
-    if (data.size() > 0) {
-      int size = data.size();
+    // Reject bad arguments up front instead of dereferencing a null
+    // pointer or indexing an empty shape.
+    CHECK(data != nullptr) << "Feed() requires a non-null data vector";
+    CHECK(!shape.empty()) << "Feed() requires at least a batch dimension";
+
+    // the leading dimension is the batch size
+    batchsize_ = shape[0];
+    CHECK_GE(batchsize_, 0) << "batch size must not be negative";
+
+    // dataset
+    if (op == 0) {
+      // reshape data_, then make sure the payload fills it exactly
+      data_.Reshape(shape);
+      CHECK_EQ(static_cast<size_t>(data_.count()), data->size());
+
+      // element-wise copy into the blob's CPU buffer
+      int size = data->size();
       float* ptr = data_.mutable_cpu_data();
       for (int i = 0; i< size; i++) { 
-          ptr[i] = data.at(i);
+          ptr[i] = data->at(i);
       }
     }
-    // auxiliary data, e.g., label
-    if (aux_data.size() > 0) {
+    // label
+    else {
+      // one entry per sample is needed; check here so that a short
+      // vector is reported clearly rather than thrown from at()
+      CHECK_GE(data->size(), static_cast<size_t>(batchsize_));
       aux_data_.resize(batchsize_);
       for (int i = 0; i< batchsize_; i++) {
-          aux_data_[i] = static_cast<int>(aux_data.at(i));
+          aux_data_[i] = static_cast<int>(data->at(i));
       }
     }
+
     return;
 }
 
 }  // namespace singa
diff --git a/src/neuralnet/neuron_layer/pooling.cc b/src/neuralnet/neuron_layer/pooling.cc
index 4eda2e4097..07a88d9c65 100644
--- a/src/neuralnet/neuron_layer/pooling.cc
+++ b/src/neuralnet/neuron_layer/pooling.cc
@@ -58,8 +58,8 @@ void PoolingLayer::Setup(const LayerProto& conf,
   }
 
   pool_ = conf.pooling_conf().pool();
-  CHECK(pool_ == PoolingProto_PoolMethod_AVG
-        || pool_ == PoolingProto_PoolMethod_MAX)
+  CHECK(pool_ == PoolMethod::AVG
+        || pool_ == PoolMethod::MAX)
         << "Padding implemented only for average and max pooling.";
   const auto& srcshape = srclayers[0]->data(this).shape();
   int dim = srcshape.size();
@@ -83,9 +83,9 @@ void PoolingLayer::Setup(const LayerProto& conf,
 void PoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto data = Tensor4(&data_);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
+  if (pool_ == PoolMethod::MAX)
     data = expr::pool<red::maximum>(src, kernel_x_, stride_x_);
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
+  else if (pool_ == PoolMethod::AVG)
     data = expr::pool<red::sum>(src, kernel_x_, stride_x_)
       * (1.0f / (kernel_x_ * kernel_x_));
 }
@@ -99,9 +99,9 @@ void PoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
   auto data = Tensor4(&data_);
   auto grad = Tensor4(&grad_);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
+  if (pool_ == PoolMethod::MAX)
     gsrc = expr::unpool<red::maximum>(src, data, grad, kernel_x_, stride_x_);
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
+  else if (pool_ == PoolMethod::AVG)
     gsrc = expr::unpool<red::sum>(src, data, grad, kernel_x_, stride_x_)
            * (1.0f / (kernel_x_ * kernel_x_));
 }
@@ -111,16 +111,16 @@ void PoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
 void CPoolingLayer::Setup(const LayerProto& conf,
     const vector<Layer*>& srclayers) {
   PoolingLayer::Setup(conf, srclayers);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
+  if (pool_ == PoolMethod::MAX)
       mask_.ReshapeLike(data_);
 }
 void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (pool_ == PoolingProto_PoolMethod_MAX)
+  if (pool_ == PoolMethod::MAX)
     ForwardMaxPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
         batchsize_, channels_, height_, width_, kernel_y_, kernel_x_,
         pad_y_, pad_y_, stride_y_, stride_x_,
         data_.mutable_cpu_data(), mask_.mutable_cpu_data());
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
+  else if (pool_ == PoolMethod::AVG)
     ForwardAvgPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
         batchsize_, channels_, height_, width_, kernel_y_, kernel_x_,
         pad_y_, pad_x_, stride_y_, stride_y_, data_.mutable_cpu_data());
@@ -129,12 +129,12 @@ void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
 }
 
 void CPoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  if (pool_ == PoolingProto_PoolMethod_MAX)
+  if (pool_ == PoolMethod::MAX)
     BackwardMaxPooling(grad_.cpu_data(), mask_.cpu_data(), batchsize_,
         channels_, height_, width_, kernel_y_, kernel_x_, pad_y_, pad_x_,
-        stride_y_, stride_y_,
+        stride_y_, stride_x_,  // same y/x order as the AVG branch below
         srclayers[0]->mutable_grad(this)->mutable_cpu_data());
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
+  else if (pool_ == PoolMethod::AVG)
     BackwardAvgPooling(grad_.cpu_data(), batchsize_,
         channels_, height_, width_, kernel_y_, kernel_x_, pad_y_, pad_x_,
         stride_y_, stride_x_,
diff --git a/src/proto/job.proto b/src/proto/job.proto
index b4aa9714d7..0e8ef9f9d5 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -522,15 +522,15 @@ message LRNProto {
   // offset
   optional float knorm = 34 [default = 1.0];
 }
-
+enum PoolMethod {
+  MAX = 0;
+  AVG = 1;
+}
+ 
 message PoolingProto {
   // The kernel size (square)
   optional int32 kernel= 1 [default = 3];
-  enum PoolMethod {
-    MAX = 0;
-    AVG = 1;
-  }
-  // The pooling method
+  // The pooling method
   optional PoolMethod pool = 30 [default = MAX];
   // The padding size
   optional uint32 pad = 31 [default = 0];
diff --git a/thirdparty/install.sh b/thirdparty/install.sh
index bc3c159934..9dcb274884 100755
--- a/thirdparty/install.sh
+++ b/thirdparty/install.sh
@@ -256,19 +256,19 @@ function install_protobuf()
 			echo "install protobuf in $1";
 			./configure --prefix=$1;
 			make && make install;
-			#cd python;
-			#python setup.py build;
-			#python setup.py install --prefix=$1;
-			#cd ..;
+			cd python;
+			python setup.py build;
+			python setup.py install --prefix=$1;
+			cd ..;
 		elif [ $# == 0 ]
 		then
 			echo "install protobuf in default path";
 			./configure;
 			make && sudo make install;
-			#cd python;
-			#python setup.py build;
-			#sudo python setup.py install;
-			#cd ..;
+			cd python;
+			python setup.py build;
+			sudo python setup.py install;
+			cd ..;
 		else
 			echo "wrong commands";
 	fi
diff --git a/tool/dlaas/main.py b/tool/dlaas/main.py
new file mode 100644
index 0000000000..e05d176da3
--- /dev/null
+++ b/tool/dlaas/main.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+
+#/************************************************************
+#*
+#* Licensed to the Apache Software Foundation (ASF) under one
+#* or more contributor license agreements.  See the NOTICE file
+#* distributed with this work for additional information
+#* regarding copyright ownership.  The ASF licenses this file
+#* to you under the Apache License, Version 2.0 (the
+#* "License"); you may not use this file except in compliance
+#* with the License.  You may obtain a copy of the License at
+#*
+#*   http://www.apache.org/licenses/LICENSE-2.0
+#*
+#* Unless required by applicable law or agreed to in writing,
+#* software distributed under the License is distributed on an
+#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#* KIND, either express or implied.  See the License for the
+#* specific language governing permissions and limitations
+#* under the License.
+#*
+#*************************************************************/
+
+import os, sys
+import numpy as np
+
+current_path_ = os.path.dirname(__file__)
+singa_root_=os.path.abspath(os.path.join(current_path_,'../..'))
+sys.path.append(os.path.join(singa_root_,'thirdparty','protobuf-2.6.0','python'))
+sys.path.append(os.path.join(singa_root_,'tool','python'))
+
+from model import neuralnet, updater
+from singa.driver import Driver
+from singa.layer import *
+from singa.model import save_model_parameter, load_model_parameter 
+from singa.utils.utility import swap32
+
+from PIL import Image
+import glob,random, shutil, time
+from flask import Flask, request, redirect, url_for, Response
+from singa.utils import kvstore, imgtool
+app = Flask(__name__)
+
+def train(batchsize,disp_freq,check_freq,train_step,workspace,checkpoint=None):
+    print '[Layer registration/declaration]'
+    # TODO change layer registration methods
+    d = Driver()
+    d.Init(sys.argv)
+
+    print '[Start training]'
+
+    #if need to load checkpoint
+    if checkpoint:
+        load_model_parameter(workspace+checkpoint, neuralnet, batchsize)
+   
+    for i in range(0,train_step):       
+    
+        for h in range(len(neuralnet)):
+            #Fetch data for input layer
+            if neuralnet[h].layer.type==kDummy:
+                neuralnet[h].FetchData(batchsize)
+            else:
+                neuralnet[h].ComputeFeature()
+    
+        neuralnet[h].ComputeGradient(i+1, updater)
+    
+        if (i+1)%disp_freq == 0:
+            print '  Step {:>3}: '.format(i+1),
+            neuralnet[h].display()
+    
+        if (i+1)%check_freq == 0:   
+            save_model_parameter(i+1, workspace, neuralnet)
+
+
+    print '[Finish training]'
+    
+
+def product(workspace,checkpoint):
+    
+    print '[Layer registration/declaration]'
+    # TODO change layer registration methods
+    d = Driver()
+    d.Init(sys.argv)
+
+    load_model_parameter(workspace+checkpoint, neuralnet,1) 
+   
+    app.debug = True
+    app.run(host='0.0.0.0', port=80)
+
+
+@app.route("/")
+def index():
+    return "Hello World! This is SINGA DLAAS! Please send post request with image=file to '/predict' "
+
+def allowed_file(filename):
+    allowd_extensions_ = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])
+    return '.' in filename and \
+           filename.rsplit('.', 1)[1] in allowd_extensions_
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    size_=(32,32)
+    pixel_length_=3*size_[0]*size_[1]
+    label_num_=10
+    if request.method == 'POST':
+        file = request.files['image']
+        if file and allowed_file(file.filename):
+            im = Image.open(file).convert("RGB")
+            im = imgtool.resize_to_center(im,size_)
+            pixel = floatVector(pixel_length_) 
+            byteArray = imgtool.toBin(im,size_)
+            data = np.frombuffer(byteArray, dtype=np.uint8)
+            data = data.reshape(1, pixel_length_)
+            #dummy data Layer
+            shape = intVector(4)
+            shape[0]=1
+            shape[1]=3
+            shape[2]=size_[0]
+            shape[3]=size_[1]
+            
+            for h in range(len(neuralnet)):
+            #Fetch data for input layer
+                if neuralnet[h].is_datalayer:
+                    if not neuralnet[h].is_label:
+                        neuralnet[h].Feed(data,3)
+                    else:
+                        neuralnet[h].FetchData(1)
+                else:
+                    neuralnet[h].ComputeFeature()
+    
+            #get result
+            #data = neuralnet[-1].get_singalayer().data(neuralnet[-1].get_singalayer())    
+            #prop =floatArray_frompointer(data.mutable_cpu_data())
+            prop = neuralnet[-1].GetData()
+            print prop
+            result=[]
+            for i in range(label_num_):
+                result.append((i,prop[i])) 
+        
+            result.sort(key=lambda tup: tup[1], reverse=True)
+            print result
+            response="" 
+            for r in result:
+                response+=str(r[0])+":"+str(r[1])+"\n" 
+            resp = Response(response)
+            resp.headers['Access-Control-Allow-Origin'] = '*'
+            return resp 
+    return "error"
+
+    
+if __name__=='__main__':
+   
+    if sys.argv[1]=="train":
+        if len(sys.argv) < 6:
+            print "argv should be more than 6"
+            exit()
+        if len(sys.argv) > 6:
+            checkpoint = sys.argv[6]
+        else:
+            checkpoint = None
+        #training
+        train(
+          batchsize = int(sys.argv[2]), 
+          disp_freq = int(sys.argv[3]),
+          check_freq = int(sys.argv[4]), 
+          train_step = int(sys.argv[5]),
+          workspace = '/workspace',
+          checkpoint = checkpoint,
+          )
+    else:
+        if len(sys.argv) < 3:
+            print "argv should be more than 2"
+            exit()
+        checkpoint = sys.argv[2]
+        product(
+          workspace = '/workspace',
+          checkpoint = checkpoint 
+        )
+
diff --git a/tool/dlaas/model.py b/tool/dlaas/model.py
new file mode 100644
index 0000000000..8030a88844
--- /dev/null
+++ b/tool/dlaas/model.py
@@ -0,0 +1,34 @@
+import os, sys
+import numpy as np
+
+current_path_ = os.path.dirname(__file__)
+singa_root_="/usr/src/incubator-singa"
+sys.path.append(os.path.join(singa_root_,'tool','python'))
+
+from singa.driver import Driver
+from singa.layer import *
+from singa.model import *
+from singa.utils.utility import swap32
+
+
+imageData_0=ImageData(shape=[50000,3,32,32],data_path="/workspace/data/train.bin",data_type="byte",mean_path="/workspace/data/mean.bin",mean_type="float")
+labelData_0=LabelData(shape=[50000,1],label_path="/workspace/data/train.label.bin",label_type="int")
+convolution_0=Convolution2D(32,5,1,2,w_std=0.0001, b_lr=2,src=[imageData_0])
+pooling_0=MaxPooling2D(pool_size=(3,3), stride=2,src=[convolution_0])
+activation_0=Activation("relu",src=[pooling_0])
+lrn_0=LRN2D(3, alpha=0.00005, beta=0.75,src=[activation_0])
+convolution_1=Convolution2D(32,5,1,2, b_lr=2,src=[lrn_0])
+activation_1=Activation("relu",src=[convolution_1])
+pooling_1=AvgPooling2D(pool_size=(3,3), stride=2,src=[activation_1])
+lrn_1=LRN2D(3, alpha=0.00005, beta=0.75,src=[pooling_1])
+convolution_2=Convolution2D(64,5,1,2, b_lr=2,src=[lrn_1])
+activation_2=Activation("relu",src=[convolution_2])
+pooling_2=AvgPooling2D(pool_size=(3,3), stride=2,src=[activation_2])
+dense_0=Dense(10, w_wd=250, b_lr=2, b_wd=0,src=[pooling_2])
+loss_0=Loss("softmaxloss",src=[dense_0,labelData_0])
+neuralnet = [imageData_0,labelData_0,convolution_0,pooling_0,activation_0,lrn_0,convolution_1,activation_1,pooling_1,lrn_1,convolution_2,activation_2,pooling_2,dense_0,loss_0]
+
+
+
+#algorithm
+updater = SGD(decay=0.004, momentum=0.9, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
\ No newline at end of file
diff --git a/tool/dlaas/test.sh b/tool/dlaas/test.sh
new file mode 100644
index 0000000000..d285df9d53
--- /dev/null
+++ b/tool/dlaas/test.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Fetch and unpack the archive at URL $1, then run main.py in test mode with $2 $3.
+cd /workspace || exit 1
+wget "$1" && tar zxf *.tar.gz || exit 1
+cp /workspace/model.py /opt/incubator-singa/tool/dlaas/ || exit 1
+cd /opt/incubator-singa/ || exit 1
+python tool/dlaas/main.py test "${@:2:2}"
diff --git a/tool/dlaas/train.sh b/tool/dlaas/train.sh
new file mode 100644
index 0000000000..66cd8d84e6
--- /dev/null
+++ b/tool/dlaas/train.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Fetch and unpack the archive at URL $1, then run main.py in train mode with $2..$6.
+cd /workspace || exit 1
+wget "$1" && tar zxf *.tar.gz || exit 1
+cp /workspace/model.py /opt/incubator-singa/tool/dlaas/ || exit 1
+cd /opt/incubator-singa/ || exit 1
+python tool/dlaas/main.py train "${@:2:5}"
diff --git a/tool/docker/ubuntu-14.04/Dockerfile b/tool/docker/ubuntu-14.04/Dockerfile
new file mode 100644
index 0000000000..67e4662334
--- /dev/null
+++ b/tool/docker/ubuntu-14.04/Dockerfile
@@ -0,0 +1,30 @@
+FROM ubuntu:14.04
+MAINTAINER Aaron WWF<dcswuw@gmail.com>
+
+
+ENV PREFIX=/usr/local
+ENV LIBRARY_PATH=/opt/OpenBLAS/lib:$PREFIX/lib:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$PREFIX/lib:$LD_LIBRARY_PATH
+ENV CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$PREFIX/include:$CPLUS_INCLUDE_PATH
+ENV SINGA_HOME=/opt/incubator-singa
+ENV PATH=/opt/OpenBLAS/bin:$SINGA_HOME/bin:$PREFIX/bin:$PATH
+
+RUN apt-get update && apt-get install -y git unzip make autotools-dev \
+    automake autoconf wget gcc g++ libtool python2.7 python2.7-dev \
+    build-essential swig python-pip libtiff5-dev libjpeg8-dev zlib1g-dev \
+    libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk
+
+RUN pip install Image flask numpy
+
+#RUN ln -s /usr/bin/python2.7 /usr/bin/python
+
+#use /bin/bash instead of /bin/sh
+RUN rm /bin/sh && ln -s /bin/bash /bin/sh
+
+COPY . $SINGA_HOME/
+
+RUN cd $SINGA_HOME && ./thirdparty/install.sh all \
+    && cd ./tool/python/singa && ./generatepy.sh \
+    && cd $SINGA_HOME && ./autogen.sh && ./configure --enable-python \
+    && make
+
diff --git a/tool/docker/ubuntu-14.04/build.sh b/tool/docker/ubuntu-14.04/build.sh
new file mode 100644
index 0000000000..efd622eb61
--- /dev/null
+++ b/tool/docker/ubuntu-14.04/build.sh
@@ -0,0 +1,4 @@
+cd ../../..
+cp tool/docker/ubuntu-14.04/Dockerfile .
+docker build -t singa .
+
diff --git a/tool/python/singa/driver.i b/tool/python/singa/driver.i
index 63f2287933..f28414e208 100644
--- a/tool/python/singa/driver.i
+++ b/tool/python/singa/driver.i
@@ -66,14 +66,14 @@ namespace singa{
       void InitNetParams(const std::string& folder, std::vector<singa::Layer*> net);
       void Checkpoint(int step, const std::string& folder, std::vector<singa::Layer*> net);
   };
-    
   class DummyLayer{
     public:
+      /* void Setup(const singa::LayerProto& proto, const std::vector<singa::Layer*>& srclayers);
+      */
       void Setup(const std::string str, const std::vector<singa::Layer*>& srclayers);
-      void Feed(int batchsize, std::vector<float>& data, std::vector<int>& aux_data);
+      void Feed(std::vector<int> shape, std::vector<float>* data, int op);
       singa::Layer* ToLayer();
-  };
-
+  }; 
   %nodefault Layer;
   class Layer{
     public:
diff --git a/tool/python/singa/layer.py b/tool/python/singa/layer.py
index c9a992d765..8a0750ab45 100644
--- a/tool/python/singa/layer.py
+++ b/tool/python/singa/layer.py
@@ -25,7 +25,7 @@
 This script includes Layer class and its subclasses that
 users can configure different types of layers for their model.
 '''
-import numpy as np
+import numpy
 from singa.parameter import Parameter, set_param_field
 from singa.initializations import get_init_values
 from singa.utils.utility import setval, generate_name
@@ -54,7 +54,8 @@ def __init__(self, **kwargs):
         # layer connectivity is set in Model.build()
         self.is_datalayer = False
         self.singalayer = None
-        self.srclayers = []
+        self.srclayers = [] 
+
 
         # set src for Rafiki
         if 'src' in kwargs:
@@ -63,19 +64,15 @@ def __init__(self, **kwargs):
             self.src = None
 
     def setup(self, srclys):
-        ''' Create singa::Layer and store srclayers
-        '''
+        # create singa::Layer and store srclayers 
         if self.singalayer == None:
-            self.singalayer = SingaLayer.CreateLayer(
-                                    self.layer.SerializeToString())
-            self.singaSrclayerVector = layerVector(len(srclys))
+            self.singalayer = SingaLayer.CreateLayer(self.layer.SerializeToString())  
+            self.singaSrclayerVector = layerVector(len(srclys)) 
             for i in range(len(srclys)):
                 self.srclayers.append(srclys[i])
                 self.singaSrclayerVector[i] = srclys[i].get_singalayer()
             # set up the layer
-            SingaLayer.SetupLayer(self.singalayer,
-                                  self.layer.SerializeToString(),
-                                  self.singaSrclayerVector)
+            SingaLayer.SetupLayer(self.singalayer, self.layer.SerializeToString(), self.singaSrclayerVector)
 
     def ComputeFeature(self, *srclys):
         ''' The method creates and sets up singa::Layer
@@ -84,38 +81,48 @@ def ComputeFeature(self, *srclys):
 
             *srclys = (list)  // a list of source layers
         '''
-        # create singa::Layer and store srclayers
+
+        # create singa::Layer and store srclayers 
         if self.singalayer == None:
             if self.src != None:
                 srclys = self.src
-            self.singalayer = SingaLayer.CreateLayer(
-                                    self.layer.SerializeToString())
-            self.singaSrclayerVector = layerVector(len(srclys))
+            self.singalayer = SingaLayer.CreateLayer(self.layer.SerializeToString())  
+            self.singaSrclayerVector = layerVector(len(srclys)) 
             for i in range(len(srclys)):
                 self.srclayers.append(srclys[i])
                 self.singaSrclayerVector[i] = srclys[i].get_singalayer()
             # set up the layer
-            SingaLayer.SetupLayer(self.singalayer,
-                                  self.layer.SerializeToString(),
-                                  self.singaSrclayerVector)
+            SingaLayer.SetupLayer(self.singalayer, self.layer.SerializeToString(), self.singaSrclayerVector)
 
         self.singalayer.ComputeFeature(1, self.singaSrclayerVector)
 
-    def ComputeGradient(self):
+    def ComputeGradient(self, step, upd=None):
         ''' The method creates singa::Updater
             and calls ComputeGradient for gradient computation
             then updates the parameters.
+
+            step = (int)    // a training step
+            upd = (object)  // Updater object
         '''
+
+        # create singa::Updater
+        assert upd != None, 'required Updater (see model.py)' 
+        if Layer.singaupdater == None:
+            Layer.singaupdater = SingaUpdater.CreateUpdater(upd.proto.SerializeToString()) 
+
         # call ComputeGradient of Singa
         self.singalayer.ComputeGradient(1, self.singaSrclayerVector)
 
-    def UpdateParams(self, step, upd):
-        ''' The method updates parameter values
-        '''
         # update parameters
         singaParams = self.singalayer.GetParams()
-        for par in singaParams:
-            upd.singaupdater.Update(step, par, 1.0)
+        for p in singaParams:
+            Layer.singaupdater.Update(step, p, 1.0)
+
+        # recursively call ComputeGradient of srclayers
+        #(TODO) what if there are multiple source layers???
+        for sly in self.srclayers:
+            if sly.srclayers != None:
+                sly.ComputeGradient(step, upd) 
 
     def GetParams(self):
         ''' The method gets parameter values
@@ -126,12 +133,12 @@ def GetParams(self):
         assert len(singaParams) == 2, 'weight and bias'
         # for weight
         weight_array = floatArray_frompointer(singaParams[0].mutable_cpu_data())
-        weight = [weight_array[i] for i in range(singaParams[0].size())]
-        weight = np.array(weight).reshape(singaParams[0].shape())
+        weight = [ weight_array[i] for i in range(singaParams[0].size()) ]
+        weight = numpy.array(weight).reshape(singaParams[0].shape())
         # for bias
         bias_array = floatArray_frompointer(singaParams[1].mutable_cpu_data())
-        bias = [bias_array[i] for i in range(singaParams[1].size())]
-        bias = np.array(bias).reshape(singaParams[1].shape()[0], 1)
+        bias = [ bias_array[i] for i in range(singaParams[1].size()) ]
+        bias = numpy.array(bias).reshape(singaParams[1].shape()[0], 1)
 
         return weight, bias
 
@@ -148,19 +155,17 @@ def SetParams(self, *params):
             bp.shape.append(int(params[k].shape[1]))
             for i in range(params[k].shape[0]):
                 for j in range(params[k].shape[1]):
-                    bp.data.append(params[k][i, j])
+                    bp.data.append(params[k][i,j])
             singaParams[k].FromProto(bp.SerializeToString())
 
     def GetData(self):
-        ''' The method gets layer data values
-        '''
         blobptr = self.singalayer.data(self.singalayer)
         data_array = floatArray_frompointer(blobptr.mutable_cpu_data())
-        data = [data_array[i] for i in range(blobptr.count())]
+        data = [ data_array[i] for i in range(blobptr.count()) ]
         return data
 
     def display(self):
-        debug, flag = False, 0
+        debug, flag = 0, 0
         print self.singalayer.ToString(debug, flag)
 
     def get_singalayer(self):
@@ -169,18 +174,44 @@ def get_singalayer(self):
 
 class Dummy(object):
 
-    def __init__(self, **kwargs):
-        ''' Dummy layer is used for data layer to feed/fetch input data
-            or label information
+    def __init__(self, shape=[], path='', dtype='', src=[]):
+        ''' Dummy layer is used for data layer
+            shape = (list)   // [# of samples, # of channels, img h, img w]
+            path  = (string) // path to dataset
         '''
         self.is_datalayer = True
-        self.srclayers = None
+        self.srclayers = None 
         self.singalayer = None
 
         # create layer proto for Dummy layer
         kwargs = {'name':'dummy', 'type':kDummy}
         self.layer = Message('Layer', **kwargs).proto
 
+
+        # if dataset path is not specified, skip
+        # otherwise, load dataset
+        if path == '':
+            return
+
+        self.shape = shape
+        self.path = path
+        self.src = None
+        self.batch_index = 0
+
+        import numpy as np
+        nb_samples = shape[0]
+        nb_pixels = shape[1]
+        for i in range(len(shape)-2):
+            nb_pixels *= shape[i+2]  
+        if dtype=='byte': 
+            self.is_label = 0
+            d = np.fromfile(path, dtype=np.uint8)
+        elif dtype=='int': 
+            self.is_label = 1
+            d = np.fromfile(path, dtype=np.int)
+        self.data = d.reshape(nb_samples, nb_pixels)
+
+
     def setup(self, data_shape):
         ''' Create and Setup singa Dummy layer
             called by load_model_parameter
@@ -189,67 +220,216 @@ def setup(self, data_shape):
             setval(self.layer.dummy_conf, input=True)
             setval(self.layer.dummy_conf, shape=data_shape)
             self.singalayer = DummyLayer()
-            self.singalayer.Setup(self.layer.SerializeToString(),
-                                  layerVector(0))
+            self.singalayer.Setup(self.layer.SerializeToString(), layerVector(0))
+
+
+    def FetchData(self, batchsize):
+        ''' Feed the next mini-batch; restart from 0 when no full batch is left '''
+        d = self.data[self.batch_index*batchsize:(self.batch_index+1)*batchsize, :]
+        self.Feed(d, self.shape[1], self.is_label)
+        self.batch_index = (self.batch_index+1) if (self.batch_index+2)*batchsize <= self.data.shape[0] else 0
 
-    def Feed(self, shape, data, aux_data):
+
+    def Feed(self, data, nb_channel=1, is_label=0):
         ''' Create and Setup singa::DummyLayer for input data
             Insert data using Feed()
         '''
-        batchsize = shape[0]
-        hdim = reduce(lambda x, y: x*y, shape[1:])
+
+        batchsize, hdim = data.shape
         datasize = batchsize * hdim
+        imgsize = int(numpy.sqrt(hdim/nb_channel)) 
+        shapeVector = [batchsize, nb_channel, imgsize, imgsize] 
 
         # create and setup the dummy layer
         if self.singalayer == None:
-            self.setup(shape)
+            setval(self.layer.dummy_conf, input=True)
+            setval(self.layer.dummy_conf, shape=shapeVector)
+            self.singalayer = DummyLayer()
+            self.singalayer.Setup(self.layer.SerializeToString(), layerVector(0))
+
+        # feed input data
+        data = data.astype(numpy.float) 
+        dataVector = floatVector(datasize)
+        k = 0
+        for i in range(batchsize):
+            for j in range(hdim):
+                dataVector[k] = data[i,j]
+                k += 1
+        self.singalayer.Feed(shapeVector, dataVector, is_label)
+
+    def get_singalayer(self):
+        return self.singalayer.ToLayer()
+
+class ImageData(object):
+
+    def __init__(self, shape=[], data_path='', data_type='byte',mean_path='',mean_type='float'):
+        ''' Image data layer: feeds mean-subtracted images via a Dummy layer
+            shape = (list)   // [# of samples, # of channels, img h, img w]
+            data_path  = (string) // path to dataset ('byte' or 'int' values)
+            mean_path  = (string) // path to mean image ('float' values)
+        '''
+        self.is_datalayer = True
+        self.srclayers = None
+        self.singalayer = None
+        self.is_label = False
+        # create layer proto for Dummy layer
+        kwargs = {'name':'dummy', 'type':kDummy}
+        self.layer = Message('Layer', **kwargs).proto
+
+        # nothing to load unless both the dataset and the mean file are given
+        if data_path == '' or mean_path=='':
+            return
+
+        self.shape = shape
+        self.data_path = data_path
+        self.mean_path = mean_path
+        self.src = None
+        self.batch_index = 0
+
+        import numpy as np
+        nb_samples = shape[0]
+        nb_pixels = shape[1]
+        for i in range(len(shape)-2):
+            nb_pixels *= shape[i+2]
+        if data_type == 'byte':
+            d = np.fromfile(data_path, dtype=np.uint8)
+        elif data_type == 'int':
+            d = np.fromfile(data_path, dtype=np.int)
+        else:
+            raise ValueError("data_type must be 'byte' or 'int', got %r" % (data_type,))
+        self.data = d.reshape(nb_samples, nb_pixels)
+
+        if mean_type != 'float':
+            raise ValueError("mean_type must be 'float', got %r" % (mean_type,))
+        self.mean = np.fromfile(mean_path, dtype=np.float32).reshape(1, nb_pixels)
+
+    def setup(self, data_shape):
+        ''' Create and Setup singa Dummy layer
+            called by load_model_parameter
+        '''
+        if self.singalayer == None:
+            setval(self.layer.dummy_conf, input=True)
+            setval(self.layer.dummy_conf, shape=data_shape)
+            self.singalayer = DummyLayer()
+            self.singalayer.Setup(self.layer.SerializeToString(), layerVector(0))
+
 
-        if data is not None:
-            data = data.astype(np.float)
-            dataVector = floatVector(datasize)
-            for i in range(batchsize):
-                for j in range(hdim):
-                    dataVector[i*hdim+j] = data[i, j]
-            labelVector = intVector(0)
+    def FetchData(self, batchsize):
 
-        if aux_data is not None:
-            aux_data = aux_data.astype(np.int)
-            labelVector = intVector(datasize)
-            for i in range(batchsize):
-                labelVector[i] = aux_data[i, 0]
-            dataVector = floatVector(0)
+        d = self.data[self.batch_index*batchsize:(self.batch_index+1)*batchsize, :]
+        self.Feed(d, self.shape[1])
+        self.batch_index += 1
+        if (self.batch_index+1)*batchsize>self.data.shape[0]:
+            self.batch_index=0
 
-        self.singalayer.Feed(batchsize, dataVector, labelVector)
+
+
+    def Feed(self, data, nb_channel=1):
+        ''' Create and Setup singa::DummyLayer for input data (on first call)
+            Insert data using Feed()
+            self.mean is subtracted from every row of data before feeding
+        '''
+        batchsize, hdim = data.shape
+        datasize = batchsize * hdim
+        imgsize = int(numpy.sqrt(hdim/nb_channel)) 
+        shapeVector = [batchsize, nb_channel, imgsize, imgsize] 
+        # NOTE: height and width both come from sqrt(hdim/nb_channel), i.e. square images
+        # create and setup the dummy layer
+        if self.singalayer == None:
+            setval(self.layer.dummy_conf, input=True)
+            setval(self.layer.dummy_conf, shape=shapeVector)
+            self.singalayer = DummyLayer()
+            self.singalayer.Setup(self.layer.SerializeToString(), layerVector(0))
+
+        # copy the batch row by row, subtracting the mean element-wise
+        data = data.astype(numpy.float) 
+        dataVector = floatVector(datasize)
+        k = 0
+        for i in range(batchsize):
+            for j in range(hdim):
+                dataVector[k] = data[i,j]-self.mean[0,j]
+                k += 1
+        self.singalayer.Feed(shapeVector, dataVector, 0)
 
     def get_singalayer(self):
         return self.singalayer.ToLayer()
 
-class ImageInput(Dummy):
-    ''' This class is used to feed image data
-    '''
-    def __init__(self, width=None, height=None, nb_channel=1):
-        super(ImageInput, self).__init__()
-        self.width = width
-        self.height = height
-        self.nb_channel = nb_channel
-
-    def Feed(self, image_data):
-        batchsize = image_data.shape[0]
-        if self.width == None or self.height == None:
-            hdim = image_data.shape[1]
-            imgsize = int(np.sqrt(hdim/self.nb_channel))
-        shape = [batchsize, self.nb_channel, self.width, self.height]
-        Dummy.Feed(self, shape, image_data, None)
-
-class LabelInput(Dummy):
-    ''' This class is used to feed label data
-    '''
-    def __init__(self):
-        super(LabelInput, self).__init__()
-
-    def Feed(self, label_data):
-        Dummy.Feed(self, label_data.shape, None, label_data)
 
+class LabelData(object):
+
+    def __init__(self, shape=[], label_path='', label_type='int'):
+        ''' Label data layer: feeds one label per sample via a Dummy layer
+            shape = (list)   // only shape[0], the # of samples, is used
+            label_path = (string) // path to label file
+            label_type = (string) // 'int' is the only supported type
+        '''
+        self.is_datalayer = True
+        self.srclayers = None
+        self.singalayer = None
+        self.is_label = True
+        # create layer proto for Dummy layer
+        kwargs = {'name':'dummy', 'type':kDummy}
+        self.layer = Message('Layer', **kwargs).proto
+
+        # if label path is not specified, skip
+        # otherwise, load labels
+        if label_path == '':
+            return
+
+        self.shape = shape
+        self.label_path = label_path
+        self.src = None
+        self.batch_index = 0
+
+        import numpy as np
+        nb_samples = shape[0]
+        if label_type != 'int':
+            raise ValueError("label_type must be 'int', got %r" % (label_type,))
+        d = np.fromfile(label_path, dtype=np.int)
+        self.data = d.reshape(nb_samples, 1)
+
+    def setup(self, data_shape):
+        ''' Create and Setup singa Dummy layer
+            called by load_model_parameter
+        '''
+        if self.singalayer == None:
+            setval(self.layer.dummy_conf, input=True)
+            setval(self.layer.dummy_conf, shape=data_shape)
+            self.singalayer = DummyLayer()
+            self.singalayer.Setup(self.layer.SerializeToString(), layerVector(0))
+
+
+    def FetchData(self, batchsize):
+        ''' Feed the next mini-batch of labels; restart when no full batch is left '''
+        d = self.data[self.batch_index*batchsize:(self.batch_index+1)*batchsize, :]
+        self.Feed(d)
+        self.batch_index += 1
+        if (self.batch_index+1)*batchsize>self.data.shape[0]:
+            self.batch_index=0
+
+    def Feed(self, data,nb_chanel=1):
+        ''' Create and Setup singa::DummyLayer for label data (on first call)
+            Insert labels using Feed()
+            data is read as data[i,0] per sample; nb_chanel is unused
+        '''
+        batchsize = data.shape[0]
+        shapeVector = [batchsize, 1]
+
+        # create and setup the dummy layer
+        if self.singalayer == None:
+            setval(self.layer.dummy_conf, input=True)
+            setval(self.layer.dummy_conf, shape=shapeVector)
+            self.singalayer = DummyLayer()
+            self.singalayer.Setup(self.layer.SerializeToString(), layerVector(0))
+
+        data = data.astype(numpy.float)
+        dataVector = floatVector(batchsize)
+        for i in range(batchsize):
+            dataVector[i] = data[i,0]
+        self.singalayer.Feed(shapeVector, dataVector, 1)
+
+    def get_singalayer(self):
+        return self.singalayer.ToLayer()
 
 class Data(Layer):
 
@@ -311,7 +491,7 @@ def __init__(self, nb_filter=0, kernel=0, stride=1, pad=0,
                            // scale the learning rate when updating parameters.
             w_wd = (float) // weight decay multiplier for weight, used to
                            // scale the weight decay when updating parameters.
-            b_lr = (float) // learning rate multiplier for bias
+            b_lr = (float) // learning rate multiplier for bias 
             b_wd = (float) // weight decay multiplier for bias
         '''
 
@@ -321,22 +501,22 @@ def __init__(self, nb_filter=0, kernel=0, stride=1, pad=0,
         fields = {"num_filters":nb_filter}
         # for kernel
         if type(kernel) == int:
-            fields['kernel'] = kernel
+          fields['kernel'] = kernel
         else:
-            fields['kernel_x'] = kernel[0]
-            fields['kernel_y'] = kernel[1]
-        # for stride
+          fields['kernel_x'] = kernel[0]
+          fields['kernel_y'] = kernel[1]
+        # for stride 
         if type(stride) == int:
-            fields['stride'] = stride
+          fields['stride'] = stride
         else:
-            fields['stride_x'] = stride[0]
-            fields['stride_y'] = stride[1]
-        # for pad
+          fields['stride_x'] = stride[0]
+          fields['stride_y'] = stride[1]
+        # for pad 
         if type(pad) == int:
-            fields['pad'] = pad
+          fields['pad'] = pad 
         else:
-            fields['pad_x'] = pad[0]
-            fields['pad_y'] = pad[1]
+          fields['pad_x'] = pad[0]
+          fields['pad_y'] = pad[1]
 
         setval(self.layer.convolution_conf, **fields)
 
@@ -381,7 +561,7 @@ def __init__(self, pool_size=None,
                'currently pool size should be square in Singa'
         super(MaxPooling2D, self).__init__(name=generate_name('pool'),
                                            type=kCPooling, **kwargs)
-        fields = {'pool' : PoolingProto().MAX,
+        fields = {'pool' : MAX,
                   'kernel' : pool_size[0],
                   'stride' : stride,
                   'pad' : 0 if ignore_border else 1}
@@ -407,8 +587,8 @@ def __init__(self, pool_size=None,
                'currently pool size should be square in Singa'
         super(AvgPooling2D, self).__init__(name=generate_name('pool'),
                                            type=kCPooling, **kwargs)
-        self.layer.pooling_conf.pool = PoolingProto().AVG
-        fields = {'pool' : PoolingProto().AVG,
+        self.layer.pooling_conf.pool = AVG
+        fields = {'pool' : AVG,
                   'kernel' : pool_size[0],
                   'stride' : stride,
                   'pad' : 0 if ignore_border else 1}
@@ -450,7 +630,7 @@ def __init__(self, activation='stanh', **kwargs):
           activation = (string) // relu, sigmoid, tanh, stanh, softmax.
         '''
         if activation == 'tanh':
-            print 'Warning: Tanh layer is not supported for CPU'
+          print 'Warning: Tanh layer is not supported for CPU'
 
         self.name = activation
         self.layer_type = kActivation
@@ -468,6 +648,7 @@ def __init__(self, activation='stanh', **kwargs):
             self.layer.activation_conf.type = TANH # for GPU
         #elif activation == 'stanh':
         #    self.layer.activation_conf.type = STANH
+        
 
 
 class Dropout(Layer):
@@ -528,7 +709,7 @@ def __init__(self, output_dim=0, activation=None,
                            // scale the learning rate when updating parameters.
             w_wd = (float) // weight decay multiplier for weight, used to
                            // scale the weight decay when updating parameters.
-            b_lr = (float) // learning rate multiplier for bias
+            b_lr = (float) // learning rate multiplier for bias 
             b_wd = (float) // weight decay multiplier for bias
         '''
         # required
@@ -570,7 +751,7 @@ def __init__(self, hid_dim=None, out_dim=0,
         required
           hid_dim     = (int/list) // the number of nodes in hidden layers
           out_dim     = (int)      // the number of nodes in the top layer
-        optional
+        optional 
           activation  = (string)
           param_share = (bool)     // to share params in encoder and decoder
         '''
@@ -609,8 +790,7 @@ def __init__(self, out_dim=None, w_param=None, b_param=None,
         self.name = kwargs['name'] if 'name' in kwargs else 'RBMVis'
         self.layer_type = kwargs['type'] if 'type' in kwargs else kRBMVis
         super(RBM, self).__init__(name=generate_name(self.name,
-                                                     withnumber=False),
-                                  type=self.layer_type, **kwargs)
+                                  withnumber=False), type=self.layer_type, **kwargs)
         setval(self.layer.rbm_conf, hdim=self.out_dim[-1])
         if self.layer_type == kRBMHid and sampling != None:
             if sampling == 'gaussian':
diff --git a/tool/python/singa/utils/imgtool.py b/tool/python/singa/utils/imgtool.py
new file mode 100644
index 0000000000..683e3d3954
--- /dev/null
+++ b/tool/python/singa/utils/imgtool.py
@@ -0,0 +1,345 @@
+'''
+Created on Jan 8, 2016
+@author: aaron
+'''
+from PIL import Image
+import sys, glob, os, random, shutil, time, struct
+from . import kvstore
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../pb2'))
+from common_pb2 import RecordProto
+
+#bytearray to image object
+def toImg(byteArray,size):
+    '''Build an RGB image of the given size from planar R, G, B bytes (inverse of toBin)'''
+    img = Image.new("RGB",size)
+    area = size[0]*size[1]
+    red = byteArray[:area]
+    green = byteArray[area:area*2]
+    blue = byteArray[area*2:]
+    index=0
+    for x in range(0,size[0]):
+        for y in range(0,size[1]):
+            img.putpixel((x,y), (red[index],green[index],blue[index]))
+            index+=1
+    return img
+
+# image object to bytearray
+def toBin(im,size):
+    '''Flatten an image into a bytearray: all red values, then green, then blue'''
+    pix = im.load()
+    width, height = size[0], size[1]
+    red, green, blue = [], [], []
+    # x is the outer loop, the same pixel order toImg reads back
+    for x in range(width):
+        for y in range(height):
+            rgb = pix[x,y]
+            red.append(rgb[0])
+            green.append(rgb[1])
+            blue.append(rgb[2])
+    return bytearray(red+green+blue)
+
+def resize_to_center(im,size):
+    '''Take the largest centered square of im and transform it to size'''
+    width, height = im.size[0], im.size[1]
+    if width < height:
+        # taller than wide: keep the full width, trim top and bottom
+        box = (0,(height-width)/2,width,(height+width)/2)
+    else :
+        # wider than tall (or square): keep the full height, trim the sides
+        box = ((width-height)/2,0,(width+height)/2,height)
+    return im.transform(size,Image.EXTENT,box)
+#transfer, resize img. only deal with .jpg file
+def transform_img(
+            input_folder,
+            output_folder,
+            size
+                     ):
+    '''Center-crop and resize every <input_folder>/<dir>/*.jpg into
+    <output_folder>/<dir>/<name>.center.jpg; return the number of images.'''
+    print "Transfer images begin at:"+time.strftime('%X %x %Z')
+    #if output_folder exists, empty it, otherwise create a dir
+    if os.path.isdir(output_folder):
+        for entry in os.listdir(output_folder):
+            target = os.path.join(output_folder, entry)
+            if os.path.isdir(target) and not os.path.islink(target):
+                shutil.rmtree(target)
+            else:
+                os.unlink(target)
+    else:
+        os.makedirs(output_folder)
+
+    count=0
+    # immediate sub-folders only: a recursive walk mis-joins nested folder names
+    for d in sorted(os.listdir(input_folder)):
+        if not os.path.isdir(os.path.join(input_folder,d)):
+            continue
+        print "find dir:", d
+        os.makedirs(os.path.join(output_folder,d))
+        for infile in glob.glob(os.path.join(input_folder,d,"*.jpg")):
+            name = os.path.splitext(os.path.basename(infile))[0]
+            newIm=resize_to_center(Image.open(infile).convert("RGB"),size)
+            newIm.save(os.path.join(output_folder,d,name+".center.jpg"), "JPEG")
+            count+=1
+
+    print "transfer end at:"+time.strftime('%X %x %Z')
+    print "total file number: ", count
+    return count
+
+               
+
+def generate_bin_data(
+                input_folder,
+                output_folder,
+                size ,
+                train_num,
+                test_num,
+                validate_num,
+                meta_file_name="meta.txt",
+                train_bin_file_name="train.bin",
+                train_label_bin_file_name="train.label.bin",
+                test_bin_file_name="test.bin",
+                test_label_bin_file_name="test.label.bin",
+                validate_bin_file_name="validate.bin",
+                validate_label_bin_file_name="validate.label.bin",
+                mean_bin_file_name="mean.bin",
+                label_bin_file_name="label.bin",
+                      ):
+    '''Shuffle <input_folder>/<label dir>/*.jpg and split them into raw binary
+    train/validate/test sets under output_folder.
+
+    Each image must already have size `size`; its pixels are written with
+    toBin and its label with kvstore.i2b. The per-position mean of the train
+    pixels is written as floats, and a summary goes to the meta file.
+    label_bin_file_name is accepted but not used.
+    Returns the list of (label, folder name) pairs.
+    '''
+
+    if not os.path.isdir(output_folder):
+        os.makedirs(output_folder)
+
+    print "Generate bin start at: "+time.strftime('%X %x %Z')
+    meta_file = open(os.path.join(output_folder,meta_file_name), "w")
+
+    fileList=[]
+    labelList= []
+    label=0 #labels are numbered from 0, one per sub-folder
+
+    #get all img file, the folder name is the label name
+    for d in os.listdir(input_folder):
+        if os.path.isdir(os.path.join(input_folder,d)):
+            labelList.append((label,d))
+            for f in glob.glob(os.path.join(input_folder,d,"*.jpg")):
+                fileList.append((label,f))
+            label += 1
+
+    # shuffle all the files
+    random.shuffle(fileList)
+
+    total = len(fileList)
+    print total,train_num,test_num,validate_num
+    assert total >= train_num+test_num+validate_num
+
+    train_file = open(os.path.join(output_folder,train_bin_file_name),"wb")
+    train_label_file = open(os.path.join(output_folder,train_label_bin_file_name),"wb")
+    validate_file = open(os.path.join(output_folder,validate_bin_file_name),"wb")
+    validate_label_file = open(os.path.join(output_folder,validate_label_bin_file_name),"wb")
+    test_file = open(os.path.join(output_folder,test_bin_file_name),"wb")
+    test_label_file = open(os.path.join(output_folder,test_label_bin_file_name),"wb")
+    mean_file = open(os.path.join(output_folder,mean_bin_file_name),"wb")
+
+    count=0
+    trainCount=0
+    validateCount=0
+    testCount=0
+
+    # the expected image binary length
+    binaryLength=3*size[0]*size[1]
+
+    meanData=[0.0]*binaryLength
+
+    # only the first train+validate+test files are used; slicing keeps count
+    # equal to the number of files actually written
+    for (label,f) in fileList[:train_num+validate_num+test_num]:
+        count+=1
+        im =Image.open(f)
+        #the image size should be equal
+        assert im.size==tuple(size)
+        binaryPixel=toBin(im,size)
+        if count <= train_num :
+            trainCount+=1
+            train_file.write(binaryPixel)
+            train_label_file.write(kvstore.i2b(label))
+            #only calculate train data's mean value
+            for i in range(binaryLength):
+                meanData[i]+=binaryPixel[i]
+        elif count <= train_num+validate_num :
+            validateCount+=1
+            validate_label_file.write(kvstore.i2b(label))
+            validate_file.write(binaryPixel)
+        else:
+            testCount+=1
+            test_label_file.write(kvstore.i2b(label))
+            test_file.write(binaryPixel)
+
+    # with no train images the mean stays all zeros (avoids division by zero)
+    if trainCount > 0:
+        for i in range(binaryLength):
+            meanData[i]/=trainCount
+
+    meanBinary=struct.pack("%sf" % binaryLength, *meanData)
+
+    mean_file.write(meanBinary)
+    # close() also flushes; every output, including the label files, is closed
+    for out in (mean_file, train_file, train_label_file, validate_file,
+                validate_label_file, test_file, test_label_file):
+        out.close()
+
+    meta_file.write("image size: "+str(size[0])+"*"+str(size[1])+"\n")
+    meta_file.write("total file num: "+str(count)+"\n")
+    meta_file.write("train file num: "+str(trainCount)+"\n")
+    meta_file.write("validate file num: "+str(validateCount)+"\n")
+    meta_file.write("test file num: "+str(testCount)+"\n")
+    meta_file.write("label list:[\n")
+
+    for item in labelList:
+        meta_file.write("("+str(item[0])+",\""+item[1]+"\"),\n")
+    meta_file.write("]")
+    meta_file.flush()
+    meta_file.close()
+
+    print "end at: "+time.strftime('%X %x %Z')
+
+    return labelList
+
+
+def generate_kvrecord_data(
+                input_folder,
+                output_folder,
+                size ,
+                train_num,
+                test_num,
+                validate_num,
+                meta_file_name="meta.txt",
+                train_bin_file_name="train.bin",
+                test_bin_file_name="test.bin",
+                validate_bin_file_name="validate.bin",
+                mean_bin_file_name="mean.bin",
+                      ):
+    '''Shuffle <input_folder>/<label dir>/*.jpg and split them into
+    train/validate/test kvstore.FileStore files under output_folder.
+    Each image must already have size `size` and becomes one serialized
+    RecordProto (shape, label, pixel) keyed by a zero-padded running index.
+    The per-position mean of the train pixels is stored as a RecordProto
+    under the key "mean"; a summary goes to the meta file.
+    Returns the list of (label, folder name) pairs.
+    '''
+    if not os.path.isdir(output_folder):
+        os.makedirs(output_folder)
+
+    print "Generate kvrecord start at: "+time.strftime('%X %x %Z')
+    meta_file = open(os.path.join(output_folder,meta_file_name), "w")
+
+    fileList=[]
+    labelList= []
+    label=0 #labels are numbered from 0, one per sub-folder
+
+    #get all img file, the folder name is the label name
+    for d in os.listdir(input_folder):
+        if os.path.isdir(os.path.join(input_folder,d)):
+            labelList.append((label,d))
+            for f in glob.glob(os.path.join(input_folder,d,"*.jpg")):
+                fileList.append((label,f))
+            label += 1
+
+    # shuffle all the files
+    random.shuffle(fileList)
+
+    total = len(fileList)
+    print total,train_num,test_num,validate_num
+    assert total >= train_num+test_num+validate_num
+
+    trainStore = kvstore.FileStore()
+    trainStore.open(os.path.join(output_folder,train_bin_file_name), "create")
+    validateStore = kvstore.FileStore()
+    validateStore.open(os.path.join(output_folder,validate_bin_file_name), "create")
+    testStore = kvstore.FileStore()
+    testStore.open(os.path.join(output_folder,test_bin_file_name), "create")
+    meanStore = kvstore.FileStore()
+    meanStore.open(os.path.join(output_folder,mean_bin_file_name), "create")
+
+    count=0
+    trainCount=0
+    validateCount=0
+    testCount=0
+
+    # the expected image binary length
+    binaryLength=3*size[0]*size[1]
+
+    meanRecord = RecordProto()
+    meanRecord.shape.extend([3,size[0],size[1]])
+    for i in range(binaryLength):
+        meanRecord.data.append(0.0)
+
+    # only the first train+validate+test files are used; slicing keeps count
+    # equal to the number of records actually written
+    for (label,f) in fileList[:train_num+validate_num+test_num]:
+        im =Image.open(f)
+        #the image size should be equal
+        assert im.size==tuple(size)
+        binaryContent=str(toBin(im,size))
+        count +=1
+        record = RecordProto()
+        record.shape.extend([3,size[0],size[1]])
+        record.label=label
+        record.pixel=binaryContent
+        value = record.SerializeToString()
+
+        if count <= train_num :
+            key = "%05d" % trainCount
+            trainCount+=1
+            trainStore.write(key,value)
+            #only calculate train data's mean
+            for i in range(binaryLength):
+                meanRecord.data[i]+=ord(binaryContent[i])
+        elif count <= train_num+validate_num :
+            key = "%05d" % validateCount
+            validateCount+=1
+            validateStore.write(key,value)
+        else:
+            key = "%05d" % testCount
+            testCount+=1
+            testStore.write(key,value)
+
+    # with no train images the mean stays all zeros (avoids division by zero)
+    if trainCount > 0:
+        for i in range(binaryLength):
+            meanRecord.data[i]/=trainCount
+
+    meanStore.write("mean", meanRecord.SerializeToString())
+    meanStore.flush()
+    meanStore.close()
+
+    trainStore.flush()
+    trainStore.close()
+    validateStore.flush()
+    validateStore.close()
+    testStore.flush()
+    testStore.close()
+
+    meta_file.write("image size: "+str(size[0])+"*"+str(size[1])+"\n")
+    meta_file.write("total file num: "+str(count)+"\n")
+    meta_file.write("train file num: "+str(trainCount)+"\n")
+    meta_file.write("validate file num: "+str(validateCount)+"\n")
+    meta_file.write("test file num: "+str(testCount)+"\n")
+    meta_file.write("label list:[\n")
+
+    for item in labelList:
+        meta_file.write("("+str(item[0])+",\""+item[1]+"\"),\n")
+    meta_file.write("]")
+    meta_file.flush()
+    meta_file.close()
+
+    print "end at: "+time.strftime('%X %x %Z')
+
+    return labelList
diff --git a/tool/python/singa/utils/kvstore.py b/tool/python/singa/utils/kvstore.py
new file mode 100644
index 0000000000..7fe16e019d
--- /dev/null
+++ b/tool/python/singa/utils/kvstore.py
@@ -0,0 +1,90 @@
+'''
+Created on Jan 8, 2016
+
+@author: aaron
+'''
+import struct, os 
+
+INT_LEN=8
+
+class FileStore():
+    '''
+    Sequential key/value file store. Each record is laid out as
+    [key length][key][value length][value]; lengths are encoded by i2b.
+    '''
+    _MODES = {"create": "wb", "append": "ab", "read": "rb"}
+
+    def __init__(self):
+        # no file is attached until open() is called
+        self._file = None
+
+    def open(self,src_path,mode):
+        '''Open src_path in mode "create", "append" or "read"; return self.'''
+        if mode not in self._MODES:
+            raise ValueError("unknown FileStore mode: %r" % (mode,))
+        self._file = open(src_path, self._MODES[mode])
+        return self
+
+    def close(self):
+        '''Close the underlying file.'''
+        self._file.close()
+
+    def read(self):
+        '''Return the next (key, value); b2i raises struct.error at end of file.'''
+        keyLen=b2i(self._file.read(INT_LEN))
+        key=str(self._file.read(keyLen))
+        valueLen=b2i(self._file.read(INT_LEN))
+        value=str(self._file.read(valueLen))
+        return key,value
+
+    def seekToFirst(self):
+        '''Rewind to the first record.'''
+        self._file.seek(0)
+
+    def seek(self,offset):
+        '''Jump to a raw byte offset. Discouraged: read() only makes sense
+        when the position is at the start of a record.'''
+        self._file.seek(offset)
+
+    def write(self,key,value):
+        '''Write one record at the current file position.'''
+        self._file.write(i2b(len(key))+key+i2b(len(value))+value)
+
+    def flush(self):
+        '''Flush buffered writes to the underlying file.'''
+        self._file.flush()
+#integer to binary: "<Q" is a little-endian unsigned long long, 8 bytes
+def i2b(i):
+    return struct.pack("<Q",i)
+#binary to integer: inverse of i2b, b must be exactly 8 bytes
+def b2i(b):
+    return struct.unpack("<Q",b)[0]
+
+if __name__=='__main__':
+    store=FileStore()
+    store.open("test","create")
+    store.write("Hello","world!")
+    store.flush()
+    store.close()
+    # read back the single record written above
+    store.open("test","read")
+    key,value=store.read()
+    print key, value
+    store.close()
+    # append a second record to the same file
+    store.open("test","append")
+    store.write("Foo","Bar")
+    store.flush()
+    store.close()
+    # read both records in order, then rewind and read the first one again
+    store.open("test","read")
+    key,value=store.read()
+    print key,value
+    key,value=store.read()
+    print key,value
+    store.seekToFirst()
+    key,value=store.read()
+    print key,value
+    store.close()
+    # remove the scratch file created in the working directory
+    os.remove("test")
diff --git a/tool/python/singa/utils/utility.py b/tool/python/singa/utils/utility.py
index b88720cac0..459b593119 100644
--- a/tool/python/singa/utils/utility.py
+++ b/tool/python/singa/utils/utility.py
@@ -84,3 +84,13 @@ def setval(proto, **kwargs):
                 else:
                     setattr(proto, key, val)
 
+def swap32(x):
+    '''Reverse the byte order of the low 32 bits of x'''
+    b0, b1 = x & 0xFF, (x >> 8) & 0xFF
+    b2, b3 = (x >> 16) & 0xFF, (x >> 24) & 0xFF
+    return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
+
+def blob_to_numpy(blob):
+    '''TODO: not implemented yet; the body is a bare pass.
+    Intended to convert blob data into a numpy array.'''
+    pass