diff --git a/code/cuda/RC-FINAL-5/Makefile b/code/cuda/RC-FINAL-5/Makefile
new file mode 100644
index 0000000..814bc14
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/Makefile
@@ -0,0 +1,73 @@
+CC = g++
+NVCC = nvcc
+LIBS = -L/usr/local/cuda-8.0/lib64 -lm -lz -lcuda -lcudart -lcublas -lcusparse -lcurand -lpthread -m64
+NVCCFLAGS = -I. -I/usr/local/cuda-8.0/include -arch=sm_60
+CFLAGS = -I. -I/usr/local/cuda-8.0/include -Wall -funroll-loops -fstrict-aliasing -O3
+DEBUGFLAGS = -D__debug__ -D__STATISTICS__
+
+DEFS = $(CFLAGS) $(DEBUGFLAGS)
+NVCCDEFS = $(NVCCFLAGS) $(DEBUGFLAGS)
+FLAG = $(DEFS) $(INCS) $(LIBS)
+NVCCFLAG = $(NVCCDEFS) $(LIBS)
+
+OBJ = utils.o dataset.o cuda_utils.o logistic_fn_indicator.o \
+	mat_functions.o cuda_environment.o linesearch.o \
+	conjugate_gradient.o newton_cg.o newton-driver.o print_utils.o \
+	softmax_multiclass.o gen_random.o sparse_dataset.o \
+	subsampling_helpers.o classification_kernels.o
+
+all: beta
+beta: $(OBJ) Makefile
+	$(NVCC) $(OBJ) -o NewtonCGSolver $(NVCCFLAG)
+
+utils.o: utils.c utils.h
+	$(CC) $(DEFS) -c utils.c
+
+dataset.o: dataset.c dataset.h
+	$(CC) $(DEFS) -c dataset.c
+
+cuda_utils.o: cuda_utils.c cuda_utils.h
+	$(CC) $(DEFS) -c cuda_utils.c
+
+logistic_fn_indicator.o: logistic_fn_indicator.cu logistic_fn_indicator.h
+	$(NVCC) $(NVCCDEFS) -c logistic_fn_indicator.cu
+
+mat_functions.o: mat_functions.cu mat_functions.h
+	$(NVCC) $(NVCCDEFS) -c mat_functions.cu
+
+cuda_environment.o: cuda_environment.c cuda_environment.h
+	$(CC) $(DEFS) -c cuda_environment.c
+
+linesearch.o: linesearch.c linesearch.h
+	$(CC) $(DEFS) -c linesearch.c
+
+conjugate_gradient.o: conjugate_gradient.c conjugate_gradient.h
+	$(CC) $(DEFS) -c conjugate_gradient.c
+
+newton_cg.o: newton_cg.c newton_cg.h
+	$(CC) $(DEFS) -c newton_cg.c
+
+newton-driver.o: newton-driver.c
+	$(CC) $(DEFS) -c newton-driver.c
+
+print_utils.o: print_utils.c
+	$(CC) $(DEFS) -c print_utils.c
+
+softmax_multiclass.o: softmax_multiclass.cu softmax_multiclass.h
+	$(NVCC) $(NVCCDEFS) -c softmax_multiclass.cu
+
+sparse_dataset.o: sparse_dataset.cu sparse_dataset.h
+	$(NVCC) $(NVCCDEFS) -c sparse_dataset.cu
+
+gen_random.o: gen_random.cu gen_random.h
+	$(NVCC) $(NVCCDEFS) -c gen_random.cu
+
+subsampling_helpers.o: subsampling_helpers.cu subsampling_helpers.h
+	$(NVCC) $(NVCCDEFS) -c subsampling_helpers.cu
+
+classification_kernels.o: classification_kernels.cu classification_kernels.h
+	$(NVCC) $(NVCCDEFS) -c classification_kernels.cu
+
+clean:
+	rm -f *.o *~ core
+	rm -f NewtonCGSolver
diff --git a/code/cuda/RC-FINAL-5/classification_kernels.cu b/code/cuda/RC-FINAL-5/classification_kernels.cu
new file mode 100644
index 0000000..2828321
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/classification_kernels.cu
@@ -0,0 +1,143 @@
+
+#include "classification_kernels.h"
+
+__device__ __inline__ double my_shfl(double x, int lane)
+{
+	// Split the double number into 2 32b registers.
+	int lo, hi;
+	asm volatile( "mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(x));
+
+	// Shuffle the two 32b registers.
+	lo = __shfl_xor(lo, lane);
+	hi = __shfl_xor(hi, lane);
+
+	// Recreate the 64b number.
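+	// __shfl_xor() only moves 32-bit values, so the double is exchanged
+	// as two halves and reassembled below with __hiloint2double().
+	// (Pre-CUDA-9 intrinsic; CUDA 9+ toolkits would use __shfl_xor_sync.)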
+	//asm volatile( "mov.b64 %0, {%1,%2};" : "=d(x)" : "r"(lo), "r"(hi));
+	//return x;
+	return __hiloint2double( hi, lo);
+}
+
+// Butterfly reduction: after log2(WARP_SIZE) shuffle steps every lane
+// holds the warp-wide sum.
+__device__ __inline__ double warpSum( double x )
+{
+	for (int offset = WARP_SIZE/2; offset > 0; offset /= 2)
+		x += my_shfl( x, offset);
+	return x;
+}
+
+// Block-level sum reduction: each warp reduces via shuffles, stages its
+// partial sum in shared memory, and warp 0 combines the per-warp partials.
+GLOBAL void reduce(const real *input, real *results, const size_t count) {
+	extern __shared__ real my_results[];
+	unsigned int warpId = threadIdx.x >> 5;	// warp index within the block
+	unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+	real sdata;
+	real x = 0;
+
+	sdata = 0;
+	my_results[ warpId ] = 0;
+	if(idx < count) x = input [idx];
+	sdata = x;
+
+	sdata = warpSum ( sdata );
+	if (threadIdx.x % WARP_SIZE == 0) my_results[warpId] = sdata;
+	__syncthreads ();
+
+	if (blockDim.x/WARP_SIZE == 0)
+		sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0;
+	else
+		sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0;
+	__syncthreads ();
+
+	if (warpId == 0) sdata = warpSum( sdata );
+	if(threadIdx.x == 0) results [ blockIdx.x ] = sdata;
+}
+
+
+GLOBAL void ker_init_scaleTerms ( real *scaleTerms, int sampleSize, real *probs, int *indices )
+{
+	int myRowId = blockIdx.x * blockDim.x + threadIdx.x;
+	if (myRowId < sampleSize){
+		scaleTerms[ myRowId ] = probs[ indices[ myRowId ] ] ;
+	}
+}
+
+
+GLOBAL void ker_compute_probs( real *probs, int rows, int sampleSize, real *randVec, real *indices)
+{
+	int myRowId = blockIdx.x * blockDim.x + threadIdx.x;
+	if (myRowId < rows ){
+		probs[ myRowId ] *= sampleSize;
+		if (probs[ myRowId ] > 1.0) probs[ myRowId ] = 1.0;
+
+		if (randVec[ myRowId ] < probs[ myRowId ] )
+			indices[ myRowId ] = 1;
+		else
+			indices[ myRowId ] = 0;
+	}
+}
+
+GLOBAL void ker_compute_dHXW_nrm_log (real *dHXW, real *rowNrms, int rows)
+{
+	int myRowId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if (myRowId < rows) {
+		dHXW[ myRowId ] = fabs( dHXW[ myRowId ] * (1.
- dHXW[ myRowId ]) ) * rowNrms[ myRowId ]; + } +} + + +GLOBAL void ker_normalize (real *dHXW, int rows, real *nrmConstant, real *probs ){ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < rows){ + probs[ myRowId ] = dHXW[ myRowId ] / nrmConstant[0]; + } +} + +GLOBAL void ker_row_norms( real *features, int rows, int cols, real *nrm ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + real sum = 0; + + if (myRowId < rows) { + i = myRowId; + for (int j = 0; j < cols; j += 1) + sum += pow( features[ j * rows + i ], 2.); + + nrm[ i ] = sqrt( sum ); + } +} + +GLOBAL void ker_sqr_elements ( real *ptr, int nnz, int elems_per_thread, real *results ) +{ + int myID = blockIdx.x * blockDim.x + threadIdx.x ; + int i = 0; + + if (myID < nnz) { + i = myID; + ptr[ i ] *= ptr[ i ]; + } + +} + + +GLOBAL void ker_sqrt_elements (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = sqrt( ptr[ i ] ); + } +} + +GLOBAL void ker_init_ones (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = 1.0; + } +} diff --git a/code/cuda/RC-FINAL-5/classification_kernels.h b/code/cuda/RC-FINAL-5/classification_kernels.h new file mode 100644 index 0000000..488fb09 --- /dev/null +++ b/code/cuda/RC-FINAL-5/classification_kernels.h @@ -0,0 +1,20 @@ +#ifndef __H_CLASSIFICATION_KERNELS__ +#define __H_CLASSIFICATION_KERNELS__ + +#include "cuda_types.h" + +__device__ __inline__ double my_shfl(double x, int lane); +__device__ __inline__ double warpSum( double x ); + +GLOBAL void reduce(const real *input, real *results, const size_t count) ; +GLOBAL void ker_init_scaleTerms ( real *scaleTerms, int sampleSize, real *probs, int *indices ); +GLOBAL void ker_compute_probs( real *probs, int rows, int sampleSize, real *randVec, real *indices); +GLOBAL void ker_compute_dHXW_nrm_log (real *dHXW, real *rowNrms, int rows); +GLOBAL void ker_normalize (real *dHXW, int rows, real *nrmConstant, real *probs ); +GLOBAL void ker_row_norms( real *features, int rows, int cols, real *nrm ); +GLOBAL void ker_sqr_elements ( real *ptr, int nnz, int elems_per_thread, real *results ); +GLOBAL void ker_sqrt_elements (real *ptr, int count ); +GLOBAL void ker_init_ones (real *ptr, int count ); + + +#endif diff --git a/code/cuda/RC-FINAL-5/conjugate_gradient.c b/code/cuda/RC-FINAL-5/conjugate_gradient.c new file mode 100644 index 0000000..982cb2b --- /dev/null +++ b/code/cuda/RC-FINAL-5/conjugate_gradient.c @@ -0,0 +1,309 @@ +#include "cuda_types.h" +#include "conjugate_gradient.h" +#include "cuda_utils.h" +#include "print_utils.h" + +#include "softmax_multiclass.h" +#include "logistic_fn_indicator.h" + +#include "float.h" +#include "time.h" +#include "stdlib.h" +#include "subsampling_helpers.h" +#include "sparse_dataset.h" + +int Cublas_CG_Logistic( DeviceDataset *data, NEWTON_CG_PARAMS *params, + real *p_gradient, real *x, real *x_best, real *rel_residual, + real *devPtr, real *hostPtr, real *pgeLckPtr) +{ + real *Hg, *residual, *p, *gradient; + real *rsold, *rsnew, *alpha, *tmp; + real *nextHostPtr, *nextDevPtr; + real gradient_norm; + real best_rel_residual; + + real *B, *probs, *scaleTerms, *rowNrms; + int *selIndices; + + int i; + + Hg = devPtr; + residual = Hg + data->cols; + p = residual + data->cols; + gradient = p + data->cols; + + B = gradient + data->cols; + probs = B + data->rows; + scaleTerms = probs + 
data->rows;
+	rowNrms = scaleTerms + data->rows;
+	nextDevPtr = rowNrms + data->rows;
+
+
+	rsold = pgeLckPtr;
+	rsnew = &pgeLckPtr[1];
+	alpha = &pgeLckPtr[2];
+	tmp = &pgeLckPtr[3];
+
+	selIndices = (int *)hostPtr;
+	nextHostPtr = hostPtr + data->rows;
+
+
+	//Perform the sampling for Hessian here.
+	if (params->hx_sampling >= 1) {
+
+		data->hessianSampleSize = (HESSIAN_SAMPLING_SIZE * data->rows) / 100;
+
+		prepareForSampling( &data->spHessianSample, NULL, NULL, data->rows, data->hessianSampleSize, (int *)nextHostPtr );
+		data->spHessianSample.nnz = data->hessianSampleSize;
+
+		//sample Hessian Here.
+		if (data->spTrain.valPtr == NULL) {
+			//Dense Case
+			convertHessianSampleToCSR( &data->spHessianSample, data->hessianSampleSize, data->cols, nextDevPtr );
+			sampleDataset (&data->spHessianSample, data->trainSet, data->rows, data->cols, data->numclasses, data->sampledHessianTrainSet, data->hessianSampleSize);
+		} else {
+			//Sparse Case
+			convertHessianSampleToCSR( &data->spHessianSample, data->hessianSampleSize, data->cols, nextDevPtr );
+			sampleSparseDataset( &data->spHessianSample, &data->spTrain, data->rows, data->cols, data->numclasses,
+					&data->spSampledHessianTrain, data->hessianSampleSize );
+		}
+
+		logistic_fn_indicator_hx_matvec( data->sampledHessianTrainSet, &data->spSampledHessianTrain, data->weights, x, params->lambda, data->hessianSampleSize, data->cols, Hg, nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+	}
+	else {
+		logistic_fn_indicator_hx_matvec( data->trainSet, &data->spTrain, data->weights, x, params->lambda, data->rows, data->cols, Hg, nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+	}
+
+	//b = -gradient: CG solves the Newton system H * d = -gradient.
+	*alpha = -1;
+	cublasCheckError (cublasDcopy( cublasHandle, data->cols, p_gradient, 1, gradient, 1) );
+	cublasCheckError (cublasDscal( cublasHandle, data->cols, alpha, gradient, 1) );
+
+
+	// residual = b - H*x (Hg holds H*x for the initial iterate)
+	cublasCheckError (cublasDcopy( cublasHandle, data->cols, gradient, 1, residual, 1) );
+	*alpha = -1;
+	cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, Hg, 1, residual, 1 ) );
+
+	//p = residual;
+	cublasCheckError (cublasDcopy( cublasHandle, data->cols, residual, 1, p, 1) );
+
+	//rsold = Dot( residual, residual, N );
+	cublasCheckError (cublasDdot( cublasHandle, data->cols, residual, 1, residual, 1, rsold ) );
+
+	cublasCheckError( cublasDnrm2( cublasHandle, data->cols, gradient, 1, &gradient_norm) );
+	best_rel_residual = SQRT( *rsold ) / gradient_norm;
+	cudaMemcpy( x_best, x, data->cols * sizeof(real), cudaMemcpyDeviceToDevice );
+
+	for( i = 0; i < params->max_cg_iterations; ++i ) {
+		//hessian vec here
+		if (params->hx_sampling > 0) {
+			logistic_fn_indicator_hx_matvec( data->sampledHessianTrainSet, &data->spSampledHessianTrain,
+					data->weights, p, params->lambda, data->hessianSampleSize, data->cols, Hg,
+					nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+		} else {
+			logistic_fn_indicator_hx_matvec( data->trainSet, &data->spTrain,
+					data->weights, p, params->lambda, data->rows, data->cols, Hg,
+					nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+		}
+
+		//tmp = p' * (H * p)
+		cublasCheckError (cublasDdot( cublasHandle, data->cols, Hg, 1, p, 1, tmp ) );
+		*alpha = -1.
* ((*rsold) / (*tmp)); + + //Vector_Add( residual, -alpha, Hg, N ); //residual = residual - alpha * Hg + cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, Hg, 1, residual, 1 ) ); + + *alpha *= -1.; + //Vector_Add( x, alpha, p ); x = x + alpha * p + cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, p, 1, x, 1 ) ); + + //rsnew = Dot (residual, residual); + cublasCheckError (cublasDdot( cublasHandle, data->cols, residual, 1, residual, 1, rsnew ) ); + + *rel_residual = SQRT( *rsnew ) / gradient_norm; + + if (*rel_residual < best_rel_residual) { + best_rel_residual = *rel_residual; + cudaMemcpy( x_best, x, data->cols * sizeof(real), cudaMemcpyDeviceToDevice ); + } + if (*rel_residual <= params->cg_tolerance) break; + + //p = residual + (rsnew / rsold) * p; + *alpha = (*rsnew/(*rsold)); + cublasCheckError (cublasDscal( cublasHandle, data->cols, alpha, p, 1) ); + + *alpha = 1; + cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, residual, 1, p, 1 ) ); + *rsold = *rsnew; + } + + *rel_residual = best_rel_residual; + + return i; +} + +int Cublas_CG_multi_optimized(SparseDataset *spfeatures, real *features, real *g, real *weights, + real *x, real *x_best, real lambda, int rows, int cols, int numclasses, real *HXW, + real *devPtr, real *hostPtr, real *pgeLckPtr, int MAX_ITERATIONS, + real tolerance, real *rel_residual, real *best_rel_residual, + SparseDataset *spSampledHessian, real *sampledHessian, + SparseDataset *spSampledHessianTrainSet, int hessianSampleSize, int samplingType ) +{ + + //CG local's Here + real *p, *r, *h, *alpha, *pAp; + real rnorm, gradient_norm, tol2, delta, bb, prev_delta; + int iter; + + //Other Locals Here + real *Hg, *B; + real *nextDevPtr, *nextHostPtr, *nextPageLckPtr; + + int *selIndices, nonUniformSampleSize, sampleSize; + real *rowNrms, *probs, *scaleTerms; + + //Device Pointers + Hg = devPtr; + r = Hg + numclasses * cols; + p = r + numclasses * cols; + B = p + numclasses * cols; + probs = B + rows * numclasses; + scaleTerms = probs + rows; + rowNrms = scaleTerms + rows; + h = rowNrms + numclasses * cols; + nextDevPtr = h + rows; + + //PageLock Pointers + alpha = &pgeLckPtr[0]; + pAp = &pgeLckPtr[1]; + nextPageLckPtr = pAp + 1; + + //Host Only Pointers + selIndices = (int *)hostPtr; + nextHostPtr = hostPtr + rows; + + + //Initializations here. + sampleSize = hessianSampleSize; + + if (samplingType >= 1) { + + if (samplingType == 1) { + sampleSize = hessianSampleSize; + prepareForSampling( spSampledHessian, NULL, NULL, rows, sampleSize, (int *)nextHostPtr ); + } else { + computeHXW( spfeatures, features, rows, cols, numclasses, weights, B, 0 ); + computeRowNorms( spfeatures, features, rows, cols, rowNrms, nextDevPtr ); + computeRowProbabilities( spfeatures, features, rows, cols, numclasses, B, rowNrms, probs, nextDevPtr ); + nonUniformSampleSize = generateNonUniformSample( probs, scaleTerms, rows, hessianSampleSize, selIndices, nextDevPtr, nextHostPtr ); + + sampleSize = nonUniformSampleSize; + prepareForNonUniformSampling( spSampledHessian, sampleSize, selIndices ); + } + spSampledHessian->nnz = sampleSize; + + //sample Hessian Here. 
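+		// Dense features: gather the sampled rows into sampledHessian.
+		// Sparse features: extract the sampled rows into a separate CSR matrix.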
+ if (features) { + convertHessianSampleToCSR( spSampledHessian, sampleSize, cols, nextDevPtr ); + sampleDataset (spSampledHessian, features, rows, cols, numclasses, sampledHessian, sampleSize ); + } else { + convertHessianSampleToCSR( spSampledHessian, sampleSize, cols, nextDevPtr ); + sampleSparseDataset( spSampledHessian, spfeatures, rows, cols, numclasses, + spSampledHessianTrainSet, sampleSize ); + } + + softmax_multiclass_hx_subsampled(spfeatures, features, rows, cols, numclasses, + weights, x, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW, + spSampledHessian, sampledHessian, spSampledHessianTrainSet, sampleSize, scaleTerms, samplingType ); + } + else { + softmax_multiclass_hx_optimized(spfeatures, features, rows, cols, numclasses, + weights, x, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW ); + } + + + //tol2 = tol^2 + tol2 = pow( tolerance, 2. ); + + // r = g - H*g; + cublasCheckError (cublasDcopy( cublasHandle, numclasses * cols, g, 1, r, 1) ); + *alpha = -1; + cublasCheckError (cublasDaxpy( cublasHandle, numclasses * cols, alpha, Hg, 1, r, 1 ) ); + + //h = Precondition( P, r) + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, r, 1, h, 1) ); + + //delta = r' * h + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, r, 1, h, 1, &delta ) ); + + //bb = b' * Preconditioned( P, b) + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, g, 1, g, 1, &bb ) ); + + //p = r; + cublasCheckError (cublasDcopy( cublasHandle, numclasses * cols, r, 1, p, 1) ); + + //Store the best result to return + *best_rel_residual = DBL_MAX; + cudaMemcpy( x_best, x, numclasses * cols * sizeof(real), cudaMemcpyDeviceToDevice ); + + iter = 0; + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, g, 1, &gradient_norm) ); + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, r, 1, &rnorm) ); + *rel_residual = rnorm / gradient_norm; + + while ( (delta > tol2 * bb) && (iter < MAX_ITERATIONS) && (*rel_residual > tolerance) ) { + + if (samplingType != 0) { + softmax_multiclass_hx_subsampled(spfeatures, features, rows, cols, numclasses, + weights, p, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW, + spSampledHessian, sampledHessian, spSampledHessianTrainSet, sampleSize, scaleTerms, samplingType ); + } + else { + softmax_multiclass_hx_optimized(spfeatures, features, rows, cols, numclasses, + weights, p, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW ); + } + + //pAp = Dot( Hg, p, N ); + cublasCheckError (cublasDdot( cublasHandle, numclasses * cols, Hg, 1, p, 1, pAp ) ); + + //alpha = delta / pAp + *alpha = -1. 
* (delta / (*pAp) );
+
+		//r = r - alpha * Ap
+		cublasCheckError (cublasDaxpy( cublasHandle, numclasses * cols, alpha, Hg, 1, r, 1 ) );
+
+		// x = x + alpha * p
+		*alpha *= -1.;
+		cublasCheckError (cublasDaxpy( cublasHandle, numclasses * cols, alpha, p, 1, x, 1 ) );
+
+		// rel_res = norm(r) / norm(b)
+		cublasCheckError (cublasDnrm2( cublasHandle, numclasses * cols, r, 1, &rnorm) );
+		*rel_residual = rnorm / gradient_norm;
+
+		if (*rel_residual < *best_rel_residual) {
+			*best_rel_residual = *rel_residual;
+			cudaMemcpy( x_best, x, numclasses * cols * sizeof(real), cudaMemcpyDeviceToDevice );
+		}
+
+		//h = r
+		cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, r, 1, h, 1) );
+
+		prev_delta = delta;
+
+		//delta = r' * h
+		cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, r, 1, h, 1, &delta) );
+
+		//p = h + (delta/prev_delta) * p;
+		*alpha = delta / prev_delta;
+		cublasCheckError( cublasDscal( cublasHandle, numclasses * cols, alpha, p, 1) );
+		*alpha = 1;
+		cublasCheckError( cublasDaxpy( cublasHandle, numclasses * cols, alpha, h, 1, p, 1) );
+
+		//increment the iteration count here
+		cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, r, 1, &rnorm) );
+		*rel_residual = rnorm / gradient_norm;
+		iter += 1;
+	}
+
+	return iter;
+}
diff --git a/code/cuda/RC-FINAL-5/conjugate_gradient.h b/code/cuda/RC-FINAL-5/conjugate_gradient.h
new file mode 100644
index 0000000..a14ad92
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/conjugate_gradient.h
@@ -0,0 +1,16 @@
+#ifndef _H_CONJUGATE_GRADIENT__
+#define _H_CONJUGATE_GRADIENT__
+
+#include <cuda_types.h>
+#include <dataset.h>
+#include <newton_cg.h>
+
+
+int Cublas_CG_Logistic( DeviceDataset *data, NEWTON_CG_PARAMS *params,
+		real *g, real *x, real *x_best, real *rel_residual,
+		real *devPtr, real *hostPtr, real *pgeLckPtr);
+int Cublas_CG_multi_optimized (SparseDataset *, real *, real *, real *, real *, real *, real, int , int , int ,
+		real *, real *, real *, real *, int , real, real *, real *,
+		SparseDataset *, real *, SparseDataset *, int, int );
+#endif
+
diff --git a/code/cuda/RC-FINAL-5/cuda_environment.c b/code/cuda/RC-FINAL-5/cuda_environment.c
new file mode 100644
index 0000000..bad24db
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/cuda_environment.c
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <cuda_environment.h>
+
+#include <cuda_utils.h>
+#include <utils.h>
+
+void cuda_env_init(SCRATCH_AREA *scratch, int gpu){
+	//cudaSetDevice (0);
+	cudaSetDevice (gpu);
+	cudaCheckError ();
+
+	cudaDeviceReset ();
+	cudaDeviceSynchronize ();
+
+	allocate_memory( (void **)&scratch->hostWorkspace, (size_t)HOST_WORKSPACE_SIZE );
+	cuda_malloc( (void **)&scratch->devWorkspace, DEVICE_WORKSPACE_SIZE, 1, ERR_MEM_ALLOC );
+	cuda_malloc_host ((void **)&scratch->pageLckWorkspace, PAGE_LOCKED_WORKSPACE_SIZE, 0, ERR_MEM_ALLOC );
+
+	cublasCheckError( cublasCreate( &cublasHandle ) );
+	cusparseCheckError( cusparseCreate( &cusparseHandle ) );
+
+	allocate_memory( (void **)&dscratch, (size_t)DEBUG_SCRATCH_SIZE);
+
+	srand( time(NULL) );
+}
+
+void cuda_env_cleanup (SCRATCH_AREA *scratch){
+	release_memory( (void **)&scratch->hostWorkspace );
+	cuda_free ((void *)scratch->devWorkspace, ERR_MEM_FREE);
+	cuda_free_host ( (void *)scratch->pageLckWorkspace, ERR_MEM_FREE );
+
+	release_memory( (void **)&dscratch);
+}
diff --git a/code/cuda/RC-FINAL-5/cuda_environment.h b/code/cuda/RC-FINAL-5/cuda_environment.h
new file mode 100644
index 0000000..65d1824
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/cuda_environment.h
@@ -0,0 +1,9 @@
+#ifndef _H_CUDA_ENVIRONMENT__
+#define _H_CUDA_ENVIRONMENT__
+
+#include "cuda_types.h"
+
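+// cuda_env_init selects the GPU, allocates the host, device, and page-locked
+// workspaces, and creates the global cuBLAS/cuSPARSE handles;
+// cuda_env_cleanup releases all of them.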
+void cuda_env_init (SCRATCH_AREA *, int); +void cuda_env_cleanup (SCRATCH_AREA *); + +#endif diff --git a/code/cuda/RC-FINAL-5/cuda_types.h b/code/cuda/RC-FINAL-5/cuda_types.h new file mode 100644 index 0000000..c37c355 --- /dev/null +++ b/code/cuda/RC-FINAL-5/cuda_types.h @@ -0,0 +1,76 @@ +#ifndef _H_CUDA_TYPES__ +#define _H_CUDA_TYPES__ + +#include "cuda.h" +#include "cublas_v2.h" +#include "cusparse_v2.h" + +#include "cuda_runtime.h" +#include "cuda_runtime_api.h" +#include "device_launch_parameters.h" +#include "host_defines.h" + +#include +#include +#include + +#define HOST __host__ +#define DEVICE __device__ +#define GLOBAL __global__ +#define HOST_DEVICE __host__ __device__ + + +#define real double +#define SQRT sqrt + +#define HOST_WORKSPACE_SIZE ((1 * 1024 * 1024 * 1024) + (512 * 1024 * 1024)) +//#define DEVICE_WORKSPACE_SIZE ((1 * 1024 * 1024 * 1024) + (512 * 1024 * 1024)) +#define DEVICE_WORKSPACE_SIZE 1 * 1024 * 1024 * 1024 +#define PAGE_LOCKED_WORKSPACE_SIZE 1024 * 1024 + +#define DEBUG_SCRATCH_SIZE 10 * 1024 * 1024 + +#define ERROR_MEM_ALLOC 0x01 +#define ERROR_MEM_CLEANUP 0x02 +#define ERROR_MEMCPY_DEVICE_HOST 0x03 + +#define ERR_MEM_ALLOC 0x04 +#define ERR_MEM_FREE 0x05 + +#define ERROR_MEMCPY_TRAINSET 0x06 +#define ERROR_MEMCPY_TESTSET 0x07 +#define ERROR_MEMCPY_TRAINLABELS 0x08 +#define ERROR_MEMCPY_TESTLABELS 0x09 + +#define ERROR_DEBUG 0x10 +#define ERROR_MEM_SET 0x11 + +#define ERROR_MEMCPY_DEVICE_DEVICE 0x12 +#define ERROR_MEMCPY_HOST_DEVICE 0x13 + +#define CUDA_BLOCK_SIZE 1024 + +#define WARP_SIZE 32 +#define THREADS_PER_ROW 64 + + +//#define HESSIAN_SAMPLING_SIZE 1 +//#define GRADIENT_SAMPLING_SIZE 5 +//#define HESSIAN_SAMPLING_SIZE 25 +//#define GRADIENT_SAMPLING_SIZE 50 + +extern int BLOCKS, BLOCK_SIZE, BLOCKS_POW_2; +extern int HESSIAN_SAMPLING_SIZE, GRADIENT_SAMPLING_SIZE; + + +extern cublasHandle_t cublasHandle; +extern cusparseHandle_t cusparseHandle; +typedef struct scratch_space{ + real *hostWorkspace; + real *devWorkspace; + real *pageLckWorkspace; + } SCRATCH_AREA; + +extern void* dscratch; + +#endif diff --git a/code/cuda/RC-FINAL-5/cuda_utils.c b/code/cuda/RC-FINAL-5/cuda_utils.c new file mode 100644 index 0000000..baf627f --- /dev/null +++ b/code/cuda/RC-FINAL-5/cuda_utils.c @@ -0,0 +1,145 @@ + +#include "cuda_utils.h" +#include "cuda_types.h" + +void cuda_malloc (void **ptr, unsigned int size, int memset, int err_code) { + + cudaError_t retVal = cudaSuccess; + retVal = cudaMalloc (ptr, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to allocate memory on device for the res: %d... exiting with code: %d size: %d, %s \n", + err_code, retVal, size, cudaGetErrorString(retVal)); + exit (err_code); + } + + if (memset) { + retVal = cudaMemset (*ptr, 0, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to memset memory on device... exiting with code %d, %s\n", + err_code, cudaGetErrorString( retVal )); + exit (err_code); + } + } +} + +void cuda_malloc_host (void **ptr, unsigned int size, int memset, int err_code) { + + cudaError_t retVal = cudaSuccess; + retVal = cudaMallocHost (ptr, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to allocate memory on device for the res: %d... exiting with code: %d size: %d, %s \n", + err_code, retVal, size, cudaGetErrorString(retVal) ); + exit (err_code); + } + + if (memset) { + retVal = cudaMemset (*ptr, 0, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to memset memory on device... 
exiting with code %d, %s\n",
+				err_code, cudaGetErrorString( retVal ));
+			exit (err_code);
+		}
+	}
+}
+
+
+
+void cuda_free (void *ptr, int err_code) {
+
+	cudaError_t retVal = cudaSuccess;
+	if (!ptr) return;
+
+	retVal = cudaFree (ptr);
+
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "Failed to release memory on device for res %d... error code %d -- Address %ld, %s\n",
+				err_code, retVal, (long int)ptr, cudaGetErrorString( retVal ));
+		return;
+	}
+}
+
+void cuda_free_host (void *ptr, int err_code) {
+
+	cudaError_t retVal = cudaSuccess;
+	if (!ptr) return;
+
+	retVal = cudaFreeHost (ptr);
+
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "Failed to release memory on host for res %d... error code %d -- Address %ld, %s\n",
+				err_code, retVal, (long int)ptr, cudaGetErrorString( retVal ));
+		return;
+	}
+}
+
+
+void cuda_memset (void *ptr, int data, size_t count, int err_code){
+	cudaError_t retVal = cudaSuccess;
+
+	retVal = cudaMemset (ptr, data, count);
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "ptr passed is %ld \n", (long int)ptr);
+		fprintf (stderr, " size to memset: %ld \n", count);
+		fprintf (stderr, " target data is : %d \n", data);
+		fprintf (stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d, %s\n",
+				err_code, retVal, cudaGetErrorString( retVal ));
+		exit (err_code);
+	}
+}
+
+void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid)
+{
+	cudaError_t retVal = cudaErrorNotReady;
+
+	if (dir == cudaMemcpyHostToDevice)
+		retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
+	else
+		retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
+
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "could not copy resource %d between host and device: reason %d:%s \n",
+				resid, retVal, cudaGetErrorString( retVal ));
+		exit (resid);
+	}
+}
+
+void copy_device (void *dest, void *src, int size, int resid)
+{
+	cudaError_t retVal = cudaErrorNotReady;
+
+	retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "could not copy resource %d from device to device: reason %d \n",
+				resid, retVal);
+		exit (resid);
+	}
+}
+
+void print_device_mem_usage ()
+{
+	size_t total, free;
+	cudaMemGetInfo (&free, &total);
+	if (cudaGetLastError () != cudaSuccess )
+	{
+		fprintf (stderr, "Error on the memory call \n");
+		return;
+	}
+
+	fprintf (stderr, "Total %ld bytes, %ld MB, %ld GB; free %ld bytes, %ld MB, %ld GB \n",
+			total, total/(1024*1024), total/ (1024*1024*1024),
+			free, free/(1024*1024), free/ (1024*1024*1024) );
+}
+
+void compute_blocks ( int *blocks, int *block_size, int count )
+{
+	*block_size = CUDA_BLOCK_SIZE;
+	*blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ?
0 : 1); +} + +void compute_nearest_pow_2 (int blocks, int *result) +{ + int power = 1; + while (power < blocks) power *= 2; + + *result = power; +} diff --git a/code/cuda/RC-FINAL-5/cuda_utils.h b/code/cuda/RC-FINAL-5/cuda_utils.h new file mode 100644 index 0000000..e864e60 --- /dev/null +++ b/code/cuda/RC-FINAL-5/cuda_utils.h @@ -0,0 +1,88 @@ +#ifndef __CUDA_UTILS_H_ +#define __CUDA_UTILS_H_ + +#include "cuda.h" +#include "cuda_runtime_api.h" +#include "cublas_v2.h" +#include "cusparse_v2.h" +#include "stdlib.h" +#include "stdio.h" +#include "curand.h" + + +void cuda_malloc (void **, unsigned int , int , int); +void cuda_malloc_host( void **, unsigned int, int, int ); + +void cuda_free (void *, int); +void cuda_free_host (void *, int); +void cuda_memset (void *, int , size_t , int ); + +void copy_host_device (void *, void *, int , enum cudaMemcpyKind, int); +void copy_device (void *, void *, int , int ); + +void print_device_mem_usage (); + +#define cusparseCheckError(cusparseStatus) __cusparseCheckError (cusparseStatus, __FILE__, __LINE__) +inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line ) +{ +if (cusparseStatus!= CUSPARSE_STATUS_SUCCESS) +{ + //fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cusparseStatus); + fprintf (stderr, "failed .. %s:%d -- error code %d \n", file, line, cusparseStatus); + exit (-1); +} +return; +} + + +#define cublasCheckError(cublasStatus) __cublasCheckError (cublasStatus, __FILE__, __LINE__) +inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line ) +{ +if (cublasStatus!= CUBLAS_STATUS_SUCCESS) +{ + fprintf (stderr, "failed .. %s:%d -- error code %d \n", file, line, cublasStatus); + exit (-1); +} +return; +} + +#define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) +inline void __cudaCheckError( const char *file, const int line ) +{ + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) + { + fprintf (stderr, "Failed .. %s:%d -- gpu erro code %d:%s\n", file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. + /* + err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) + { + exit( -1 ); + } + */ + return; +} + +#define curandCheckError(curandStatus) __curandCheckError (curandStatus, __FILE__, __LINE__) +inline void __curandCheckError( curandStatus_t curandStatus, const char *file, const int line ) +{ + if (curandStatus!= CURAND_STATUS_SUCCESS) + { + fprintf (stderr, "failed .. 
%s:%d -- error code %d \n", file, line, curandStatus);
+		exit (-1);
+	}
+	return;
+}
+
+
+
+void compute_blocks ( int *blocks, int *block_size, int count );
+void compute_nearest_pow_2 (int blocks, int *result);
+
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/dataset.c b/code/cuda/RC-FINAL-5/dataset.c
new file mode 100644
index 0000000..91eca1b
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/dataset.c
@@ -0,0 +1,1217 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <math.h>
+#include <dataset.h>
+#include <cuda_utils.h>
+
+#include <utils.h>
+
+#define SAMPLING_BUFFER_EXTENSION 10
+
+#define MAX_LINE 256 * 1024
+#define MAX_IDX 256 * 1024
+
+#define HEAP_LINE_SIZE 4 * 1024 * 1024
+
+#define CIFAR_LINE_SIZE 3073
+
+void swap (real *a, real *b, real *t){
+	*t = *a;
+	*a = *b;
+	*b = *t;
+}
+
+real findMaxInDataset( real *src, int rows, int cols )
+{
+	real maxval = 0;
+	for (int i = 0; i < rows * cols; i ++)
+		if (maxval < src[ i ]) maxval = src[i];
+	return maxval;
+}
+
+void preprocessDataset( real *src, int rows, int cols, real maxval)
+{
+	if (maxval > 0){
+		for (int i = 0; i < rows * cols; i ++)
+			//src[i] = maxval - src[i];
+			src[i] = src[i] - maxval;
+	}
+}
+
+void convertToColumnMajor (real *src, int rows, int cols, real *tgt ) {
+	for (int i = 0; i < rows; i ++ )
+		for (int j = 0; j < cols; j ++)
+			tgt[j * rows + i] = src[i * cols + j];
+}
+
+void convertRowStochastic( real *src, int rows, int cols ) {
+	real sum = 0;
+	for (int i = 0; i < rows; i ++ ) {
+		sum = 0;
+		for (int j = 0; j < cols; j ++)
+			sum += src[ i * cols + j ];
+		for (int j = 0; j < cols; j ++)
+			src[ i * cols + j ] = src[ i * cols + j] / sum;
+	}
+}
+
+void convertColumnStochastic( real *src, int rows, int cols ){
+	real maxval = 0;
+	for (int c = 0; c < cols; c ++){
+		maxval = src[ c * rows ];
+		for (int r = 1; r < rows; r ++){
+			if (maxval < src[ c * rows + r ])
+				maxval = src[ c * rows + r ];
+		}
+
+		if (maxval > 1) {
+			for (int r = 0; r < rows; r ++){
+				src[ c * rows + r] /= maxval;
+			}
+		}
+		//fprintf( stderr, " Done with Column: %d, maxval: %f \n", c, maxval );
+	}
+}
+
+void columnNormalize( real *src, int rows, int cols, real *train, int tr ){
+	real norm = 0;
+	for (int c = 0; c < cols; c ++){
+		norm = pow( src[ c * rows ], 2. );
+		for (int r = 1; r < rows; r ++) {
+			norm += pow( src[ c * rows + r ], 2. );
+		}
+		for (int r = 0; r < tr; r ++){
+			norm += pow( train[ c * tr + r ], 2. );
+		}
+
+		if (norm > 1e-8) {
+			norm = sqrt( norm );
+			for (int r = 0; r < rows; r ++)
+				src[ c * rows + r ] /= norm;
+
+			for (int r = 0; r < tr; r ++)
+				train[ c * tr + r ] /= norm;
+		}
+	}
+}
+
+real computeMaxValue (real *labels, int count ) {
+	real maxval = 0;
+	for (int i = 0; i < count; i ++ )
+		if (maxval < labels[i] )
+			maxval = labels[i];
+
+	return maxval;
+}
+
+void writeDataset( real *features, real *labels, int rows, int cols, char *filename, char *vectorname)
+{
+	FILE *dataset_file;
+
+	if ( (dataset_file = fopen(filename, "w")) == NULL ) {
+		fprintf( stderr, "Error opening the dataset.... !\n" );
+		exit( -1 );
+	}
+
+	for (int i = 0; i < rows; i ++){
+		fprintf (dataset_file, "%4.6f", features[ i * cols ] );
+		for (int j = 1; j < cols; j ++){
+			fprintf( dataset_file, ",%4.6f", features[ i * cols + j ] );
+		}
+		fprintf( dataset_file, "\n");
+	}
+	fclose (dataset_file);
+
+	if ( (dataset_file = fopen(vectorname, "w")) == NULL ) {
+		fprintf( stderr, "Error opening the labels....
!\n" ); + exit( -1 ); + } + + for (int i = 0; i < rows; i ++){ + fprintf (dataset_file, "%d\n", (int)labels[ i ] ); + } + fclose (dataset_file); +} + +void readBinaryMatFile( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset) +{ + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + int NUM_CLASSES = 20; + size_t output; + int idx = 0; + int i; + real cols[3]; + int max_train_col, max_test_col; + int max_train_row, max_test_row; + + char filename[MAX_LINE]; + + real *scratch = s->hostWorkspace; + + int *train_row_id, *train_col_id; + int *test_row_id, *test_col_id; + + real *train_val, *train_vec; + real *test_val, *test_vec; + + int train_nnz, test_nnz; + + int cur_column; + int *rowPtr, *colPtr; + real *valPtr, *labelPtr; + int rowNNZ; + int minCol = 1000000; + + char *heapLine = (char *)malloc (HEAP_LINE_SIZE); + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + max_train_row = max_train_col = 0; + train_nnz = 0; + while (!feof( dataset_file) ){ + memset( heapLine, 0, HEAP_LINE_SIZE); + + fgets( heapLine, HEAP_LINE_SIZE, dataset_file); + if (heapLine[0] == 0) break; + + cur_column = tokenize_binary_string( heapLine, 0, &train_nnz); + + if (max_train_col < cur_column) max_train_col = cur_column; + if (minCol > cur_column) minCol = cur_column; + + numLines ++; + } + max_train_row = numLines; + + fclose( dataset_file ); + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d), NNZ: %d, %d\n", + numLines, max_train_row, max_train_col, train_nnz, minCol ); + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + max_test_row = max_test_col = numLines = 0; + test_nnz = 0; + minCol = 10000000; + while (!feof( dataset_file) ){ + memset( heapLine, 0, HEAP_LINE_SIZE); + + fgets( heapLine, HEAP_LINE_SIZE, dataset_file); + cur_column = tokenize_binary_string( heapLine, 0, &test_nnz); + + if (max_test_col < cur_column) max_test_col = cur_column; + if (minCol > cur_column) minCol = cur_column; + + if (heapLine[0] == 0) break; + numLines ++; + } + max_test_row = numLines; + fclose( dataset_file ); + + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d ), NNZ: %d, %d \n", + numLines, max_test_row, max_test_col, test_nnz, minCol ); + + if (max_train_col < max_test_col ){ + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_test_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_test_col ); + } else { + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_train_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_train_col ); + } + + //Read the matrices Here. + train_row_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_col_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_val = (real *) malloc ( train_nnz * sizeof (real) ); + train_vec = (real *) malloc ( max_train_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + rowPtr = train_row_id; + colPtr = train_col_id; + valPtr = train_val; + labelPtr = train_vec; + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + + rowNNZ = tokenize_binary_populate( line, 0, rowPtr, colPtr, valPtr, labelPtr, numLines ); + rowPtr += rowNNZ; + colPtr += rowNNZ; + valPtr += rowNNZ; + + numLines ++; + } + fclose( dataset_file ); + + for (int i = 0; i < numLines; i ++) + if (train_vec[i] == -1) train_vec[i] = 2; + + fprintf( stderr, "Done populating the training part ... \n"); + + //Read the test dataset here. + test_row_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_col_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_val = (real *) malloc ( test_nnz * sizeof (real) ); + test_vec = (real *) malloc ( max_test_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + rowPtr = test_row_id; + colPtr = test_col_id; + valPtr = test_val; + labelPtr = test_vec; + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + rowNNZ = tokenize_binary_populate( line, 0, rowPtr, colPtr, valPtr, labelPtr, numLines); + rowPtr += rowNNZ; + colPtr += rowNNZ; + valPtr += rowNNZ; + + if (line[0] == 0) break; + + numLines ++; + } + fclose( dataset_file ); + for (int i = 0; i < numLines; i ++) + if (test_vec[i] == -1) test_vec[i] = 2; + + fprintf( stderr, "Done populating the testing part ... \n"); + + //form the cuSparseMatrix Here. + data->trainRowPtr = train_row_id; + data->trainColPtr = train_col_id; + data->trainValPtr = train_val; + data->trainLabels = train_vec; + + data->testRowPtr = test_row_id; + data->testColPtr = test_col_id; + data->testValPtr = test_val; + data->testLabels = test_vec; + + data->numclasses = 1; + data->rows = max_test_row + max_train_row; + data->trainSize = max_train_row; + data->testSize = max_test_row; + + data->trainNNZ = train_nnz; + data->testNNZ = test_nnz; + + data->trainSet = NULL; + data->testSet = NULL; + + if (max_train_col < max_test_col ) + data->cols = max_test_col; + else + data->cols = max_train_col; + + data->trainSet = NULL; + data->testSet = NULL; + + free(heapLine ); +} + +void readNewsgroupsDataset( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset) +{ + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + int NUM_CLASSES = 20; + size_t output; + int idx = 0; + int i; + real cols[3]; + int max_train_col, max_test_col; + int max_train_row, max_test_row; + + char filename[MAX_LINE]; + + real *scratch = s->hostWorkspace; + + int *train_row_id, *train_col_id; + int *test_row_id, *test_col_id; + + real *train_val, *train_vec; + real *test_val, *test_vec; + + int train_nnz, test_nnz; + + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + max_train_row = max_train_col = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + + tokenize_string( line, cols, 0 ); + + if (max_train_row < cols[0]) max_train_row = cols[0]; + if (max_train_col < cols[1]) max_train_col = cols[1]; + + numLines ++; + } + train_nnz = numLines; + fclose( dataset_file ); + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d) \n", + numLines, max_train_row, max_train_col ); + + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + max_test_row = max_test_col = numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + tokenize_string( line, cols, 0 ); + + if (max_test_row < cols[0]) max_test_row = cols[0]; + if (max_test_col < cols[1]) max_test_col = cols[1]; + + if (line[0] == 0) break; + numLines ++; + } + test_nnz = numLines; + fclose( dataset_file ); + + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d) \n", + numLines, max_test_row, max_test_col ); + + if (max_train_col < max_test_col ){ + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_test_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_test_col ); + } else { + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_train_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_train_col ); + } + + //Read the matrices Here. + train_row_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_col_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_val = (real *) malloc ( train_nnz * sizeof (real) ); + train_vec = (real *) malloc ( max_train_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + + tokenize_string( line, cols, 0 ); + + train_row_id[ numLines ] = (int)(cols[0] - 1); + train_col_id[ numLines ] = (int)(cols[1] - 1); + train_val[ numLines ] = (real)cols[2]; + + //fprintf( stderr, " %d, %d, %f \n", train_row_id[ numLines ], train_col_id[ numLines ], train_val [numLines ] ); + + numLines ++; + } + fclose( dataset_file ); + + //vector here. + i = readVector( train_vec, max_train_row, f_train_labels, offset ); + fprintf( stderr, "Labels read from file: %d, expected : %d \n", i, max_train_row ); + + //compute the NUM_CLASSES Here. + NUM_CLASSES = computeMaxValue( train_vec, max_train_row); + + //Read the test dataset here. + test_row_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_col_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_val = (real *) malloc ( test_nnz * sizeof (real) ); + test_vec = (real *) malloc ( max_test_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + tokenize_string( line, cols, 0 ); + + if (line[0] == 0) break; + test_row_id[ numLines ] = (int)(cols[0] - 1); + test_col_id[ numLines ] = (int)(cols[1] - 1); + test_val[ numLines ] = (real)cols[2]; + + numLines ++; + } + fclose( dataset_file ); + + //vector here. + i = readVector( test_vec, max_test_row, f_test_labels, offset ); + + //form the cuSparseMatrix Here. + data->trainRowPtr = train_row_id; + data->trainColPtr = train_col_id; + data->trainValPtr = train_val; + data->trainLabels = train_vec; + + data->testRowPtr = test_row_id; + data->testColPtr = test_col_id; + data->testValPtr = test_val; + data->testLabels = test_vec; + + data->numclasses = NUM_CLASSES - 1; + data->rows = max_test_row + max_train_row; + data->trainSize = max_train_row; + data->testSize = max_test_row; + + data->trainNNZ = train_nnz; + data->testNNZ = test_nnz; + + data->trainSet = NULL; + data->testSet = NULL; + + if (max_train_col < max_test_col ) + data->cols = max_test_col; + else + data->cols = max_train_col; + + data->trainSet = NULL; + data->testSet = NULL; + + // preprocess the dataset here. + /* + real train_max = findMaxInDataset( data->trainValPtr, data->trainNNZ, 1 ); + real test_max = findMaxInDataset( data->testValPtr, data->testNNZ, 1 ); + fprintf( stderr, "Train max: %f, Test max: %f \n", train_max, test_max ); + + if (train_max < test_max){ + preprocessDataset ( data->trainValPtr, data->trainNNZ, 1, test_max ); + preprocessDataset ( data->testValPtr, data->testNNZ, 1, test_max ); + } else { + preprocessDataset ( data->trainValPtr, data->trainNNZ, 1, train_max ); + preprocessDataset ( data->testValPtr, data->testNNZ, 1, train_max ); + } + */ +} + +void readCIFARDataset( char *dir, char *train, char *test, ForestDataset *data, SCRATCH_AREA *s, int raw) { + + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + int NUM_CLASSES = 10; + size_t output; + int idx = 0; + int i; + int TRAIN_IMAGES = 50000; + int TRAIN_FILES = 5; + + char filename[MAX_LINE]; + real *train_set, *train_labels, *test_set, *test_labels; + real *scratch = s->hostWorkspace; + + train_set = (real *) malloc( (size_t)TRAIN_IMAGES * (CIFAR_LINE_SIZE-1) * sizeof(real) ); + train_labels = (real *) malloc ( (size_t)TRAIN_IMAGES * sizeof(real) ); + test_set = (real *) malloc( (size_t)10000 * (CIFAR_LINE_SIZE-1) * sizeof(real) ); + test_labels = (real *) malloc ( (size_t)10000 * sizeof(real) ); + + fprintf( stderr, " Allocated memory for the dataset : %lu \n", TRAIN_IMAGES * (CIFAR_LINE_SIZE-1) * sizeof(real)); + fprintf( stderr, " Allocated memory for the dataset (GB): %d \n", (TRAIN_IMAGES * (CIFAR_LINE_SIZE-1) * sizeof(real)) / (1024 * 1024 * 1024)); + + numLines = 0; + for (idx = 1; idx <= TRAIN_FILES; idx ++) { + sprintf( filename, "%s%s%d.bin", dir, train, idx); + fprintf( stderr, "Reading file : %s \n", filename ); + + if ( (dataset_file = fopen(filename, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + output = fread( line, (size_t)1, (size_t)CIFAR_LINE_SIZE, dataset_file); + + if (output <= 0) break; + + train_labels[ numLines ] = line[0] + 1; + for (i = 0; i < CIFAR_LINE_SIZE-1; i ++) + train_set[ numLines * (CIFAR_LINE_SIZE - 1) + i ] = (unsigned char) line[i + 1]; + + numLines ++; + } + } + fprintf( stderr, "Done with reading %d points from the input files .... \n", numLines ); + + //test data here. + numLines = 0; + memset( filename, 0, MAX_LINE ); + sprintf( filename, "%s%s", dir, test); + + if ( (dataset_file = fopen(filename, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + output = fread( line, (size_t)1, (size_t)CIFAR_LINE_SIZE, dataset_file); + if (output <= 0) break; + + test_labels[ numLines ] = line[0] + 1; + for (i = 0; i < CIFAR_LINE_SIZE - 1; i ++) + test_set[ numLines * (CIFAR_LINE_SIZE - 1) + i ] = (unsigned char) line[i + 1]; + + numLines ++; + } + fprintf( stderr, "Done with reading %d points from the input files .... \n", numLines ); + + //inititalize the device data here. + data->trainSize = TRAIN_IMAGES; + data->testSize = 10000; + data->trainSet = train_set; + data->trainLabels = train_labels; + data->testSet = test_set; + data->testLabels = test_labels; + data->rows = data->trainSize + data->testSize; + data->cols = CIFAR_LINE_SIZE - 1; + data->numclasses = NUM_CLASSES - 1; + + data->trainRowPtr = NULL; + data->trainColPtr = NULL; + data->trainValPtr = NULL; + + data->testRowPtr = NULL; + data->testColPtr = NULL; + data->testValPtr = NULL; + +/* + fprintf(stderr, "Preprocessing .... \n"); + real train_max = findMaxInDataset( train_set, data->trainSize, data->cols ); + real test_max = findMaxInDataset( test_set, data->testSize, data->cols ); + fprintf( stderr, "TrainMax %e and TestMax: %e \n", train_max, test_max ); + + if (train_max >= test_max) { + preprocessDataset( train_set, data->trainSize, data->cols, train_max ); + preprocessDataset( test_set, data->testSize, data->cols, train_max ); + } else { + preprocessDataset( train_set, data->trainSize, data->cols, test_max ); + preprocessDataset( test_set, data->testSize, data->cols, test_max ); + } +*/ + + fprintf( stderr, "Converting to column major format here.... \n"); + //train_features + convertToColumnMajor( train_set, data->trainSize, data->cols, scratch); + fprintf( stderr, "Done with conversion... \n"); + memcpy( train_set, scratch, (size_t)(sizeof(real) * data->trainSize * data->cols) ); + + //test_features + convertToColumnMajor( test_set, data->testSize, data->cols, scratch); + fprintf( stderr, "Done with conversion... \n"); + memcpy( test_set, scratch, (size_t)(sizeof(real) * data->testSize * data->cols) ); + + if (raw == 0){ + fprintf( stderr, "Normalizing the data ... "); + columnNormalize( train_set, data->trainSize, data->cols, test_set, data->testSize ); + fprintf( stderr, "Done... \n"); + } + +} + +void readMultiDataset( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, ForestDataset *data, SCRATCH_AREA *s, int offset, int bias) +{ + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + real temp[MAX_LINE]; + int NUM_CLASSES = -1; + + real *train_set, *train_labels, *test_set, *test_labels; + real *scratch = s->hostWorkspace; + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + + if (line[0] == 0) break; + //data->cols = tokenize_learn_multiclass( line, temp, numLines, NULL, NULL); + data->cols = tokenize_string( line, temp, bias); + numLines ++; + } + + fprintf(stderr, "Number of columns is : %d \n", data->cols ); + fprintf( stderr, "Train Size: %d \n", numLines ); + + //exit (-1); + + data->trainSize = numLines; + /* + train_set = (real *)malloc( (FEATURE_SIZE_MULTI) * data->trainSize); + train_labels = (real *)malloc(sizeof(real) * data->trainSize); + */ + train_set = (real *)malloc( data->cols * data->trainSize * sizeof(real)); + train_labels = (real *)malloc( data->trainSize * sizeof(real)); + + //read the file here and fill the matrix. + rewind( dataset_file ); + numLines = 0; + + while (!feof( dataset_file )){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + tokenize_populate( line, train_set, &numLines, data->cols, bias ); + numLines ++; + } + fclose( dataset_file ); + + //read the train labels here. + fprintf( stderr, " Reading the vector: %s \n", f_train_labels ); + readVector( train_labels, data->trainSize, f_train_labels, offset ); + + //compute the NUM_CLASSES Here. + NUM_CLASSES = computeMaxValue( train_labels, data->trainSize ); + + //read the test dataset here. + fprintf( stderr, " Reading the test Matrix: %s \n", f_test_features ); + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + + if (line[0] == 0) break; + //data->cols = tokenize_learn_multiclass( line, temp, numLines, NULL, NULL); + data->cols = tokenize_string( line, temp, bias ); + numLines ++; + } + + fprintf(stderr, "Test size: %d \n", numLines ); + fprintf( stderr, "Number of features for test set: %d \n", data->cols ); + + data->testSize = numLines; + /* + test_set = (real *)malloc( (FEATURE_SIZE_MULTI) * data->testSize); + test_labels = (real *)malloc(sizeof(real) * data->testSize); + */ + test_set = (real *)malloc( data->cols * data->testSize * sizeof(real)); + test_labels = (real *)malloc(data->testSize * sizeof(real)); + + //read the test set + rewind( dataset_file ); + numLines = 0; + + while (!feof( dataset_file )){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + tokenize_populate( line, test_set, &numLines, data->cols, bias ); + numLines ++; + } + fclose( dataset_file ); + + //read the test labels here. + readVector( test_labels, numLines, f_test_labels, offset ); + real testMax = computeMaxValue( test_labels, numLines ); + if (testMax > NUM_CLASSES) + NUM_CLASSES = (int) testMax; + + + //initialization here. + data->trainSet = train_set; + data->trainLabels = train_labels; + data->testSet = test_set; + data->testLabels = test_labels; + data->rows = data->trainSize + data->testSize; + data->numclasses = NUM_CLASSES - 1; + + data->trainRowPtr = NULL; + data->trainColPtr = NULL; + data->trainValPtr = NULL; + + data->testRowPtr = NULL; + data->testColPtr = NULL; + data->testValPtr = NULL; + + //preprocessing step here. 
+ /* + real train_max = findMaxInDataset( train_set, data->trainSize, data->cols ); + real test_max = findMaxInDataset( test_set, data->testSize, data->cols ); + + if (train_max >= test_max) { + preprocessDataset( train_set, data->trainSize, data->cols, train_max ); + preprocessDataset( test_set, data->testSize, data->cols, train_max ); + } else { + preprocessDataset( train_set, data->trainSize, data->cols, test_max ); + preprocessDataset( test_set, data->testSize, data->cols, test_max ); + } + */ + + //train_features + convertToColumnMajor( train_set, data->trainSize, data->cols, scratch); + memcpy( train_set, scratch, sizeof(real) * data->trainSize * data->cols ); + + //test_features + convertToColumnMajor( test_set, data->testSize, data->cols, scratch); + memcpy( test_set, scratch, sizeof(real) * data->testSize * data->cols ); + + //DEBUG HERE. + /* + fprintf (stderr, "Train Set Here \n"); + for (int i = 0; i < data->trainSize; i ++) + fprintf( stderr, " %2.2f ", train_set[ i ] ); + fprintf( stderr, "\n"); + + fprintf( stderr, "Labels here \n"); + for (int i = 0; i < data->trainSize; i ++) + fprintf( stderr, " %2.2f ", train_labels[ i ] ); + fprintf (stderr, "\n"); + */ +} + +int tokenize_binary_populate( char *line, int bias, int *row, int *col, real *val, real *label, int rowNum ) +{ + const char *sep = ", \n"; + char *word, *ptr; + char temp[MAX_LINE]; + int index = 0; + int len = 0; + + char col_str[32]; + + if (bias >= 1){ + *row = rowNum; row ++; + *col = 0; col ++; + *val = 1; val ++; + + index = 1; + } + + strncpy( temp, line, MAX_LINE ); + for( word = strtok(temp, sep); word; word = strtok(NULL, sep) ) + { + memset( col_str, 0, sizeof(char) * 32); + memcpy( col_str, word, 31); + len = 0; + + ptr = col_str; + while (*ptr != 0 && *ptr != ':'){ + ptr ++; + len ++; + } + + if (*ptr == ':') { + *ptr = 0; + + *row = rowNum; row ++; + *col = atoi( col_str) - 1; col ++; + *val = atof( col_str + len + 1); val ++; + index ++; + + } else { + label[rowNum] = atof( word ); + } + } + + return index; +} + +int tokenize_binary_string( char *line, int bias, int *nnz) +{ + const char *sep = ", \n"; + char *word, *ptr; + //char temp[MAX_LINE]; + int index = 0; + int col = 0; + real val = 0; + int len = 0; + + char col_str[32]; + + if (bias >= 1) index = 1; + + for( word = strtok(line, sep); word; word = strtok(NULL, sep) ) + { + col = val = -99; + memset( col_str, 0, 32); + + strncpy( col_str, word, 31 ); + ptr = col_str; + + len = 0; + while (*ptr != 0 && *ptr != ':'){ + ptr ++; + len ++; + } + + if (*ptr == ':') { + *ptr = 0; + col = atoi( col_str ); // to account for zero here. 
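+			// Each token is a "col:val" pair: the text before ':' is the
+			// 1-based column index, the text after it is the feature value.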
+			val = atof( col_str + len + 1 );
+
+			(*nnz) ++;
+		}
+	}
+
+	return col;
+}
+
+
+int tokenize_string( char *line, real *out, int bias )
+{
+	const char *sep = ", \n";
+	char *word;
+	char temp[MAX_LINE];
+	int index = 0;
+
+	if (bias >= 1) index = 1;
+
+	strncpy( temp, line, MAX_LINE );
+	for( word = strtok(temp, sep); word; word = strtok(NULL, sep) ) out[ index ++] = atof( word );
+
+	return index;
+}
+
+void tokenize_populate(char *line, real *train_set, int *count, int size, int bias){
+
+	const char *sep = ", \n";
+	char *word;
+	char temp[MAX_LINE];
+	int index = 0;
+	real cur_row[MAX_LINE];
+
+	if (bias >= 1) cur_row[ index ++ ] = 1;
+
+	strncpy( temp, line, MAX_LINE );
+	for( word = strtok(temp, sep); word; word = strtok(NULL, sep) ) cur_row[ index ++] = atof( word );
+	memcpy( &train_set[ (*count) * (size)], cur_row, sizeof(real) * size);
+}
+
+
+void printDataset( ForestDataset *t)
+{
+	fprintf( stderr, "--------------------");
+	fprintf( stderr, "Train Row 1: ");
+	for (int i = 0; i < 52; i ++)
+		fprintf( stderr, " %f ", t->trainSet[i] );
+	fprintf( stderr, "\n");
+	fprintf( stderr, "Test Row 1: ");
+	for (int i = 0; i < 52; i ++)
+		fprintf( stderr, " %f ", t->testSet[i] );
+	fprintf( stderr, "\n");
+
+	fprintf( stderr, "Train Labels: \n");
+	for (int i = 0; i < t->trainSize; i ++)
+		fprintf (stderr, " %f ", t->trainLabels[i] );
+	fprintf( stderr, "\n");
+
+	fprintf( stderr, "Test Labels: \n");
+	for (int i = 0; i < 200; i ++)
+		fprintf (stderr, " %f ", t->testLabels[i] );
+	fprintf( stderr, "\n");
+	fprintf( stderr, "--------------------\n");
+}
+
+//
+//
+// Device Functions here.
+//
+//
+void initialize_device_data( ForestDataset *s, DeviceDataset *t)
+{
+	t->rows = s->trainSize;
+	t->cols = s->cols;
+	t->testSize = s->testSize;
+	t->numclasses = s->numclasses;
+
+	cuda_malloc( (void **)&t->trainSet, t->rows * t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->trainSet, t->trainSet, t->rows * t->cols * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+	cuda_malloc( (void **)&t->trainLabels, t->rows * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->trainLabels, t->trainLabels, t->rows * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINLABELS );
+
+	cuda_malloc( (void **)&t->testSet, t->testSize * t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->testSet, t->testSet, t->testSize * t->cols * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+	cuda_malloc( (void **)&t->testLabels, t->testSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->testLabels, t->testLabels, t->testSize * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTLABELS );
+
+	if (t->numclasses > 1)
+		cuda_malloc( (void **)&t->weights, t->numclasses * t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+	else
+		cuda_malloc( (void **)&t->weights, t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+
+#ifdef __debug__
+	fprintf (stderr, " -------------- \n");
+	fprintf( stderr, "Train Set size: %d %d, %d \n", t->rows, t->cols, t->testSize );
+	fprintf (stderr, " -------------- \n");
+#endif
+
+	t->spTrain.rowPtr = NULL;
+	t->spTrain.colPtr = NULL;
+	t->spTrain.valPtr = NULL;
+	t->spTrain.rowCsrPtr = NULL;
+
+	t->spTest.rowPtr = NULL;
+	t->spTest.colPtr = NULL;
+	t->spTest.valPtr = NULL;
+	t->spTest.rowCsrPtr = NULL;
+
+	//printVector (t->testSet, t->testSize, NULL);
+	//printVector( t->trainLabels, t->rows, s->trainLabels );
+
+
+	//sub sampling here.
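+	//The sub-sampled buffers below are sized as a percentage of the training
+	//rows (HESSIAN_SAMPLING_SIZE and GRADIENT_SAMPLING_SIZE are percentages);
+	//the Hessian buffer is over-allocated by a factor of
+	//SAMPLING_BUFFER_EXTENSION so variable-size (non-uniform) samples still fit.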
+ //Hessian part here
+ t->spSampledHessianTrain.nnz = 0;
+ t->spSampledHessianTrain.P = NULL;
+ t->spSampledHessianTrain.sortedVals= NULL;
+ t->spSampledHessianTrain.rowPtr= NULL;
+ t->spSampledHessianTrain.colPtr= NULL;
+ t->spSampledHessianTrain.valPtr= NULL;
+ t->spSampledHessianTrain.rowCsrPtr= NULL;
+
+ t->hessianSampleSize = (SAMPLING_BUFFER_EXTENSION * HESSIAN_SAMPLING_SIZE * t->rows) / 100;
+ t->spHessianSample.nnz = t->hessianSampleSize;
+ cuda_malloc( (void **)&t->sampledHessianTrainSet, t->hessianSampleSize* t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+ //fprintf( stderr, "SubSampled Size for this dataset (Hessian): %d \n", t->hessianSampleSize);
+
+ //spHessianSample
+ cuda_malloc( (void **) &t->spHessianSample.P, t->hessianSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.sortedVals, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.colPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.valPtr, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowCsrPtr, (t->hessianSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //Gradient Sample Here.
+ t->spSampledGradientTrain.nnz = 0;
+ t->spSampledGradientTrain.P = NULL;
+ t->spSampledGradientTrain.sortedVals = NULL;
+ t->spSampledGradientTrain.rowPtr = NULL;
+ t->spSampledGradientTrain.colPtr = NULL;
+ t->spSampledGradientTrain.valPtr = NULL;
+ t->spSampledGradientTrain.rowCsrPtr = NULL;
+
+ t->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * t->rows ) / 100;
+ t->spGradientSample.nnz = t->gradientSampleSize;
+ cuda_malloc( (void **)&t->sampledGradientTrainSet, t->gradientSampleSize* t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **)&t->sampledGradientTrainLabels, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ fprintf( stderr, "SubSampled Size for this dataset (Gradient): %d \n", t->gradientSampleSize);
+
+ //spGradientSample
+ cuda_malloc( (void **) &t->spGradientSample.P, t->gradientSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.sortedVals, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.colPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.valPtr, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowCsrPtr, (t->gradientSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+}
+
+void initialize_device_data_sparse( ForestDataset *s, DeviceDataset *t )
+{
+ t->trainSet = NULL;
+ t->testSet = NULL;
+
+ t->rows = s->trainSize;
+ t->cols = s->cols;
+ t->testSize = s->testSize;
+ t->numclasses = s->numclasses;
+
+ //t->trainNNZ = s->trainNNZ;
+ //t->testNNZ = s->testNNZ;
+ t->spTrain.nnz = s->trainNNZ;
+ t->spTest.nnz = s->testNNZ;
+
+ fprintf( stderr, "NNZ: %d, %d \n", s->trainNNZ, s->testNNZ );
+
+ //Train Set Here. 
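+ // rowPtr/colPtr/valPtr below hold the uploaded COO triplets; rowCsrPtr
+ // (rows + 1 entries) plus the P / sortedVals buffers are scratch for the
+ // COO -> CSR conversion (convertToCSR in the driver) that the
+ // cusparseDcsrmv calls elsewhere rely on.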
+ cuda_malloc( (void **) &t->spTrain.rowPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainRowPtr, t->spTrain.rowPtr, s->trainNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+ cuda_malloc( (void **) &t->spTrain.colPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainColPtr, t->spTrain.colPtr, s->trainNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+ cuda_malloc( (void **) &t->spTrain.valPtr, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainValPtr, t->spTrain.valPtr, s->trainNNZ * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+ cuda_malloc( (void **) &t->trainLabels, t->rows * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainLabels, t->trainLabels, t->rows* sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINLABELS );
+
+ cuda_malloc( (void **) &t->spTrain.rowCsrPtr, (t->rows + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //allocate the csc format matrix space here.
+ //cuda_malloc( (void **) &t->spTrain.cscRowPtr, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ //cuda_malloc( (void **) &t->spTrain.cscColPtr, (t->cols + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+ //cuda_malloc( (void **) &t->spTrain.cscValPtr, s->trainNNZ * sizeof(double), 1, ERROR_MEM_ALLOC );
+
+ //allocate the data for sorted coo format here.
+ cuda_malloc( (void **) &t->spTrain.P, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spTrain.sortedVals, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ fprintf( stderr, "Done with copying the training set .... \n");
+
+ //TestSet Here.
+ cuda_malloc( (void **) &t->spTest.rowPtr, s->testNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testRowPtr, t->spTest.rowPtr, s->testNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+ cuda_malloc( (void **) &t->spTest.colPtr, s->testNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testColPtr, t->spTest.colPtr, s->testNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+ cuda_malloc( (void **) &t->spTest.valPtr, s->testNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testValPtr, t->spTest.valPtr, s->testNNZ * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+ cuda_malloc( (void **) &t->testLabels, t->testSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testLabels, t->testLabels, t->testSize * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTLABELS );
+
+ cuda_malloc( (void **) &t->spTest.rowCsrPtr, (t->testSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //allocate the data for sorted coo format here.
+ cuda_malloc( (void **) &t->spTest.P, s->testNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spTest.sortedVals, s->testNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ fprintf( stderr, "Done with copying the test set .... \n");
+
+ //Weights Here.
+ if (t->numclasses > 1)
+ cuda_malloc( (void **)&t->weights, t->numclasses * t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+ else
+ cuda_malloc( (void **)&t->weights, t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+
+ //sparse sample matrices here.
+ //sub sampling here. 
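+ // The sampled-matrix buffers below are allocated once at the full training
+ // nnz / (rows + 1) sizes -- a worst-case bound -- so a fresh subsample can
+ // be drawn repeatedly without any further allocation.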
+ //Hessian part here
+ t->sampledGradientTrainSet = NULL;
+ t->sampledHessianTrainSet = NULL;
+ t->hessianSampleSize = (SAMPLING_BUFFER_EXTENSION * HESSIAN_SAMPLING_SIZE * t->rows) / 100;
+ t->spHessianSample.nnz = t->hessianSampleSize;
+
+ //spHessianSample
+ cuda_malloc( (void **) &t->spHessianSample.P, t->hessianSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.sortedVals, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.colPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.valPtr, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowCsrPtr, (t->hessianSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //Gradient Sample Here.
+ t->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * t->rows ) / 100;
+ t->spGradientSample.nnz = t->gradientSampleSize;
+ cuda_malloc( (void **)&t->sampledGradientTrainLabels, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ //spGradientSample
+ cuda_malloc( (void **) &t->spGradientSample.P, t->gradientSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.sortedVals, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.colPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.valPtr, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowCsrPtr, (t->gradientSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ cuda_malloc( (void **) &t->spSampledGradientTrain.rowPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.colPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.valPtr, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.rowCsrPtr, (t->rows+ 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.P, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.sortedVals, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ cuda_malloc( (void **) &t->spSampledHessianTrain.rowPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.colPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.valPtr, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.rowCsrPtr, (t->rows+ 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.P, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.sortedVals, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ //Debug print statements here. 
+ fprintf (stderr, " -------------- \n"); + fprintf( stderr, "Train Set size: %d %d, %d \n", t->rows, t->cols, t->testSize ); + fprintf (stderr, " -------------- \n"); +} + +void cleanup_dataset( ForestDataset *s, DeviceDataset *t){ + if (s->trainSet) release_memory( (void **)&s->trainSet ); + if (s->trainLabels ) release_memory( (void **)&s->trainLabels ); + if (s->testSet ) release_memory( (void **)&s->testSet ); + if (s->testLabels ) release_memory( (void **)&s->testLabels ); + + if (t->trainSet) cuda_free ( t->trainSet, ERROR_MEM_CLEANUP ); + if (t->trainLabels ) cuda_free ( t->trainLabels, ERROR_MEM_CLEANUP ); + if (t->testSet) cuda_free ( t->testSet, ERROR_MEM_CLEANUP ); + if (t->testLabels) cuda_free ( t->testLabels, ERROR_MEM_CLEANUP ); + + //sparse functions here. + if (t->spTrain.rowPtr || t->spTrain.colPtr || t->spTrain.valPtr) + cusparseDestroyMatDescr( t->spTrain.descr ); + if (t->spTrain.rowPtr) cuda_free( t->spTrain.rowPtr, ERROR_MEM_CLEANUP ); + if (t->spTrain.colPtr) cuda_free( t->spTrain.colPtr, ERROR_MEM_CLEANUP ); + if (t->spTrain.valPtr) cuda_free( t->spTrain.valPtr, ERROR_MEM_CLEANUP ); + if (t->spTrain.rowCsrPtr) cuda_free( t->spTrain.rowCsrPtr, ERROR_MEM_CLEANUP ); + + if (t->spTest.rowPtr || t->spTest.colPtr || t->spTest.valPtr) + cusparseDestroyMatDescr( t->spTest.descr ); + if (t->spTest.rowPtr) cuda_free( t->spTest.rowPtr, ERROR_MEM_CLEANUP ); + if (t->spTest.colPtr) cuda_free( t->spTest.colPtr, ERROR_MEM_CLEANUP ); + if (t->spTest.valPtr) cuda_free( t->spTest.valPtr, ERROR_MEM_CLEANUP ); + if (t->spTest.rowCsrPtr) cuda_free( t->spTest.rowCsrPtr, ERROR_MEM_CLEANUP ); +} diff --git a/code/cuda/RC-FINAL-5/dataset.h b/code/cuda/RC-FINAL-5/dataset.h new file mode 100644 index 0000000..d22193d --- /dev/null +++ b/code/cuda/RC-FINAL-5/dataset.h @@ -0,0 +1,106 @@ +#ifndef _H_DATASET__ +#define _H_DATASET__ + +#include + +typedef struct dataset{ + real *trainSet; + real *trainLabels; + real *testSet; + real *testLabels; + int trainSize; + int testSize; + + int rows; + int cols; + int numclasses; + + int *trainRowPtr, *trainColPtr, *testRowPtr, *testColPtr; + real *trainValPtr, *testValPtr; + int trainNNZ, testNNZ; +} ForestDataset; + +typedef struct spData { + int *rowPtr, *colPtr, *rowCsrPtr; + real *valPtr; + + int nnz; + + //int *cscRowPtr, *cscColPtr; + //real *cscValPtr; + + real *sortedVals; + int *P; + + cusparseMatDescr_t descr; + +} SparseDataset; + +typedef struct devDataSet{ + real *trainSet; + real *trainLabels; + real *testSet; + real *testLabels; + + real *weights; + int rows; + int cols; + + int testSize; + + int numclasses; + + SparseDataset spTrain; + SparseDataset spTest; + + //subsampling part here. 
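+ // spGradientSample / spHessianSample track the selection itself
+ // (permutation and per-sample scratch, sized by the sample size), while
+ // spSampledGradientTrain / spSampledHessianTrain hold the extracted
+ // submatrix in COO/CSR form (sized by the full training nnz).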
+ real *sampledGradientTrainSet; + real *sampledGradientTrainLabels; + int gradientSampleSize; + SparseDataset spGradientSample; + + real *sampledHessianTrainSet; + int hessianSampleSize; + SparseDataset spHessianSample; + + SparseDataset spSampledGradientTrain; + SparseDataset spSampledHessianTrain; + +}DeviceDataset; + +typedef struct params{ + real *sigma; + real *mu; +}GAUSSIAN_PARAMS; + +void printDataset( ForestDataset *t ); + + +void readMultiDataset( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, ForestDataset *data, SCRATCH_AREA *s, int offset, int bias); +void readCIFARDataset( char *dir, char *train, char *test, ForestDataset *data, SCRATCH_AREA *s, int); +void readNewsgroupsDataset( char *train_features, char *train_labels, + char *test_features, char *test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset); +void readBinaryMatFile( char *train_features, char *train_labels, + char *test_features, char *test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset); + +//int tokenize_learn_multiclass( char *line, real* t, int curIndex, int *counters, int **idx); +int tokenize_string( char *line, real *out, int bias ); +void tokenize_populate(char *line, real *train_set, int *count, int size, int bias); + +int tokenize_binary_string( char *line, int bias, int *nnz); +int tokenize_binary_populate( char *line, int bias, int *row, int *col, real *val, real *label, int rowNum ); + + +void initialize_device_data( ForestDataset *s, DeviceDataset *t); +void initialize_device_data_sparse( ForestDataset *s, DeviceDataset *t ); +void cleanup_dataset( ForestDataset *s, DeviceDataset *t); + +real findMaxInDataset( real *src, int rows, int cols ); +void preprocessDataset( real *src, int rows, int cols, real maxval); + + + +#endif diff --git a/code/cuda/RC-FINAL-5/gen_random.cu b/code/cuda/RC-FINAL-5/gen_random.cu new file mode 100644 index 0000000..6a3a6bd --- /dev/null +++ b/code/cuda/RC-FINAL-5/gen_random.cu @@ -0,0 +1,75 @@ +#include "gen_random.h" + +#include "cuda_types.h" +#include "cuda_utils.h" + +#include "time.h" + +void getRandomVector (int n, real *hostPtr, real *devPtr) { + + curandGenerator_t gen ; + int m = n + n % 2; + + /* Create pseudo - random number generator */ + curandCheckError ( curandCreateGenerator (&gen , CURAND_RNG_PSEUDO_DEFAULT ) ); + + /* Set seed */ + //curandCheckError ( curandSetPseudoRandomGeneratorSeed ( gen , 1234ULL )) ; + curandCheckError ( curandSetPseudoRandomGeneratorSeed ( gen , time(NULL) )) ; + + /* Generate n floats on device */ + //curandCheckError ( curandGenerateNormalDouble ( gen , devPtr , m, 0, 1.)) ; + curandCheckError ( curandGenerateUniformDouble ( gen , devPtr , m)) ; + + /* Copy device memory to host */ + //copy_host_device( hostPtr, devPtr, sizeof(real) * n, cudaMemcpyDeviceToHost, + // ERROR_MEMCPY_DEVICE_HOST ); + /* Cleanup */ + curandCheckError ( curandDestroyGenerator ( gen ) ); +} + +/* +Random Shuffle Here. +https://stackoverflow.com/questions/15961119/how-to-create-a-random-permutation-of-an-array +*/ +void randomShuffle( int *idx, int n) +{ + int j, temp; + for (int i = n - 1; i >= 0; i --){ + j = rand () % (i+1); + + temp = idx[i]; + idx[i] = idx[j]; + idx[j] = temp; + } +} + + +/* +Floyd's algorithm Here. 
+https://stackoverflow.com/questions/1608181/unique-random-numbers-in-an-integer-array-in-the-c-programming-language +*/ + +void genRandomVector( int *idx, int m, int n ) { + + int in, im; + int rn, rm; + im = 0; + + for (in = 0; in < n && im < m; ++in ){ + rn = n - in; + rm = m - im; + + if (rand () % rn < rm ){ + idx[ im ++] = in + 1; + } + } + + if ( im != m ){ + fprintf( stderr, "Failed to generate required number of random numbers ... "); + exit (-1); + } + + randomShuffle( idx, m ); +} + diff --git a/code/cuda/RC-FINAL-5/gen_random.h b/code/cuda/RC-FINAL-5/gen_random.h new file mode 100644 index 0000000..6182652 --- /dev/null +++ b/code/cuda/RC-FINAL-5/gen_random.h @@ -0,0 +1,12 @@ +#ifndef __H_GEN_RANDOM__ +#define __H_GEN_RANDOM__ + +#include "cuda_types.h" + +void getRandomVector (int n, real *hostPtr, real *devPtr); + +void randomShuffle( int *idx, int m ); +void genRandomVector( int *idx, int m, int n ); + + +#endif diff --git a/code/cuda/RC-FINAL-5/linesearch.c b/code/cuda/RC-FINAL-5/linesearch.c new file mode 100644 index 0000000..a5e7835 --- /dev/null +++ b/code/cuda/RC-FINAL-5/linesearch.c @@ -0,0 +1,69 @@ +#include "linesearch.h" +#include "logistic_fn_indicator.h" +#include "cuda_utils.h" +#include "print_utils.h" + +#include "softmax_multiclass.h" + +real cg_linesearch (real *d, real *weights, real rho, real c, SparseDataset *spfeatures, real *features, real *target, + real lambda, int rows, int cols, int numclasses, real *gk, real *xx, real *devPtr, real *hostPtr, real *pageLocked) +{ + real alphak = 1.; + real temp; + real *fk = &pageLocked[0]; + real *fk1 = &pageLocked[1]; + real *nextPagePtr = pageLocked + 2; + + real *x = devPtr; + real *nextDevPtr = x + numclasses * cols; + int iterations = 0; + + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, weights, 1, x, 1) ); + + /* + if (numclasses == 1) + logistic_fn_indicator( features, spfeatures, target, x, lambda, rows, cols, fk, nextDevPtr, hostPtr); + else + */ + *fk = softmax_multiclass_fx (spfeatures, features, target, rows, cols, numclasses, x, + lambda, nextDevPtr, hostPtr, nextPagePtr); +//fprintf (stderr, "%e, %d, %d, %d\n", *fk, rows, cols, numclasses ); + + //xx = x; + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, x, 1, xx, 1) ); + + //x = x + alphak*d + cublasCheckError( cublasDaxpy( cublasHandle, numclasses * cols, &alphak, d, 1, x, 1) ); + + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, d, 1, &temp )) ; + + /* + if (numclasses == 1) + logistic_fn_indicator( features, spfeatures, target, x, lambda, rows, cols, fk1, nextDevPtr, hostPtr); + else + */ + *fk1 = softmax_multiclass_fx (spfeatures, features, target, rows, cols, numclasses, x, + lambda, nextDevPtr, hostPtr, nextPagePtr); +//fprintf (stderr, "%e, %d, %d, %d\n", *fk1, rows, cols, numclasses ); + + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, gk, 1, d, 1, &temp) ); + while (((*fk1) > ((*fk) + c * alphak * temp)) && (iterations < 50)){ + alphak *= rho; + + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, xx, 1, x, 1) ); + cublasCheckError( cublasDaxpy( cublasHandle, numclasses * cols, &alphak, d, 1, x, 1) ); + + /* + if (numclasses == 1) + logistic_fn_indicator( features, spfeatures, target, x, lambda, rows, cols, fk1, nextDevPtr, hostPtr); + else + */ + *fk1 = softmax_multiclass_fx (spfeatures, features, target, rows, cols, numclasses, x, + lambda, nextDevPtr, hostPtr, nextPagePtr); + + iterations ++; +//fprintf (stderr, "%e, %d, %d, %d\n", *fk1, rows, cols, numclasses 
);
+ }
+ //fprintf( stderr, "..... line search iterations.... %d ( %2.6e, %2.6e) \n", iterations, *fk1, (*fk + c * alphak * temp) );
+ return alphak;
+}
diff --git a/code/cuda/RC-FINAL-5/linesearch.h b/code/cuda/RC-FINAL-5/linesearch.h
new file mode 100644
index 0000000..7c387cf
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/linesearch.h
@@ -0,0 +1,10 @@
+#ifndef __H_LINESEARCH__
+#define __H_LINESEARCH__
+
+#include
+#include
+
+real cg_linesearch (real *, real *, real , real , SparseDataset *, real *, real *,
+ real , int , int , int, real *, real *, real *, real *, real * );
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/logistic-driver.c b/code/cuda/RC-FINAL-5/logistic-driver.c
new file mode 100644
index 0000000..2490851
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/logistic-driver.c
@@ -0,0 +1,281 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dataset.h"
+#include "sparse_dataset.h"
+
+#include "cuda_environment.h"
+#include "newton_cg.h"
+#include "utils.h"
+#include "cuda_utils.h"
+#include "logistic_fn_indicator.h"
+
+#include "softmax_multiclass.h"
+
+cublasHandle_t cublasHandle;
+cusparseHandle_t cusparseHandle;
+int BLOCKS, BLOCK_SIZE, BLOCKS_POW_2;
+void *dscratch;
+
+int main(int argc, char **argv){
+
+ // Data variables.
+ ForestDataset forestData;
+ DeviceDataset devData;
+ SCRATCH_AREA scratch;
+ NEWTON_CG_PARAMS params;
+
+ real trainingTime_s, classificationTime_s;
+ real trainingTime_t, classificationTime_t;
+ int test_case_no = 1;
+ int nConIterations;
+
+ int DATASET_TYPE = 1;
+ double l = 1e-6;
+ int max_cg_iterations = -1;
+ double cg_tolerance = 0;
+
+ if (argc <= 4) {
+ fprintf( stderr, "usage: <dataset-id> <lambda> <max-cg-iterations> <cg-tolerance>\n");
+ exit (-1);
+ }
+
+ DATASET_TYPE = atoi( argv[1] );
+ l = atof ( argv[2] );
+ max_cg_iterations = atoi (argv[3] );
+ cg_tolerance = atof( argv[4] );
+
+ // Create the CUDA Environment Here.
+ // Memory and device settings here.
+ cuda_env_init (&scratch);
+ #ifdef __debug__
+ fprintf( stderr, "Scratch Area initialized ... 
\n"); + #endif + + + switch( DATASET_TYPE ) { + + case 1: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_labels.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 11: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_labels.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 2: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 12: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 3: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 13: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 4: + readCIFARDataset( + "/mnt/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 1 ); + break; + + case 14: + readCIFARDataset( + "/mnt/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 0 ); + break; + + case 5: + readNewsgroupsDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/test_vec.txt", + &forestData, &scratch ); + break; + + case 15: + readNewsgroupsDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_mat.txt", + 
"/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_vec.txt", + &forestData, &scratch ); + break; + + //Logistic Datasets Here + case 6: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/test_vec.txt", + &forestData, &scratch, 1, 1 ); + break; + + case 7: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_vec.txt", + &forestData, &scratch, 1, 1 ); + break; + case 8: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/test_vec.txt", + &forestData, &scratch, 1, 1 ); + break; + + //Sparse Logistic Datasets Here + case 9: + readNewsgroupsDataset( + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/test_vec.txt", + &forestData, &scratch ); + break; + case 10: + readNewsgroupsDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/test_vec.txt", + &forestData, &scratch ); + break; + + } + #ifdef __debug__ + fprintf( stderr, "Done with initialization of the dataset .... \n"); + fprintf( stderr, "Blocks for %d data points... \n", forestData.rows); + #endif + + compute_blocks (&BLOCKS, &BLOCK_SIZE, forestData.trainSize); + compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); + if (BLOCKS_POW_2 < 32) BLOCKS_POW_2 = 32; + #ifdef __debug__ + fprintf ( stderr, "Blocks: %d, BlockSize: %d, Power_2: %d\n", BLOCKS, BLOCK_SIZE, BLOCKS_POW_2); + #endif + + + // Move the data to the Device. + if ((forestData.trainSet == NULL) && (forestData.testSet == NULL)) + { + initialize_device_data_sparse( &forestData, &devData ); + initMatDescriptors ( &devData ); + convertToCSR ( &devData, scratch.devWorkspace ); + + initMatDescriptorsForSampling( &devData ); + initMatDescriptorsForSparseSampling( &devData ); + } else { + initialize_device_data( &forestData, &devData ); + initMatDescriptorsForSampling( &devData ); + } + + #ifdef __debug__ + fprintf( stderr, "Inittialized the Device with the dataset ... \n"); + #endif + + //Train the dataset here. + params.max_iterations = 10; + params.tolerance = 1e-5; + params.iflag = 0; + + params.lambda = l; + params.max_cg_iterations = max_cg_iterations; + params.cg_tolerance = cg_tolerance; + + params.gx_sampling = 0; + params.hx_sampling = 0; + + fprintf( stderr, "Start of TestCase: %d\n", test_case_no); + trainingTime_s = Get_Time (); + nConIterations = newton_cg_multi_optimized( &forestData, &devData, ¶ms, &scratch, params.gx_sampling); + trainingTime_t = Get_Timing_Info( trainingTime_s ); + #ifdef __debug__ + fprintf( stderr, "Done with training .... 
\n"); + #endif + + //exit (-1); + + //Predict the testing set here. + real accuracy = 0; + classificationTime_s = Get_Time (); + accuracy = softmax_predict(&devData.spTest, devData.testSet, forestData.testLabels, devData.weights, devData.testSize, + devData.cols, devData.numclasses, scratch.hostWorkspace, scratch.devWorkspace, + 1, forestData.testSet); + classificationTime_t = Get_Timing_Info( classificationTime_s ); + //fprintf( stderr, "Start of TestCase: %d\n", test_case_no); + fprintf( stderr, "Dataset: %d \n", DATASET_TYPE ); + //fprintf( stderr, "Column Selected : %d\n", col ); + fprintf( stderr, "NumClasses: %d\n", devData.numclasses ); + fprintf( stderr, "Lambda: %e\n", params.lambda ); + fprintf( stderr, "NewtonIterations: %d\n", params.max_iterations); + fprintf( stderr, "NewtonTolerance: %e\n", params.tolerance); + fprintf( stderr, "CGIterations: %d\n", params.max_cg_iterations); + fprintf( stderr, "CGTolerance: %e\n", params.cg_tolerance ); + fprintf( stderr, "DataSetSize: %d\n", forestData.rows ); + //fprintf( stderr, "TrainingPer: %3.2f\n", d * 100.); + fprintf( stderr, "TrainingSize: %d\n", forestData.trainSize); + fprintf( stderr, "Features: %d\n", forestData.cols ); + fprintf( stderr, "TrainingTime: %d\n", (unsigned int)(trainingTime_t * 1000) ); + fprintf( stderr, "TestingSize: %d\n", forestData.testSize ); + fprintf( stderr, "ClassificationTime: %d\n", (unsigned int)(classificationTime_t*1000) ); + fprintf( stderr, "TestAccuracy: %3.2f\n", accuracy ); + fprintf( stderr, "NewtonIterationsCon: %d\n", nConIterations ); + fprintf( stderr, "NewtonConvergence: %d\n", (int)params.iflag ); + fprintf( stderr, "End of TestCase: %d\n", test_case_no); + fprintf( stderr, "\n\n\n"); + + //cleanup the dataset pointers here. + cleanup_dataset(&forestData, &devData ); + + test_case_no ++; + + //Cleanup host/device Here. 
+ cuda_env_cleanup(&scratch);
+
+ return 0;
+}
diff --git a/code/cuda/RC-FINAL-5/logistic_fn_indicator.cu b/code/cuda/RC-FINAL-5/logistic_fn_indicator.cu
new file mode 100644
index 0000000..332e02e
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/logistic_fn_indicator.cu
@@ -0,0 +1,660 @@
+#include "logistic_fn_indicator.h"
+#include "cuda_utils.h"
+
+#include "mat_functions.h"
+
+#include "gen_random.h"
+#include "print_utils.h"
+
+#include "classification_kernels.h"
+
+void logistic_fn_indicator (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *fn, real *devPtr, real *hostPtr)
+{
+ //host
+ real *alpha = hostPtr;
+ real *beta = alpha + 1;
+ real *nrm_weights = beta + 1;
+
+ //device
+ real *t = devPtr;
+ real *out = t + rows;
+ real *redResult = out + rows;
+
+ //features * weights
+ *alpha = 1;
+ *beta = 0;
+
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ alpha, features, rows,
+ weights, 1,
+ beta, t, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, beta, t ) );
+ }
+
+/*
+ fprintf( stderr, "printing t " );
+ printVector( t + rows - 1, 1, NULL);
+ fprintf( stderr, "printing target " );
+ printVector( target + rows - 1, 1, NULL );
+ fprintf( stderr, "printing out " );
+ printVector( out + rows - 1,1, NULL );
+*/
+
+ ker_log_sum <<< BLOCKS, BLOCK_SIZE >>> ( t, target, rows, out);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ ker_reduction <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * sizeof(real) >>> (out, redResult, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ ker_reduction <<< 1, BLOCKS_POW_2, BLOCKS_POW_2 * sizeof(real) >>> (redResult, fn, BLOCKS);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //add the regularization term here.
+ cublasCheckError( cublasDnrm2( cublasHandle, cols, weights, 1, nrm_weights) );
+
+ //since we are minimizing this function here.
+ (*fn) += pow(*nrm_weights, 2.) * (lambda/2.0);
+}
+
+// sigma_i( x_ij * ( y_i - g(z_i) ) )
+// g(z_i) = sigmoid( x_ij * w_i )
+
+void logistic_fn_indicator_gx (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *gn, real *devPtr, real *hostPtr, int samplingType, int numFeatures)
+{
+ //device
+ real *t = devPtr;
+
+ //host
+ real *alpha = hostPtr;
+ real *beta = alpha + 1;
+
+ //blocks
+ int numBlocks = BLOCKS;
+ if (samplingType != 0)
+ numBlocks = rows / BLOCK_SIZE + ((rows % BLOCK_SIZE) == 0 ? 0 : 1);
+
+ *alpha = 1;
+ *beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ alpha, features, rows,
+ weights, 1,
+ beta, t, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, beta, t ) );
+ }
+
+ ker_sigmoid_target <<< numBlocks, BLOCK_SIZE >>> (t, target, rows, t);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ *alpha = 1;
+ *beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_T, rows, cols,
+ alpha, features, rows,
+ t, 1,
+ beta, gn, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ t, beta, gn ) );
+ }
+
+ // sampling scaling: uniform (samplingType == 1) and non uniform
+ // (samplingType == 2) apply the same factor to the sampled gradient.
+ *alpha = ((real)numFeatures)/((real)rows);
+ if ((samplingType == 1) || (samplingType == 2)) {
+ cublasCheckError( cublasDscal( cublasHandle, cols, alpha, gn, 1) );
+ }
+
+ //regularization here.
+ *alpha = lambda;
+ cublasCheckError( cublasDaxpy( cublasHandle, cols, alpha, weights, 1, gn, 1 ) );
+}
+
+GLOBAL void ker_hx_C_scale (real *A, real *B, real *C, int rows, real *scale )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < rows){
+ C[ idx ] = (1. / scale[ idx ]) * ( A[ idx ] * B[ idx ] - B[ idx ] * ( A[ idx ] * B[ idx ] ) );
+ }
+}
+
+GLOBAL void ker_hx_C (real *A, real *B, real *C, int rows )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < rows){
+ C[ idx ] = A[ idx ] * B[ idx ] - B[ idx ] * ( A[ idx ] * B[ idx ] );
+ }
+}
+
+void logistic_fn_indicator_hx_matvec (real *features, SparseDataset *spFeatures, real *weights, real *vector,
+ real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr, int samplingType, real *scaleTerms, int numFeatures)
+{
+ real *A = devPtr;
+ real *B = A + rows;
+ real *C = B + rows;
+
+ real alpha, beta;
+
+ //blocks
+ int numBlocks = BLOCKS;
+ if (samplingType != 0)
+ numBlocks = rows / BLOCK_SIZE + ((rows % BLOCK_SIZE) == 0 ? 0 : 1);
+
+ //compute A = matrix * vector
+ alpha = 1;
+ beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ vector, 1,
+ &beta, A, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ vector, &beta, A ) );
+ }
+
+ //compute B = probability vector here: matrix * weights
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ weights, 1,
+ &beta, B, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, &beta, B ) );
+ }
+
+ ker_sigmoid <<< numBlocks, BLOCK_SIZE >>> (B, rows, B);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //Compute C = A.B - B.(A.B)
+ if (samplingType == 2) {
+ ker_hx_C_scale <<< numBlocks, BLOCK_SIZE >>> (A, B, C, rows, scaleTerms);
+ } else {
+ ker_hx_C<<< numBlocks, BLOCK_SIZE >>> (A, B, C, rows);
+ }
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //compute X^T * C = matvec
+ if (spFeatures->valPtr == NULL) {
+ alpha = 1.0;
+ beta = 0;
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_T, rows, cols,
+ &alpha, features, rows,
+ C, 1,
+ &beta, hx, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ C, &beta, hx ) );
+ }
+
+ //appropriate scaling
+ if (samplingType == 1){
+ alpha = ((real)numFeatures/ ((real) rows));
+ cublasCheckError ( cublasDscal( cublasHandle, cols, &alpha, hx, 1 ) );
+ }
+
+ //regularization here.
+ //this is a matrix operation.
+ int colBlockSize = BLOCK_SIZE;
+ int colBlocks = (cols % colBlockSize) == 0 ? (cols/colBlockSize) : (cols/colBlockSize + 1);
+ ker_hx_matvec_reg <<< colBlocks, colBlockSize >>> (hx, lambda, vector, cols);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+}
+
+
+void logistic_fn_indicator_hx (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr)
+{
+ //device
+ real *t = devPtr;
+ real *t_minus = t + rows;
+ real *C = t_minus + rows;
+
+ //host
+ real *alpha = hostPtr;
+ real *beta = alpha + 1;
+
+ *alpha = 1;
+ *beta = 0;
+ if ( spFeatures == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ alpha, features, rows,
+ weights, 1,
+ beta, t, 1 ) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, beta, t ) );
+ }
+ cublasCheckError( cublasDcopy( cublasHandle, rows, t, 1, t_minus, 1 ) );
+
+ //apply sigmoid here.
+ ker_sigmoid <<< BLOCKS, BLOCK_SIZE >>> (t, rows, t);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //fprintf( stderr, "Output from the sigmoid function \n");
+ //printVector( t, rows, NULL);
+
+ *alpha = -1;
+ cublasCheckError ( cublasDscal( cublasHandle, rows, alpha, t_minus, 1 ) );
+ ker_sigmoid <<< BLOCKS, BLOCK_SIZE >>> (t_minus, rows, t_minus);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //fprintf( stderr, "Output from the sigmoid function -t \n");
+ //printVector( t_minus, rows, NULL);
+
+ //element wise product of two vectors here.
+ ker_ele_vec_product <<< BLOCKS, BLOCK_SIZE >>>
+ ( t, t_minus, rows, t );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, "Output from the ele vector product\n");
+ //printVector( t, rows, NULL);
+
+ // perform the final mat * mat product here.
+ // perform diag(s * neg_s) * features.
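+ // In matrix form, the dense branch below builds the full logistic Hessian
+ // H = X^T * diag( s .* (1 - s) ) * X + lambda * I, with s = sigmoid(X * w);
+ // at this point t already holds s .* (1 - s) = sigmoid(Xw) .* sigmoid(-Xw).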
+ if (spFeatures == NULL ){
+ cublasCheckError (cublasDdgmm( cublasHandle, CUBLAS_SIDE_LEFT,
+ rows, cols, features, rows,
+ t, 1,
+ C, rows) );
+ //perform the first. product( features^T x above_result);
+ *alpha = 1;
+ *beta = 0;
+ cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
+ cols, cols, rows,
+ alpha, features, rows,
+ C, rows, beta, hx, cols ) );
+ } else {
+ //Not implemented here.
+ //since we are using matvec here for Hessian
+ ;
+ }
+
+ //regularization here.
+ //this is a matrix operation.
+ int colBlockSize = BLOCK_SIZE;
+ int colBlocks = ((cols % colBlockSize) == 0) ? (cols/colBlockSize) : (cols/colBlockSize + 1);
+ //fprintf ( stderr, "Regularization BLOCKS --> %d and BlockSize -- > %d \n", colBlocks, colBlockSize );
+ ker_mat_identity <<< colBlocks, colBlockSize >>>
+ (hx, lambda, cols);
+ //(hx, 2 * (lambda), cols);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+///////////////////////////////////
+//Non uniform subsampling code here.
+///////////////////////////////////
+
+int generateNonUniformSample_log( real *probs, real *scaleTerms, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr)
+{
+ int count = 0;
+ real *devIndices = devPtr + rows;
+
+ getRandomVector( rows, NULL, devPtr);
+
+ ker_compute_probs <<< BLOCKS, BLOCK_SIZE >>>
+ ( probs, rows, sampleSize, devPtr, devIndices );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ copy_host_device( hostPtr, devIndices, sizeof(real) * rows,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST);
+
+ for (int i = 0; i < rows; i ++){
+ if (hostPtr[i] != 0)
+ selIndices[ count ++] = i;
+ }
+
+//fprintf( stderr, "selected points for non uniform sampling is %d \n", count );
+
+ //prepare scaleTerms here.
+ cuda_memset( scaleTerms, 0, sizeof(real) * rows, 0x99 );
+ cuda_memset( devIndices, 0, sizeof(real) * rows, 0x99 );
+ copy_host_device( selIndices, devIndices, sizeof(int) * count,
+ cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+
+ int blocks = count / BLOCK_SIZE +
+ ((count % BLOCK_SIZE) == 0 ? 0 : 1 );
+ ker_init_scaleTerms <<< blocks, BLOCK_SIZE >>>
+ ( scaleTerms, count, probs, (int *)devIndices );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ return count;
+}
+
+void computeRowProbabilities_log( SparseDataset *spfeatures, real *features, int rows, int cols,
+ real *dHXW, real *rowNrms, real *probs, real *devPtr )
+{
+ ker_compute_dHXW_nrm_log <<< BLOCKS, BLOCK_SIZE >>>
+ ( dHXW, rowNrms, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //reduce to compute the sum
+ reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE * sizeof (real) >>>
+ (dHXW, devPtr, rows );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof (real) >>>
+ (devPtr, devPtr + BLOCK_SIZE, BLOCKS);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ ker_normalize <<< BLOCKS, BLOCK_SIZE >>>
+ (dHXW, rows, devPtr + BLOCK_SIZE, probs );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+
+void computeRowNorms_log( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr )
+{
+ if (features != NULL) {
+ ker_row_norms <<< BLOCKS, BLOCK_SIZE >>>
+ ( features, rows, cols, rowNrms );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ } else {
+ cudaMemcpy( spfeatures->valPtr, spfeatures->sortedVals,
+ sizeof(real) * spfeatures->nnz, cudaMemcpyDeviceToDevice );
+
+ int blocks = spfeatures->nnz / (BLOCK_SIZE) +
+ ((spfeatures->nnz % (BLOCK_SIZE)) == 0 ? 0 : 1 );
+ ker_sqr_elements <<< blocks, BLOCK_SIZE >>>
+ (spfeatures->valPtr, spfeatures->nnz, 1, devPtr);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //matvec here. for row sums
+ real alpha = 1.0;
+ real beta = 0;
+
+ //init the vector here.
+ blocks = cols / BLOCK_SIZE + (( cols % BLOCK_SIZE == 0) ? 0 : 1 );
+ ker_init_ones <<< blocks, BLOCK_SIZE >>>
+ ( devPtr , cols );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spfeatures->nnz,
+ &alpha, spfeatures->descr, spfeatures->valPtr, spfeatures->rowCsrPtr,
+ spfeatures->colPtr, devPtr, &beta, rowNrms)
+ );
+ ker_sqrt_elements <<< BLOCKS, BLOCK_SIZE >>>
+ ( rowNrms, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ }
+}
+
+void computeHXW_log (SparseDataset *spfeatures, real *features, int rows, int cols, real *weights, real *B) {
+ real alpha;
+ real beta;
+
+ alpha = 1.0;
+ beta = 0;
+
+ if (spfeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ weights, 1,
+ &beta, B, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spfeatures->nnz,
+ &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, spfeatures->colPtr,
+ weights, &beta, B ) );
+ }
+
+ ker_sigmoid <<< BLOCKS, BLOCK_SIZE >>> (B, rows, B);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+
+//
+//
+// PREDICTION HERE. For the Logistic Regression with Indicator random variable
+// as the class label
+//
+void logistic_regression_predict( real *features, SparseDataset *spFeatures, real *weights, real *labels, real *hostLabels, int rows, int cols, real *accuracy, real *devPtr, real *hostPtr )
+{
+ real alpha, beta;
+ real *sigmoid_predictions = devPtr;
+ real nrm;
+ int counter0, counter1;
+
+ alpha = 1;
+ beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ weights, 1,
+ &beta, sigmoid_predictions, 1 ) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, &beta, sigmoid_predictions ) );
+ }
+
+ //apply the sigmoid function here.
+ int tblocks;
+ if (rows <= BLOCK_SIZE)
+ tblocks = 1;
+ else
+ tblocks = (rows % BLOCK_SIZE) == 0 ? rows / BLOCK_SIZE : (rows/BLOCK_SIZE) + 1;
+
+ ker_sigmoid_classify <<< tblocks, BLOCK_SIZE >>> (sigmoid_predictions, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ copy_host_device( hostPtr, sigmoid_predictions, sizeof(real) * rows, cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST );
+
+ *accuracy = 0;
+ counter0 = counter1 = 0;
+ for (int i = 0; i < rows; i ++)
+ {
+ if (hostPtr[i] == (hostLabels[i] - 1.0)) (*accuracy) ++;
+ if (hostPtr[i] == 1.) counter1 ++;
+ if (hostPtr[i] == 0.) 
counter0 ++; + } + //fprintf( stderr, "0: %d, 1: %d \n", counter0, counter1 ); + + *accuracy = ((*accuracy) / rows) * 100.; +} + + + +//////////////////////////////////////////////////////// +//Derivative Test +//////////////////////////////////////////////////////// +/* +void getRandomVectorLogistic (int n, real *hostPtr, real *devPtr) { + + curandGenerator_t gen ; + int m = n + n % 2; + + curandCheckError ( curandCreateGenerator (&gen , CURAND_RNG_PSEUDO_DEFAULT ) ); + + curandCheckError ( curandSetPseudoRandomGeneratorSeed ( gen , 1234ULL )) ; + + curandCheckError ( curandGenerateNormalDouble ( gen , devPtr , m, 0, .25)) ; + //curandCheckError ( curandGenerateUniformDouble ( gen , devPtr , m)) ; + + copy_host_device( hostPtr, devPtr, sizeof(real) * m, cudaMemcpyDeviceToHost, + ERROR_MEMCPY_DEVICE_HOST ); + + curandCheckError ( curandDestroyGenerator ( gen ) ); +} +*/ + + +void logisticRegDerivativeTest ( real *features, real *target, int rows, int cols, + real *devPtr, real *hostPtr, real *pageLckPtr, int numpoints) +{ + int offset = cols % 4; + + real *constPoint = hostPtr; + real *hostPoint = constPoint + cols + offset; + real *dx = hostPoint + cols + offset; + real *ferror = dx + cols + offset; + real *herror = ferror + numpoints; + real *dxs = herror + numpoints; + real *nextHostPtr = dxs + numpoints; + + real *devPoint = devPtr; + real *devDx = devPoint + cols + offset; + real *gradient = devDx + cols + offset; + real *hessian = gradient + cols + offset; + real *nextDevPtr = hessian + cols * cols + offset; + + real *vv = pageLckPtr; + real *vhv = vv + 1; + real *dxnrm = vhv + 1; + real *f = dxnrm + 1; + real *f0 = f + 1; + real *nextPagePtr = f0 + 1; + + real alpha, beta; + + fprintf( stderr, "Number of random numbers to be generated: %d \n", cols ); + + memset( constPoint, 0, sizeof(real) * cols ); + for (int i = 0; i < cols; i ++) constPoint[i] = 0.; + + copy_host_device( constPoint, devPoint, sizeof(real) * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + //getRandomVectorLogistic( cols, dx, nextDevPtr); + getRandomVector( cols, dx, nextDevPtr); + //for (int i = 0; i < cols; i ++) dx[i] = 0; + + //printHostVector( dx, cols ); + + //f0 + //logistic_fn_indicator( features, target, devPoint, 0, rows, cols, f0, nextDevPtr, nextHostPtr); + + //g0 + //logistic_fn_indicator_gx( features, NULL, target, devPoint, 0, rows, cols, gradient, nextDevPtr, nextHostPtr); + //printVector( gradient, 5, NULL ); + + //h0 + //logistic_fn_indicator_hx( features, target, devPoint, 0, rows, cols, hessian, nextDevPtr, nextHostPtr ); + + fprintf( stderr, "Starting the derivative test .. %f\n", *f0); + + for (int i = 0; i < numpoints; i ++) { + + for (int j = 0; j < cols; j ++) hostPoint[j] = constPoint[j] + dx[j]; + + copy_host_device( hostPoint, devPoint, sizeof(real) * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_DEVICE_HOST); + copy_host_device( dx, devDx, sizeof(real) * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + //function evaluation here. 
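+ // Taylor-expansion check: with f evaluated at constPoint + dx, the
+ // first-order residual f - (f0 + g'dx) should decay like ||dx||^2 and the
+ // second-order residual f - (f0 + g'dx + 0.5 * dx'H dx) like ||dx||^3 as
+ // dx is halved each pass; the dxs_2.txt / dxs_3.txt files written below
+ // are the reference slopes.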
+ //logistic_fn_indicator( features, target, devPoint, 0, rows, cols, f, nextDevPtr, nextHostPtr);
+
+ //first order error
+ //printVector( gradient, 5, NULL );
+ //printVector( devPoint, 5, NULL );
+ //fprintf( stderr, "Gradient sum: %e \n", computeWeightSum( gradient, cols ));
+ cublasCheckError( cublasDdot( cublasHandle, cols, gradient, 1, devDx, 1, vv) );
+ ferror[i] = (*f - (*f0 + *vv)) / (real)rows;
+
+ //second order error
+ alpha = 1;
+ beta = 0;
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, cols, cols,
+ &alpha, hessian, cols,
+ devDx, 1,
+ &beta, nextDevPtr, 1) );
+ *vhv= 0;
+ cublasCheckError( cublasDdot( cublasHandle, cols, devDx, 1, nextDevPtr, 1, vhv) );
+
+ herror[i] = (*f - (*f0 + *vv + 0.5 * (*vhv) )) / (real) rows;
+
+ fprintf( stderr, "%d: f --> %e, vv --> %e, vhv--> %e, ferr: %e, herr: %e \n",
+ i, *f, *vv, *vhv, ferror[i], herror[i] );
+
+ //dxs here.
+ *dxnrm = 0;
+ cublasCheckError( cublasDnrm2( cublasHandle, cols, devDx, 1, dxnrm));
+ dxs[i] = *dxnrm;
+ //printVector( devDx, 10, NULL);
+ //fprintf( stderr, "DevDx norm is ----> %e, %e, %e \n", *dxnrm, pow( *dxnrm, 2.), pow(*dxnrm, 3.) );
+
+ for (int j = 0; j < cols; j ++) dx[j] = dx[j] / 2.0;
+ //break;
+ }
+
+ writeVector( ferror, numpoints, "./ferror.txt", 1 ); //host
+ writeVector( herror, numpoints, "./herror.txt", 1 ); //host
+
+ //write dx.^2 here
+ for (int j = 0; j < numpoints; j ++) constPoint[j] = pow(dxs[j], 2.);
+ writeVector( constPoint, numpoints, "./dxs_2.txt", 1 ); //host
+
+ //write dx.^3 here
+ for (int j = 0; j < numpoints; j ++) constPoint[j] = pow(dxs[j], 3.);
+ writeVector( constPoint, numpoints, "./dxs_3.txt", 1 ); //host
+}
+
diff --git a/code/cuda/RC-FINAL-5/logistic_fn_indicator.h b/code/cuda/RC-FINAL-5/logistic_fn_indicator.h
new file mode 100644
index 0000000..e6f9663
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/logistic_fn_indicator.h
@@ -0,0 +1,28 @@
+
+#ifndef __H_LOGISTIC_FN_INDICATOR__
+#define __H_LOGISTIC_FN_INDICATOR__
+
+#include "cuda_types.h"
+#include "dataset.h"
+
+void logistic_fn_indicator (real *features, SparseDataset *spfeatures, real *target, real *weights, real lambda, int rows, int cols, real *fx, real *devPtr, real *hostPtr);
+void logistic_fn_indicator_gx (real *features, SparseDataset *spfeatures, real *target, real *weights, real lambda, int rows, int cols, real *gx, real *devPtr, real *hostPtr, int samplingType, int numFeatures);
+void logistic_fn_indicator_hx_matvec (real *features, SparseDataset *spFeatures, real *weights, real *vector,
+ real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr, int type, real *scale, int numFeatures);
+void logistic_fn_indicator_hx (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr);
+void logistic_regression_predict( real *, SparseDataset *, real *, real *, real *, int , int , real *, real *, real *);
+
+void logisticRegDerivativeTest ( real *features, real *target, int rows, int cols,
+ real *devPtr, real *hostPtr, real *pageLckPtr, int numpoints);
+
+
+//Non uniform functions
+int generateNonUniformSample_log( real *probs, real *scaleTerms, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr);
+void computeRowProbabilities_log( SparseDataset *spfeatures, real *features, int rows, int cols,
+ real *dHXW, real *rowNrms, real *probs, real *devPtr );
+void computeRowNorms_log( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr );
+void computeHXW_log (SparseDataset *spfeatures, 
real *features, int rows, int cols, real *weights, real *B ); + + + +#endif diff --git a/code/cuda/RC-FINAL-5/mat_functions.cu b/code/cuda/RC-FINAL-5/mat_functions.cu new file mode 100644 index 0000000..99e939a --- /dev/null +++ b/code/cuda/RC-FINAL-5/mat_functions.cu @@ -0,0 +1,126 @@ +#include "mat_functions.h" +#include "cuda.h" +#include "cuda_runtime.h" + +GLOBAL void ker_log_sum( real *t, real *target, int N, real *out) +{ + //extern __shared__ real sdata[]; + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if (idx < N) { + x = t[ idx ]; + if (x <= 0) + out[ idx ] = log( 1. + exp(x) ) - ((target[idx] - 1.) * t[ idx ]); + else + out[ idx ] = ( x + log( exp(-x) + 1.) ) - ((target[idx] - 1.) * t[ idx] ); + } +} + +GLOBAL void ker_sigmoid( real *s, int N, real *out) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + real alpha = 0; + + if (idx < N) { + x = s[ idx ]; + if ( x < 0 ) + out[ idx ] = exp( x ) / (1. + exp(x) ); + else + out[ idx ] = 1. / (1. + exp(-x) ); + } +} + +GLOBAL void ker_sigmoid_classify( real *s, int N ) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < N) { + if (s[ idx ] <= 0 ){ + if (exp(s[idx])/ ( (1. + exp(s[idx]) )) > 0.5) + s[idx] = 1.; + else + s[idx] = 0.; + } else { + if (1. / (1. + exp(-s[idx]) ) > 0.5) + s[idx] = 1.; + else + s[idx] = 0.; + } + } +} + +GLOBAL void ker_sigmoid_target( real *t, real *target, int N, real *out) +{ + real x = 0; + real alpha = 0; + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < N) { + x = t[ idx ]; + if (x < 0 ) + out[idx] = ( exp(x)/ ( 1. + exp(x) )) - (target[ idx ] - 1.); + else + out[idx] = ( 1./ ( 1. + exp(-x) )) - (target[ idx ] - 1.); + } +} + +GLOBAL void ker_ele_vec_product( real *t1, real *t2, int N, real *out) +{ + //extern __shared__ real sdata[]; + //real x = 0; + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < N) out[ idx ] = t1[ idx ] * t2[ idx ]; + //sdata[ threadIdx.x ] = x; + //if (idx < N) out[idx] = sdata[threadIdx.x] ; +} + +GLOBAL void ker_mat_identity( real *matrix, real gamma, int M) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < M) + matrix[ idx * M + idx ] += gamma; +} + +GLOBAL void ker_hx_matvec_reg ( real *hx, real gamma, real *vec, int c) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < c) { + hx[ idx ]+= gamma * vec[ idx ]; + } +} + + +GLOBAL void ker_reduction(const real *input, real *per_block_results, int n) +{ + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = input[i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } +} + diff --git a/code/cuda/RC-FINAL-5/mat_functions.h b/code/cuda/RC-FINAL-5/mat_functions.h new file mode 100644 index 0000000..9956166 --- /dev/null +++ b/code/cuda/RC-FINAL-5/mat_functions.h @@ -0,0 +1,15 @@ +#ifndef __H_MAT_FUNCTIONS__ +#define __H_MAT_FUNCTIONS__ + +#include "cuda_types.h" + +GLOBAL void ker_log_sum( real *t, real *target, int N, real *out); +GLOBAL void ker_sigmoid( real *target, int N, real *out); +GLOBAL void ker_sigmoid_classify( real *target, int N ); +GLOBAL void ker_sigmoid_target( real *t, real *target, int N, real *out); +GLOBAL 
void ker_ele_vec_product( real *t1, real *t2, int N, real *out);
+GLOBAL void ker_mat_identity (real *h, real reg_term, int M);
+GLOBAL void ker_hx_matvec_reg ( real *hx, real gamma, real *vec, int c);
+GLOBAL void ker_reduction(const real *h, real *out, int dim);
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/newton-driver.c b/code/cuda/RC-FINAL-5/newton-driver.c
new file mode 100644
index 0000000..6115730
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/newton-driver.c
@@ -0,0 +1,345 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dataset.h"
+#include "sparse_dataset.h"
+
+#include "cuda_environment.h"
+#include "newton_cg.h"
+#include "utils.h"
+#include "cuda_utils.h"
+#include "logistic_fn_indicator.h"
+
+#include "softmax_multiclass.h"
+
+cublasHandle_t cublasHandle;
+cusparseHandle_t cusparseHandle;
+int BLOCKS, BLOCK_SIZE, BLOCKS_POW_2;
+int HESSIAN_SAMPLING_SIZE, GRADIENT_SAMPLING_SIZE;
+void *dscratch;
+
+int main(int argc, char **argv){
+
+ // Data variables.
+ ForestDataset forestData;
+ DeviceDataset devData;
+ SCRATCH_AREA scratch;
+ NEWTON_CG_PARAMS params;
+
+ real trainingTime_s, classificationTime_s;
+ real trainingTime_t, classificationTime_t;
+ int test_case_no = 1;
+ int nConIterations;
+ int DATASET_TYPE = 1;
+
+ double l = 1e-6;
+ int max_cg_iterations = -1;
+ double cg_tolerance = 0;
+ int sampling_flag = 0;
+ int gpu = -1;
+
+ if (argc <= 8) {
+ fprintf( stderr, "usage: <dataset-id> <lambda> <max-cg-iterations> <cg-tolerance> <sampling-flag> <gpu-id> <hessian-sample-pct> <gradient-sample-pct>\n");
+ exit (-1);
+ }
+
+ DATASET_TYPE = atoi( argv[1] );
+ l = atof ( argv[2] );
+ max_cg_iterations = atoi (argv[3] );
+ cg_tolerance = atof( argv[4] );
+ sampling_flag = atoi( argv[5] );
+ gpu = atoi( argv[6] );
+ HESSIAN_SAMPLING_SIZE = atoi( argv[7] );
+ GRADIENT_SAMPLING_SIZE = atoi (argv[8] );
+
+ fprintf( stderr, "Dataset: %d, Lambda: %e, CGIterations: %d, CGTolerance: %e, SubSampling: %d, GPU: %d, HSample: %d, GSample: %d \n",
+ DATASET_TYPE, l, max_cg_iterations, cg_tolerance, sampling_flag, gpu, HESSIAN_SAMPLING_SIZE, GRADIENT_SAMPLING_SIZE );
+
+
+ // Create the CUDA Environment Here.
+ // Memory and device settings here.
+ cuda_env_init (&scratch, gpu);
+ #ifdef __debug__
+ fprintf( stderr, "Scratch Area initialized ... 
\n"); + #endif + + + switch( DATASET_TYPE ) { + + case 1: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_labels.txt", + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 11: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_labels.txt", + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 2: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 12: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 3: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/mnist/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mnist/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/mnist/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mnist/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + + case 13: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/mnist/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mnist/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/mnist/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mnist/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + + case 4: + readCIFARDataset( + "/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 1 ); + break; + + case 14: + readCIFARDataset( + "/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 0 ); + break; + + case 5: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/raw-data/newsgroups/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/newsgroups/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/newsgroups/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/newsgroups/test_vec.txt", + &forestData, &scratch, 0 ); + break; + + case 15: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_vec.txt", + &forestData, &scratch, 0 ); + break; + + //Logistic Datasets Here + case 6: + readMultiDataset 
( + "/home/skylasa/solvers/dataset/raw-data/mushrooms/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mushrooms/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/mushrooms/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mushrooms/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + case 16: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + + case 7: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + case 17: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + case 8: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_train.data", + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_train.labels01", + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_valid.data", + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_valid.labels01", + &forestData, &scratch, 1, 0 ); + break; + case 18: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_train.data", + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_train.labels01", + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_valid.data", + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_valid.labels01", + &forestData, &scratch, 1, 0 ); + break; + case 9: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/raw-data/rcv1/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/rcv1/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/rcv1/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/rcv1/test_vec.txt", + &forestData, &scratch, 1 ); + break; + + //Sparse Logistic Datasets Here + case 10: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/raw-data/real-sim/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/real-sim/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/real-sim/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/real-sim/test_vec.txt", + &forestData, &scratch, 1 ); + break; + case 20: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/normalized-data/real-sim/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/real-sim/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/real-sim/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/real-sim/test_vec.txt", + &forestData, &scratch, 1 ); + break; + } + + #ifdef __debug__ + fprintf( stderr, "Done with initialization of the dataset .... \n"); + fprintf( stderr, "Blocks for %d data points... 
\n", forestData.rows); + #endif + + compute_blocks (&BLOCKS, &BLOCK_SIZE, forestData.trainSize); + compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); + if (BLOCKS_POW_2 < 32) BLOCKS_POW_2 = 32; + #ifdef __debug__ + fprintf ( stderr, "Blocks: %d, BlockSize: %d, Power_2: %d\n", BLOCKS, BLOCK_SIZE, BLOCKS_POW_2); + #endif + + + // Move the data to the Device. + if ((forestData.trainSet == NULL) && (forestData.testSet == NULL)) + { + initialize_device_data_sparse( &forestData, &devData ); + initMatDescriptors ( &devData ); + convertToCSR ( &devData, scratch.devWorkspace ); + + initMatDescriptorsForSampling( &devData ); + initMatDescriptorsForSparseSampling( &devData ); + } else { + initialize_device_data( &forestData, &devData ); + initMatDescriptorsForSampling( &devData ); + } + + #ifdef __debug__ + fprintf( stderr, "Inittialized the Device with the dataset ... \n"); + #endif + + //Train the dataset here. + params.max_iterations = 100; + params.tolerance = 1e-5; + params.iflag = 0; + + params.lambda = l; + params.max_cg_iterations = max_cg_iterations; + params.cg_tolerance = cg_tolerance; + + if (GRADIENT_SAMPLING_SIZE == 100) + params.gx_sampling = 0; + else + params.gx_sampling = sampling_flag; + params.hx_sampling = sampling_flag; + +fprintf( stderr, " Gradient Sample: %d, Hessian Sample: %d \n", devData.gradientSampleSize, devData.hessianSampleSize ); + if (sampling_flag == 0) { + + devData.gradientSampleSize = 0; + devData.hessianSampleSize = 0; + } + + fprintf( stderr, "Start of TestCase: %d\n", test_case_no); + trainingTime_s = Get_Time (); + /* + if (forestData.numclasses == 1) + nConIterations = newton_cg( &forestData, &devData, ¶ms, &scratch ); + else + */ + nConIterations = newton_cg_multi_optimized( &forestData, &devData, ¶ms, &scratch); + trainingTime_t = Get_Timing_Info( trainingTime_s ); + #ifdef __debug__ + fprintf( stderr, "Done with training .... \n"); + #endif + + //exit (-1); + + //Predict the testing set here. 
+ real accuracy = 0;
+ classificationTime_s = Get_Time ();
+ /*
+ if (forestData.numclasses == 1) {
+ logistic_regression_predict( devData.testSet, &devData.spTest, devData.weights, devData.testLabels,
+ forestData.testLabels, forestData.testSize, forestData.cols,
+ &accuracy, scratch.devWorkspace, scratch.hostWorkspace );
+ } else {
+ */
+ accuracy = softmax_predict(&devData.spTest, devData.testSet, forestData.testLabels,
+ devData.weights, devData.testSize, devData.cols, devData.numclasses,
+ scratch.hostWorkspace, scratch.devWorkspace, 1, forestData.testSet);
+ //}
+ classificationTime_t = Get_Timing_Info( classificationTime_s );
+ //fprintf( stderr, "Start of TestCase: %d\n", test_case_no);
+ fprintf( stderr, "Dataset: %d \n", DATASET_TYPE );
+ fprintf( stderr, "NumClasses: %d\n", devData.numclasses );
+ fprintf( stderr, "Lambda: %e\n", params.lambda );
+ fprintf( stderr, "NewtonIterations: %d\n", params.max_iterations);
+ fprintf( stderr, "NewtonTolerance: %e\n", params.tolerance);
+ fprintf( stderr, "CGIterations: %d\n", params.max_cg_iterations);
+ fprintf( stderr, "CGTolerance: %e\n", params.cg_tolerance );
+ fprintf( stderr, "DataSetSize: %d\n", forestData.rows );
+ fprintf( stderr, "TrainingSize: %d\n", forestData.trainSize);
+ fprintf( stderr, "Features: %d\n", forestData.cols );
+ fprintf( stderr, "TrainingTime: %d\n", (unsigned int)(trainingTime_t * 1000) );
+ fprintf( stderr, "TestingSize: %d\n", forestData.testSize );
+ fprintf( stderr, "ClassificationTime: %d\n", (unsigned int)(classificationTime_t*1000) );
+ fprintf( stderr, "TestAccuracy: %3.2f\n", accuracy );
+ fprintf( stderr, "NewtonIterationsCon: %d\n", nConIterations );
+ fprintf( stderr, "NewtonConvergence: %d\n", (int)params.iflag );
+ fprintf( stderr, "End of TestCase: %d\n", test_case_no);
+ fprintf( stderr, "\n\n\n");
+
+ //cleanup the dataset pointers here.
+ cleanup_dataset(&forestData, &devData );
+
+ test_case_no ++;
+
+ //Cleanup host/device Here.
+ cuda_env_cleanup(&scratch);
+
+ return 0;
+}
diff --git a/code/cuda/RC-FINAL-5/newton_cg.c b/code/cuda/RC-FINAL-5/newton_cg.c
new file mode 100644
index 0000000..15835fa
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/newton_cg.c
@@ -0,0 +1,425 @@
+#include <stdio.h>
+
+#include "logistic_fn_indicator.h"
+#include "cuda_utils.h"
+#include "conjugate_gradient.h"
+#include "linesearch.h"
+
+#include "print_utils.h"
+#include "utils.h"
+
+#include "softmax_multiclass.h"
+#include "subsampling_helpers.h"
+#include "sparse_dataset.h"
+
+#define ALLOTED_TIME (120 * 60)
+
+int newton_cg( ForestDataset *host, DeviceDataset *data, NEWTON_CG_PARAMS *params, SCRATCH_AREA *scratch){
+
+ int iterations, cg_iterations;
+ real snorm, gxnorm, rel_residual, best_rel_residual;
+ real alpha, alphak;
+
+ real train_accuracy, test_accuracy;
+ real iteration_start, iteration_total, simulation_total;
+
+ //device
+ real *devPtr = (real *)scratch->devWorkspace;
+ real *xx = devPtr;
+ real *s = xx + data->cols;
+ real *s_best = s + data->cols;
+ real *gradient = s_best + data->cols;
+ //real *hessian = gradient+ data->cols;
+ //real *nextDevPtr = hessian + (data->cols * data->cols);
+ real *nextDevPtr = gradient + data->cols;
+
+ real *nextHostPtr = (real *)scratch->hostWorkspace;
+
+ //pageLock
+ real *train_function, *test_function;
+ train_function = scratch->pageLckWorkspace;
+ test_function = & (scratch->pageLckWorkspace[1] );
+
+ //Subsampling here.
+ //extract the subsampled gradient here.
+ fprintf( stderr, "Running the Logistic Regression..... 
solver, %d, %d, %d, %d \n", data->rows, data->cols, data->numclasses, data->testSize); + + //1. get the hessian and gradient. + if (params->gx_sampling >= 1) { + + data->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * data->rows) / 100; + + if (data->trainSet != NULL && data->testSet != NULL) { + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols, + data->numclasses, data->sampledGradientTrainSet, data->gradientSampleSize); + } else { + //handle sparse datasets here. + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, data->numclasses, + &data->spSampledGradientTrain, data->gradientSampleSize ); + fprintf( stderr, "Done extracting the sparse dataset ..... \n"); + } + logistic_fn_indicator_gx( data->sampledGradientTrainSet, &data->spSampledGradientTrain, data->sampledGradientTrainLabels, + data->weights, params->lambda, data->gradientSampleSize, data->cols, gradient, + nextDevPtr, nextHostPtr, params->gx_sampling, data->rows ); + + } else { + logistic_fn_indicator_gx( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, + data->rows, data->cols, gradient, nextDevPtr, nextHostPtr, params->gx_sampling, data->rows); + } + + //norm of gradient. + cublasCheckError( cublasDnrm2( cublasHandle, data->cols, gradient, 1, &gxnorm )); + + iterations = 0; + snorm = 100; + //gxnorm = 100; + + rel_residual = 0; + best_rel_residual = 0; + train_accuracy = 0; + *train_function = 0; + test_accuracy = 0; + *test_function = 0; + iteration_total = 0; + simulation_total = 0; + +#ifdef __debug__ + fprintf( stderr, "iteration \t norm(gradient) \t Rel_Residual \t CG-ITERATIONS \t Train_Accu \t Obj_Val_Train \t Test_Accu \t Obj_Val_Test \n"); + + logistic_regression_predict( data->trainSet, &data->spTrain, data->weights, data->trainLabels, + host->trainLabels, host->trainSize, host->cols, + &train_accuracy, nextDevPtr, nextHostPtr ); + logistic_regression_predict( data->testSet, &data->spTest, data->weights, data->testLabels, + host->testLabels, host->testSize, host->cols, + &test_accuracy, nextDevPtr, nextHostPtr ); + + logistic_fn_indicator( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, data->rows, data->cols, train_function, nextDevPtr, nextHostPtr); + logistic_fn_indicator( data->testSet, &data->spTest, data->testLabels, data->weights, params->lambda, data->testSize, data->cols, test_function, nextDevPtr, nextHostPtr); + + fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n", + iterations, gxnorm, rel_residual, 0, train_accuracy, *train_function, + test_accuracy, *test_function, (unsigned int)(iteration_total * 1000) ); +#endif + + while (iterations < params->max_iterations){ + + iteration_start = Get_Time( ); + + //alpha = -1.; + //cublasCheckError ( cublasDscal( cublasHandle, data->cols, &alpha, gradient, 1) ); + + //conjugate gradient to solve Hx = gradient here. 
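+ // One Newton-CG step: CG approximately solves the Newton system H * s = -g
+ // for the step s using only Hessian-vector products, so H is never formed
+ // explicitly (the sign convention is presumably handled inside
+ // Cublas_CG_Logistic); cg_linesearch then backtracks to a step size alphak
+ // (the 0.5 and 1e-6 arguments are presumably the shrink factor and the
+ // sufficient-decrease constant), and the update is w <- w + alphak * s_best.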
+ cuda_memset( s_best, 0, data->cols, ERROR_MEM_SET ); + cuda_memset( s, 0, data->cols, ERROR_MEM_SET ); + cg_iterations = Cublas_CG_Logistic( data, params, gradient, s, s_best, &rel_residual, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace ); + + alphak = cg_linesearch( s_best, data->weights, 0.5, 1e-6, &data->spTrain, + (real *)data->trainSet, (real *)data->trainLabels, + params->lambda, data->rows, data->cols, data->numclasses, + gradient, xx, nextDevPtr, nextHostPtr, (real *)scratch->pageLckWorkspace); + +//fprintf( stderr, "alphaK --> %e \n", alphak ); + + alpha = alphak; + cublasCheckError( cublasDaxpy( cublasHandle, data->cols, &alpha, s_best, 1, data->weights, 1) ); + + if (params->gx_sampling >= 1) { + if (data->trainSet != NULL && data->testSet != NULL) { + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols, + data->numclasses, data->sampledGradientTrainSet, data->gradientSampleSize); + } else { + //handle sparse datasets here. + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, data->numclasses, + &data->spSampledGradientTrain, data->gradientSampleSize ); + } + logistic_fn_indicator_gx( data->sampledGradientTrainSet, &data->spSampledGradientTrain, data->sampledGradientTrainLabels, + data->weights, params->lambda, data->gradientSampleSize, data->cols, gradient, + nextDevPtr, nextHostPtr, params->gx_sampling, data->rows ); + + } else { + logistic_fn_indicator_gx( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, + data->rows, data->cols, gradient, nextDevPtr, nextHostPtr, params->gx_sampling, data->rows); + } + + cublasCheckError( cublasDnrm2( cublasHandle, data->cols, gradient, 1, &gxnorm )); + cublasCheckError( cublasDnrm2( cublasHandle, data->cols, s_best, 1, &snorm)); + +#ifdef __debug__ + + iteration_total = Get_Timing_Info( iteration_start ); + simulation_total += iteration_total; + + logistic_regression_predict( data->trainSet, &data->spTrain, data->weights, data->trainLabels, + host->trainLabels, host->trainSize, host->cols, + &train_accuracy, nextDevPtr, nextHostPtr ); + logistic_regression_predict( data->testSet, &data->spTest, data->weights, data->testLabels, + host->testLabels, host->testSize, host->cols, + &test_accuracy, nextDevPtr, nextHostPtr ); + + logistic_fn_indicator( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, + data->rows, data->cols, train_function, nextDevPtr, nextHostPtr); + logistic_fn_indicator( data->testSet, &data->spTest, data->testLabels, data->weights, params->lambda, + data->testSize, data->cols, test_function, nextDevPtr, nextHostPtr); + + fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n", + iterations+1, gxnorm, rel_residual, cg_iterations, train_accuracy, *train_function, + test_accuracy, *test_function, (unsigned int)(iteration_total * 1000) ); + +#endif + + iterations ++; + if (gxnorm <= params->tolerance) break; + + if (((unsigned 
int)(simulation_total)) >= ALLOTED_TIME ) {
+ fprintf( stderr, "Exceeded the time allotted for the simulation: %d of %d seconds \n", ((unsigned int)(simulation_total )), ALLOTED_TIME );
+ break;
+ }
+ }
+
+ if (gxnorm >= params->tolerance)
+ params->iflag = 1;
+
+ return iterations;
+}
+
+
+
+int newton_cg_multi_optimized( ForestDataset *host, DeviceDataset *data, NEWTON_CG_PARAMS *params, SCRATCH_AREA *scratch){
+
+ int iterations, cg_iterations;
+ real snorm, gxnorm, rel_residual;
+ real alpha, alphak;
+
+ real best_rel_residual;
+
+#ifdef __STATISTICS__
+ //statistics here.
+ real train_accuracy, train_function;
+ real test_accuracy, test_function;
+ real iteration_start, iteration_total, simulation_total;
+#endif
+
+ int classes_to_solve = data->numclasses;
+
+ //device
+ real *xx = (real *)scratch->devWorkspace;
+ real *s = xx + data->cols * classes_to_solve;
+ real *s_best = s + data->cols * classes_to_solve;
+
+ //auxiliary storage
+ real *gradient = s_best + data->cols * classes_to_solve;
+ real *Hv = gradient + data->cols * classes_to_solve;
+ real *HXW = Hv + classes_to_solve * data->cols;
+ //real *expSumVec = XW + rows * classes_to_solve;
+
+ //scratch area
+ real *nextDevPtr = HXW + data->rows* classes_to_solve;
+ real *nextHostPtr = (real *)scratch->hostWorkspace;
+ real *nextPageLckPtr = (real *) scratch->pageLckWorkspace;
+
+
+ //1. get the hessian and gradient.
+ if (params->hx_sampling >= 1)
+ data->hessianSampleSize = (HESSIAN_SAMPLING_SIZE * data->rows)/100;
+
+ if (params->gx_sampling >= 1) {
+
+ data->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * data->rows) / 100;
+ data->spGradientSample.nnz = data->gradientSampleSize;
+
+
+ if (data->trainSet != NULL && data->testSet != NULL) {
+ prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels,
+ data->rows, data->gradientSampleSize, (int *)nextHostPtr);
+ convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr );
+ sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols,
+ classes_to_solve, data->sampledGradientTrainSet, data->gradientSampleSize);
+ } else {
+ //handle sparse datasets here. 
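+ // Sparse variant of the same scheme: a CSR row-selection matrix picks
+ // gradientSampleSize rows of the training set, so the sampled gradient
+ // approximates the full one, g_S(w) ~= sum over sampled i of grad f_i(w)
+ // + lambda * w (any 1/|S| rescaling is assumed to live inside the
+ // *_gx_subsampled routines).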
+ prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, classes_to_solve, + &data->spSampledGradientTrain, data->gradientSampleSize ); + } + + softmax_multiclass_gx_subsampled(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace, + &data->spGradientSample, data->sampledGradientTrainSet, &data->spSampledGradientTrain, + data->sampledGradientTrainLabels, data->gradientSampleSize, params->gx_sampling); + printVector( gradient, 10, NULL ); + + } else { + computeHXW(&data->spTrain, data->trainSet, data->rows, data->cols, classes_to_solve, data->weights, HXW, 0 ); + + softmax_multiclass_gx_optimized(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, HXW, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace); + } + //printVector( gradient, 20, NULL ); + /* + softmax_multiclass_gx(data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace); + */ + + //2. Initialization Here. + iterations = 0; + snorm = 100; + gxnorm = 100; + rel_residual = 100; + + cublasCheckError( cublasDnrm2( cublasHandle, classes_to_solve * data->cols, gradient, 1, &gxnorm )); + +#ifdef __STATISTICS__ + iteration_total = 0; + simulation_total = 0; + + test_function = softmax_multiclass_fx (&data->spTest, data->testSet, data->testLabels, data->testSize, data->cols, + classes_to_solve, data->weights, params->lambda, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace ); + train_function = softmax_multiclass_fx (&data->spTrain, data->trainSet, data->trainLabels, host->trainSize, data->cols, + classes_to_solve, data->weights, params->lambda, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace ); + + + test_accuracy = softmax_predict(&data->spTest, data->testSet, host->testLabels, data->weights, data->testSize, + data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL); + train_accuracy = softmax_predict( &data->spTrain, data->trainSet, host->trainLabels, data->weights, host->trainSize, + data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL ); + + + fprintf( stderr, "iteration \t norm(gradient) \t Rel_Residual \t CG-ITERATIONS \t Train_Accu \t Obj_Val_Train \t Test_Accu \t Obj_Val_Test \n"); + fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n", + iterations, gxnorm, rel_residual, 0, train_accuracy, train_function, + test_accuracy, test_function, (unsigned int)(iteration_total * 1000) ); + +#endif + + while (iterations < params->max_iterations){ + +#ifdef __STATISTICS__ + //statistics Here. 
+ iteration_start = Get_Time( ); +#endif + //negative gradient + alpha = -1.; + cublasCheckError ( cublasDscal( cublasHandle, classes_to_solve * data->cols, &alpha, gradient, 1) ); + + cuda_memset( s, 0, classes_to_solve * data->cols * sizeof(real), ERROR_MEM_SET ); + cuda_memset( s_best, 0, classes_to_solve * data->cols * sizeof(real), ERROR_MEM_SET ); + + cg_iterations = Cublas_CG_multi_optimized( &data->spTrain, data->trainSet, gradient, data->weights, s, s_best, params->lambda, + data->rows, data->cols, classes_to_solve, HXW, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace, + params->max_cg_iterations, params->cg_tolerance, &rel_residual, &best_rel_residual, + &data->spHessianSample, data->sampledHessianTrainSet, + &data->spSampledHessianTrain, data->hessianSampleSize, params->hx_sampling); + + //compute the relative residual here. + // || H*x - g || / || g || + cublasCheckError( cublasDnrm2( cublasHandle, classes_to_solve * data->cols, gradient, 1, &gxnorm )); + + //change gradient back + alpha = -1.; + cublasCheckError ( cublasDscal( cublasHandle, classes_to_solve * data->cols, &alpha, gradient, 1) ); + alphak = cg_linesearch( s_best, data->weights, 0.5, 1e-6, &data->spTrain, (real *)data->trainSet, (real *)data->trainLabels, + params->lambda, data->rows, data->cols, classes_to_solve, gradient, xx, + nextDevPtr, nextHostPtr, (real *)scratch->pageLckWorkspace); + + alpha = alphak; + cublasCheckError( cublasDaxpy( cublasHandle, classes_to_solve * data->cols, &alpha, s_best, 1, data->weights, 1) ); + + + + if (params->gx_sampling >= 1) { + + if (data->trainSet != NULL && data->testSet != NULL) { + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols, + classes_to_solve, data->sampledGradientTrainSet, data->gradientSampleSize); + } else { + //handle sparse datasets here. + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, classes_to_solve, + &data->spSampledGradientTrain, data->gradientSampleSize ); + } + + softmax_multiclass_gx_subsampled(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace, + &data->spGradientSample, data->sampledGradientTrainSet, &data->spSampledGradientTrain, + data->sampledGradientTrainLabels, data->gradientSampleSize, params->gx_sampling); + + } else { + //update here. 
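+ // Full-gradient path: computeHXW materializes the row-wise class
+ // probabilities P = softmax(X * W), from which the softmax gradient is
+ // assembled; per class k it takes the standard form
+ // g_k = X' * (P_k - 1{y == k}) + lambda * w_k,
+ // which the optimized routine below is assumed to follow.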
+ computeHXW( &data->spTrain, data->trainSet, data->rows, data->cols, classes_to_solve, data->weights, HXW, 0 );
+
+ softmax_multiclass_gx_optimized(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols,
+ classes_to_solve, data->weights, params->lambda, HXW,
+ gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace);
+ }
+
+#ifdef __STATISTICS__
+ iteration_total = Get_Timing_Info( iteration_start );
+ simulation_total += iteration_total;
+ //fprintf( stderr, "Total time per iteration ---- > %f \n", iteration_total );
+
+ //per iteration statistics here.
+ test_accuracy = softmax_predict(&data->spTest, data->testSet, host->testLabels, data->weights, data->testSize,
+ data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL);
+ train_accuracy = softmax_predict( &data->spTrain, data->trainSet, host->trainLabels, data->weights, host->trainSize,
+ data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL );
+ test_function = softmax_multiclass_fx(&data->spTest, data->testSet, data->testLabels, data->testSize, data->cols,
+ classes_to_solve, data->weights, params->lambda, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace );
+ train_function = softmax_multiclass_fx(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols,
+ classes_to_solve, data->weights, params->lambda, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace );
+
+ fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n",
+ iterations+1, gxnorm, rel_residual, cg_iterations,
+ train_accuracy, train_function, test_accuracy, test_function, (unsigned int)(iteration_total * 1000) );
+#endif
+
+ iterations ++;
+ if (gxnorm <= params->tolerance) break;
+
+ if (((unsigned int)(simulation_total )) >= ALLOTED_TIME ) {
+ fprintf( stderr, "Exceeded the time allotted for the simulation: %d of %d seconds \n", ((unsigned int)(simulation_total )), ALLOTED_TIME );
+ break;
+ }
+ }
+
+ if (gxnorm >= params->tolerance)
+ params->iflag = 1;
+
+ return iterations;
+}
diff --git a/code/cuda/RC-FINAL-5/newton_cg.h b/code/cuda/RC-FINAL-5/newton_cg.h
new file mode 100644
index 0000000..d3af91f
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/newton_cg.h
@@ -0,0 +1,25 @@
+#ifndef __H_NEWTON_CG__
+#define __H_NEWTON_CG__
+
+#include "cuda_types.h"
+#include "dataset.h"
+
+typedef struct cg_params{
+ int max_iterations;
+ int max_cg_iterations;
+ real tolerance;
+ real cg_tolerance;
+ real iflag;
+ real lambda;
+
+ //Subsampling
+ int gx_sampling;
+ int hx_sampling;
+
+} NEWTON_CG_PARAMS;
+
+int newton_cg( ForestDataset *, DeviceDataset *, NEWTON_CG_PARAMS *, SCRATCH_AREA *);
+int newton_cg_multi_optimized( ForestDataset *host, DeviceDataset *data, NEWTON_CG_PARAMS *params, SCRATCH_AREA *scratch );
+
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/notes.txt b/code/cuda/RC-FINAL-5/notes.txt
new file mode 100644
index 0000000..5932e09
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/notes.txt
@@ -0,0 +1,6 @@
+This is branched from rc-beta.
+rc-beta is brought in from the newton.cs.purdue.edu machine.
+rc-beta is the first release version, which fixed issues with the subsampling bug.
+rc-beta and rc-alpha suffer from the same problem on the mushrooms dataset, where,
+with subsampling/nonuniform sampling for 100 iterations, the cost function actually
+increases. This needs to be looked into. The MATLAB version works fine. 
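+A plausible first check for the increasing-cost issue (untested suggestion):
+evaluate the full and the sub-sampled gradient at the same fixed weight vector
+and compare them; a systematic mismatch beyond sampling noise would point at
+the scaling of the sampled sum (1/|S| vs 1/n) or at the CSR row selection.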
diff --git a/code/cuda/RC-FINAL-5/print_utils.c b/code/cuda/RC-FINAL-5/print_utils.c new file mode 100644 index 0000000..19c093b --- /dev/null +++ b/code/cuda/RC-FINAL-5/print_utils.c @@ -0,0 +1,206 @@ +#include "print_utils.h" +#include "cuda_utils.h" + +#include "string.h" + +real computeWeightSum( real *src, int len){ + real *t = (real *)dscratch; + copy_host_device( t, src, len * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + real s = 0; + + for (int i=0 ; i < len; i ++) s += t[i]; + return s; +} + +void printVector( real *src, int c, real *r){ + real *t = (real *)dscratch; + int count = c;// > 20 ? 20 : c; + copy_host_device( t, src, c * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + + for (int i = 0; i < count; i ++){ + if ((i % 20 == 0) && (i != 0)) fprintf (stderr, "\n"); + fprintf( stderr, " %e ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void printCustomVector( real *src, int c, int jump){ + real *t = (real *)dscratch; + int count = c; + copy_host_device( t, src, c * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + + for (int i = 0; i < count; i += jump){ + fprintf( stderr, " %f ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void printIntVector( int *src, int c, int *r){ + int *t = (int *)dscratch; + int count = c;// > 20 ? 20 : c; + copy_host_device( t, src, c * sizeof(int), cudaMemcpyDeviceToHost, ERROR_DEBUG); + + for (int i = 0; i < count; i ++){ + if ((i % 20 == 0) && (i != 0)) fprintf (stderr, "\n"); + fprintf( stderr, " %d ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void printHostVector( real *src, int c ){ + real *t = src; + int count = c;// > 20 ? 20 : c; + + for (int i = 0; i < count; i ++){ + if ((i % 20 == 0) && (i != 0)) fprintf (stderr, "\n"); + fprintf( stderr, " %e ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void writeMatrix ( real *mat, int rows ) +{ + FILE *dataset_file; + real *t = (real *) dscratch; + + if ( (dataset_file = fopen("./hessian.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... !\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, mat, rows * rows * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < rows; i ++){ + fprintf (dataset_file, "%6.2f", t[ i * rows ] ); + for (int j = 1; j < rows; j ++){ + fprintf( dataset_file, ",%6.2f", t[ i * rows + j ] ); + } + fprintf( dataset_file, "\n"); + } + fclose (dataset_file); +} + +void writeSparseMatrix (real *dataPtr, int *rowIndex, int *colIndex, int m, int n, int nnz ) +{ + FILE *dataset_file; + int *t = (int *) dscratch; + real *t1 = (real *) dscratch; + + if ( (dataset_file = fopen("./rowindex.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... !\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, rowIndex, sizeof(int) * (m + 1), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < m + 1; i ++){ + fprintf( dataset_file, "%d\n", t[ i ] ); + } + fclose (dataset_file); + + if ( (dataset_file = fopen("./colindex.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... 
!\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, colIndex, sizeof(int) * (nnz), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < nnz; i ++){ + fprintf( dataset_file, "%d\n", t[ i ] ); + } + fclose (dataset_file); + + if ( (dataset_file = fopen("./data.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... !\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t1, dataPtr, sizeof(real) * (nnz), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < nnz; i ++){ + fprintf( dataset_file, "%6.10f\n", t1[ i ] ); + } + fclose (dataset_file); +} + +void writeVector ( real *mat, int rows, char *file, int hostData ) +{ + FILE *dataset_file; + real *t = (real *) dscratch; + + if ( (dataset_file = fopen( file, "w")) == NULL ) { + fprintf( stderr, "Error opening the path .... !\n" ); + exit( -1 ); + } + + if (hostData == 1) { + t = mat; + } else { + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, mat, rows * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + } + + for (int i = 0; i < rows; i ++){ + fprintf( dataset_file, "%e\n", t[ i ] ); + } + //fprintf( dataset_file, "\n"); + fclose (dataset_file); +} + +int readVector( real *vec, int rows, char *file, int offset ){ + FILE *handle; + char line[1024]; + int index = 0; + char *word; + + if ( (handle= fopen( file, "r" )) == NULL ) { + fprintf( stderr, "Error opening the path... \n"); + exit(-1); + } + + index = 0; + while (!feof( handle )){ + memset( line, 0, 1024); + fgets( line, 1024, handle); + if (line[0] == 0) break; + + word = strtok( line, "\n"); + vec[ index ++ ] = atof( word ) + offset; + + if (index >= rows) break; + } + fclose( handle ); + + return index; +} + +void writeIntVector ( int *mat, int rows ) +{ + FILE *dataset_file; + int *t = (int *) dscratch; + + if ( (dataset_file = fopen( "./vector.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the path .... 
!\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, mat, rows * sizeof(int), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < rows; i ++){ + fprintf( dataset_file, "%d\n", t[ i ] ); + } + //fprintf( dataset_file, "\n"); + fclose (dataset_file); +} + diff --git a/code/cuda/RC-FINAL-5/print_utils.h b/code/cuda/RC-FINAL-5/print_utils.h new file mode 100644 index 0000000..8080f86 --- /dev/null +++ b/code/cuda/RC-FINAL-5/print_utils.h @@ -0,0 +1,21 @@ +#ifndef __H_PRINT_UTILS__ +#define __H_PRINT_UTILS__ + +#include "cuda_types.h" + +void printVector( real *src, int s, real *t ); +void printCustomVector( real *src, int s, int jump ); +void printIntVector( int *src, int s, int *t ); +void printHostVector( real *src, int s ); +void writeMatrix (real *mat, int c); +void writeVector (real *mat, int c, char *file, int ); +void writeIntVector (int *mat, int c ); +void writeSparseMatrix (real *dataPtr, int *rowIndex, int *colIndex, int m, int n, int nnz ); + +real computeWeightSum( real *weights, int len ); + +int readVector( real *vec, int rows, char *file, int offset ); + + + +#endif diff --git a/code/cuda/RC-FINAL-5/readMatVec.cc b/code/cuda/RC-FINAL-5/readMatVec.cc new file mode 100644 index 0000000..575cb78 --- /dev/null +++ b/code/cuda/RC-FINAL-5/readMatVec.cc @@ -0,0 +1,130 @@ +#ifndef __H_MATVEC__ +#define __H_MATVEC__ + +#include "readMatVec.h" +#include + +#define MAX_LINE 1024 + +void readMatVec( char *matrixPath, char *vectorPath, double **matrix, double **vector, int *N){ + + //read the CSV file here and create + //matrix and vector files and pass + //them back to the main file + FILE *matFile; + char line[MAX_LINE]; + int numLines = 0; + int index = 0; + double *fileMatrix; + double *fileVector; + + if ( (matFile = fopen(matrixPath, "r")) == NULL ) { + fprintf( stderr, "Error opening the pdb file!\n" ); + exit( -1 ); + } + + while (!feof( matFile ) ){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, matFile ); + if (line[0] == 0) break; + numLines ++; + } + fprintf( stderr, " Number of lines read: %d \n", numLines ); + + *N = numLines; + fileMatrix = (double *) malloc( sizeof(double) * (numLines) * (numLines) ); + + //read the file here and fill the matrix. + rewind( matFile ); + while (!feof( matFile )){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, matFile); + if (line[0] == 0) break; + tokenize( line, fileMatrix, &index ); + } + + fclose( matFile ); + fprintf( stderr, "Number of elements: %d\n", index ); + + //read teh vector here. 
+ fileVector = (double *) malloc (sizeof(double) * (numLines) );
+ if ( (matFile = fopen(vectorPath, "r")) == NULL ) {
+ fprintf( stderr, "Error opening the vector file!\n" );
+ exit( -1 );
+ }
+
+ index = 0;
+ while (!feof( matFile )){
+ memset( line, 0, MAX_LINE );
+ fgets( line, MAX_LINE, matFile);
+ if (line[0] == 0) break;
+ fileVector[index ++] = atof( line );
+ //fprintf (stderr, "%s --> %f\n", line, atof(line) );
+ }
+ fprintf( stderr, "------------------\n");
+ fclose( matFile );
+ fprintf( stderr, "Number of elements: %d\n", index );
+
+ *matrix = fileMatrix;
+ *vector = fileVector;
+}
+
+void tokenize( char *line, double *matrix, int* index){
+ char *sep = ", \n";
+ char *word;
+ char temp[MAX_LINE];
+
+ strncpy( temp, line, MAX_LINE );
+ for( word = strtok(temp, sep); word; word = strtok(NULL, sep) )
+ matrix[ (*index) ++ ] = atof( word );
+}
+
+void tokenize_count( char *line, int* index){
+ char *sep = ", \n";
+ char *word;
+ char temp[MAX_LINE];
+
+ strncpy( temp, line, MAX_LINE );
+ for( word = strtok(temp, sep); word; word = strtok(NULL, sep) )
+ (*index) ++;
+}
+
+void readVec( char *vectorPath, double **vector, int *N){
+
+ FILE *matFile;
+ char line[MAX_LINE];
+ int numLines = 0;
+ int index = 0;
+ double *fileVector;
+
+ if ( (matFile = fopen(vectorPath, "r")) == NULL ) {
+ fprintf( stderr, "Error opening the vector file!\n" );
+ exit( -1 );
+ }
+
+ while (!feof( matFile ) ){
+ memset( line, 0, MAX_LINE );
+ fgets( line, MAX_LINE, matFile );
+ if (line[0] == 0) break;
+ tokenize_count( line, &numLines );
+ break;
+ }
+ fprintf( stderr, " Number of entries in the first line: %d \n", numLines );
+
+ *N = numLines;
+ fileVector = (double *) malloc( sizeof(double) * (numLines) );
+
+ //read the first line here and fill the vector.
+ rewind( matFile );
+ while (!feof( matFile )){
+ memset( line, 0, MAX_LINE );
+ fgets( line, MAX_LINE, matFile);
+ if (line[0] == 0) break;
+ tokenize( line, fileVector, &index );
+ break;
+ }
+
+ fclose( matFile );
+ fprintf( stderr, "Number of elements: %d\n", index );
+
+ *vector = fileVector;
+}
+#endif
diff --git a/code/cuda/RC-FINAL-5/readMatVec.h b/code/cuda/RC-FINAL-5/readMatVec.h
new file mode 100644
index 0000000..b299c33
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/readMatVec.h
@@ -0,0 +1,13 @@
+#ifndef __H_READ_MATRIX__
+#define __H_READ_MATRIX__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+void tokenize( char *, double *, int* );
+void readMatVec( char *, char *, double **, double **, int *);
+void readVec( char *, double **, int *);
+
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/softmax_multiclass.cu b/code/cuda/RC-FINAL-5/softmax_multiclass.cu
new file mode 100644
index 0000000..b588c30
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/softmax_multiclass.cu
@@ -0,0 +1,2208 @@
+#include "softmax_multiclass.h"
+#include "cuda_utils.h"
+
+#include "gen_random.h"
+#include "cuda_types.h"
+#include "print_utils.h"
+
+#include "classification_kernels.h"
+
+GLOBAL void ker_exp( real *results, int count)
+{
+ int idx = blockDim.x * blockIdx.x + threadIdx.x;
+ if (idx < count)
+ results[idx] = exp( (real)idx );
+}
+
+void expTest( real *results, int count, real *host){
+
+ ker_exp <<< 1, count>>> (results, count);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+__device__ __inline__ double my_shfl(double x, int lane)
+{
+ // Split the double number into 2 32b registers.
+ int lo, hi;
+ asm volatile( "mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(x));
+
+ // Shuffle the two 32b registers.
+ lo = __shfl_xor(lo, lane);
+ hi = __shfl_xor(hi, lane);
+
+ // Recreate the 64b number. 
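+ // (__shfl_xor moves 32-bit values on this architecture, hence the two
+ // halves; __hiloint2double below reassembles them. Paired with the offset
+ // loop in warpSum this is a butterfly reduction: after log2(32) = 5
+ // xor-shuffles every lane of the warp holds the full 32-lane sum.)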
+ //asm volatile( "mov.b64 %0, {%1,%2};" : "=d(x)" : "r"(lo), "r"(hi)); + //return x; + return __hiloint2double( hi, lo); +} + +__device__ __inline__ double warpSum( double x ) +{ + for (int offset = WARP_SIZE/2; offset > 0; offset /= 2) + x += my_shfl( x, offset); + return x; +} + + +GLOBAL void ker_add_regularizer ( real *input, real *vector, real lambda, int count, real normalizer) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) input[ idx ] += lambda * vector[ idx ] ; +} + + +/* +GLOBAL void reduce(const real *input, real *results, const size_t count) { + extern __shared__ real my_results[]; + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + real x = 0; + + sdata = 0; + my_results[ lane ] = 0; + if(idx < count) x = input [idx]; + sdata = x; + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) results [ blockIdx.x ] = sdata; +} + +*/ + +GLOBAL void reduce_vector_warp( const real *input, const real *maxdots, real *results, const size_t numcomps, int numblocks ) +{ + extern __shared__ real my_results[]; + + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + sdata = 0.; + + if (idx < numcomps ){ + for (int c = 0; c < numblocks; c ++) sdata += input [ c * numcomps + idx ]; + results[ idx ] = sdata + exp( -1. * maxdots[ idx ] ); + } +} + + +GLOBAL void reduce_vector_warp_mt( const real *input, const real *maxdots, real *results, const size_t numcomps, int numblocks ) +{ + unsigned int col = threadIdx.x >> 5; + unsigned int myRowId = (blockDim.x * blockIdx.x + threadIdx.x) / WARP_SIZE; + + real sdata; + real x = 0.; + + sdata = 0.; + x = 0.0; + if ((col < numblocks) && (myRowId < numcomps)) x = input[(col * numcomps) + myRowId ]; + sdata = x; + __syncthreads (); + + sdata = warpSum ( sdata ); + if ((col == 0) && (myRowId < numcomps)) + results [ myRowId ] = sdata + exp( -1 * maxdots[myRowId] ); +} + + +GLOBAL void reduce_vector_mt( const real *input, real *results, const size_t numcomps, const real normalizer, int numblocks ) +{ + extern __shared__ real my_results[]; + + unsigned int idx = threadIdx.x; + unsigned int lane = threadIdx.x >> 5; + unsigned int compOffset = blockIdx.x; + + real sdata; + real x = 0.; + + for (int i = compOffset; i < numcomps; i += gridDim.x){ + + sdata = 0.; + my_results[ lane ] = 0.; + x = 0.0; + if ((idx < numblocks) && (i < numcomps)) x = input[(idx * numcomps) + i ]; + sdata = x; + __syncthreads (); + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? 
my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if((threadIdx.x == 0) && (i < numcomps)) results [ i ] = sdata * normalizer; + __syncthreads (); + } +} + +GLOBAL void reduce_vector(const real *input, real *results, const size_t numclasses, const size_t cols, const real normalizer, int numblocks) +{ + extern __shared__ real my_results[]; + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + real x = 0.; + + for (int i = 0; i < numclasses * cols; i ++){ + sdata = 0.; + my_results[ lane ] = 0.; + x = 0.0; + if (idx < numblocks) x = input[idx * numclasses * cols + i]; + sdata = x; + __syncthreads (); + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) results [ i ] = sdata * normalizer; + __syncthreads (); + } +} + +GLOBAL void reduce_log(const real *input, real *results, const size_t count) { + extern __shared__ real my_results[]; + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + real x = 0; + + sdata = 0; + my_results[ lane ] = 0; + if(idx < count) x = log(input [idx] ); + sdata = x; + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) results [ blockIdx.x ] = sdata; +} + +GLOBAL void ker_compute_expsum( real *XW, int rows, int cols, int numclasses, + real *expSumVec, int threads_per_col) +{ + int myColId = ( blockIdx.x * blockDim.x + threadIdx.x ) % threads_per_col; + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ) / threads_per_col; + + //local Data. + real sdata = 0; + + for (int i = myRowId; i < rows; i += gridDim.x * blockDim.x ) + { + sdata = 0; + + for (int j = myColId; j < cols; j ++ ) sdata += exp ( XW[ j * rows + i ] ); + + //warp sum here. + for (int offset = threads_per_col/2; offset > 0; offset /= 2) + sdata += my_shfl( sdata, offset); + + if (myColId == 0) expSumVec[ i ] = sdata; + } +} + +/* + +GLOBAL void ker_init_scaleTerms ( real *scaleTerms, int sampleSize, real *probs, int *indices ) +{ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < sampleSize){ + scaleTerms[ myRowId ] = probs[ indices[ myRowId ] ] ; + } +} + + +GLOBAL void ker_compute_probs( real *probs, int rows, int sampleSize, real *randVec, real *indices) +{ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < rows ){ + probs[ myRowId ] *= sampleSize; + if (probs[ myRowId ] > 1.0) probs[ myRowId ] = 1.0; + + if (randVec[ myRowId ] < probs[ myRowId ] ) + indices[ myRowId ] = 1; + else + indices[ myRowId ] = 0; + } +} + +*/ + +GLOBAL void ker_compute_dHXW_nrm (real *dHXW, real *rowNrms, int rows, int numclasses) +{ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + + if (myRowId < rows) + { + for (int j = 0; j < numclasses; j += 1 ){ + dHXW[ j * rows + myRowId ] = abs( dHXW[ j * rows + myRowId ] * (1. 
- dHXW[ j * rows + myRowId ]) ) * rowNrms[ myRowId ]; + } + for (int j = 1; j < numclasses; j += 1 ){ + dHXW[ myRowId ] += dHXW[ j * rows + myRowId ]; + } + } +} + +/* + +GLOBAL void ker_normalize (real *dHXW, int rows, real *nrmConstant, real *probs ){ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < rows){ + probs[ myRowId ] = dHXW[ myRowId ] / nrmConstant[0]; + } +} + +GLOBAL void ker_row_norms( real *features, int rows, int numclasses, real *nrm ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + real sum = 0; + + if (myRowId < rows) { + i = myRowId; + for (int j = 0; j < numclasses; j += 1) + sum += pow( features[ j * rows + i ], 2.); + + nrm[ i ] = sqrt( sum ); + } +} + + +GLOBAL void ker_sqr_elements ( real *ptr, int nnz, int elems_per_thread, real *results ) +{ + int myID = blockIdx.x * blockDim.x + threadIdx.x ; + int i = 0; + + if (myID < nnz) { + i = myID; + //results[ i ] = ptr[ i ] * ptr[ i ]; + ptr[ i ] *= ptr[ i ]; + } + +} + +GLOBAL void ker_sqrt_elements (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = sqrt( ptr[ i ] ); + } +} + +GLOBAL void ker_init_ones (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = 1.0; + } +} + +*/ + + +GLOBAL void ker_compute_HXW( real *XW, int rows, int cols, int numclasses, int threads_per_col ) +{ + int myColId = ( blockIdx.x * blockDim.x + threadIdx.x ) % threads_per_col; + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ) / threads_per_col; + int myWarpId = (blockIdx.x * blockDim.x + threadIdx.x ) % WARP_SIZE; + + real sdata = 0; + int i = 0; + + real maxdot = 0; + + //for (int i = myRowId; i < rows; i += gridDim.x * blockDim.x){ + if (myRowId < rows) { + i = myRowId; + + maxdot = 0; + for (int j = 0; j < numclasses; j += threads_per_col ) { + if (maxdot < XW[ j * rows + i ]) maxdot = XW[ j * rows + i ]; + } + + sdata = 0; + for (int j = 0; j < numclasses; j += threads_per_col ) sdata += exp ( XW[ j * rows + i ] - maxdot ); + + //for (int offset = threads_per_col/2; offset > 0; offset /= 2) sdata += my_shfl( sdata, myWarpId + offset ); + + for (int j = 0; j < numclasses; j += threads_per_col ) + XW[ j * rows + i ] = exp( XW[ j * rows + i ] - maxdot ) / (exp(-1. * maxdot) + sdata); + } +} + + +GLOBAL void ker_compute_fx (real *matvec, int rows, int cols, int numclasses, + real *target, real *indicatorVal, int NUM_THREADS, real *maxdots ) +{ + extern __shared__ real my_results[]; + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int myClrId = idx % NUM_THREADS; + int myRowId = idx / NUM_THREADS; + unsigned int lane = threadIdx.x >> 5; + + real sdata = 0; + real maxdot = 0; + + //if (myRowId < rows) { + for (int r = myRowId; r < rows; r += gridDim.x * blockDim.x ) { + maxdot = 0; + for (int i = myClrId; i < numclasses; i += NUM_THREADS){ + if (maxdot < matvec[ i * rows + r ]) maxdot = matvec[ i * rows + r]; + } + + maxdots[ r ] = maxdot; + + for (int i = myClrId; i < numclasses; i += NUM_THREADS){ + if ((int)target[ r ] == (i + 1)) sdata += matvec[ i * rows + r ]; + matvec[ i * rows + r ] = exp( matvec[ i * rows + r ] - maxdot); + } + } + __syncthreads (); + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata ; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? 
my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) indicatorVal [ blockIdx.x ] = sdata; +} + +GLOBAL void ker_softmax (real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *wspace ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int lane = threadIdx.x >> 5; + + extern __shared__ real sh_vec[]; + real dot = 0; + int myclass = 0; + + real blk_sum = 0; + real psum = 0; + + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + dot = 0; + if (idx < rows ) { + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ]; + psum += exp (dot); + } + __syncthreads (); + } + + // subtract the weights * feature for the class it belongs. + if (idx < rows){ + psum = log( 1 + psum ); + myclass = (int)(target[ idx ] - 1); + } + + if ( idx < rows && myclass < num_classes) { + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + idx ] * weights[ myclass * cols + j ]; + psum = psum - dot; + } + __syncthreads (); + + // block reduction here. + blk_sum = warpSum( psum ); + if (threadIdx.x % WARP_SIZE == 0) sh_vec[lane] = blk_sum; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + blk_sum = (threadIdx.x < 1) ? sh_vec[threadIdx.x] : 0; + else + blk_sum = (threadIdx.x < (blockDim.x / WARP_SIZE) ) ? sh_vec[ threadIdx.x ] : 0; + __syncthreads (); + + if (lane == 0) blk_sum = warpSum( blk_sum ); + if (threadIdx.x == 0) wspace[ blockIdx.x ] = blk_sum; +} + +GLOBAL void ker_dx_softmax (real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *wspace ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int lane = threadIdx.x >> 5; + extern __shared__ real sh_vec[]; + + real numerator = 0.; + real denominator = 0.; + int indicator = 0; + real multiplier = 0.; + real blk_sum = 0.; + real p_i = 0.; + + real maxdot = 0.; + + if (idx < rows) indicator = (int)(target[ idx ] - 1.); + __syncthreads (); + + //maxdot here. + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x ]; + __syncthreads (); + + numerator = 0.; + if (idx < rows) { + for (int j = 0; j < cols; j ++) + numerator += sh_vec[j] * features[ j * rows + idx ]; + + if (maxdot < numerator) maxdot = numerator; + } + __syncthreads (); + } + + + //denominator here. + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x ]; + __syncthreads (); + + numerator = 0.; + if (idx < rows) { + for (int j = 0; j < cols; j ++) + numerator += sh_vec[j] * features[ j * rows + idx ]; + denominator += exp( numerator - maxdot ); + } + __syncthreads (); + } + + //numerator here. + //dw_i (j) here. + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + numerator = 0; + if ( idx < rows ){ + for (int j = 0; j < cols; j ++) + numerator += sh_vec[j] * features[ j * rows + idx ]; + numerator = exp( numerator - maxdot ); + //p_i = numerator / (1 + denominator); + p_i = numerator / (exp(1. 
* (-maxdot)) + denominator);
+
+ if (i == indicator) multiplier = 1.0;
+ else multiplier = 0.;
+ }
+ __syncthreads ();
+
+ for (int j = 0; j < cols; j ++){
+ blk_sum = 0.;
+ if (idx < rows)
+ blk_sum = (p_i - multiplier) * features[ j * rows + idx ];
+
+ __syncthreads ();
+
+ // block level reduction here.
+ blk_sum = warpSum( blk_sum);
+ if (threadIdx.x % WARP_SIZE == 0) sh_vec[lane] = blk_sum;
+ __syncthreads ();
+
+ if (blockDim.x/WARP_SIZE == 0)
+ blk_sum = (threadIdx.x < 1) ? sh_vec[threadIdx.x] : 0;
+ else
+ blk_sum = (threadIdx.x < (blockDim.x / WARP_SIZE) ) ? sh_vec[ threadIdx.x ] : 0;
+ __syncthreads ();
+
+ if (lane == 0) blk_sum = warpSum( blk_sum );
+ if (threadIdx.x == 0) wspace[ (blockIdx.x * num_classes * cols) + ( i * cols + j ) ] = blk_sum;
+ __syncthreads ();
+ }
+ }
+}
+
+GLOBAL void ker_dx_softmax_mt (real *features, real *target, int rows, int cols, int num_classes,
+ real *weights, real lambda, real *XW, real *expSum, real *wspace, int threads_per_col)
+{
+ extern __shared__ real shmem[];
+
+ int idx = blockDim.x * blockIdx.x + threadIdx.x;
+ int myIdx = idx / threads_per_col;
+
+ real indicator = 0;
+ real class_prob;
+
+ for (int clr = 0; clr < num_classes; clr ++){
+
+ for (int col = myIdx; col < cols; col += threads_per_col){
+
+ shmem[ myIdx ] = 0;
+
+ for (int r = 0; r < rows; r += gridDim.x * blockDim.x ) {
+ class_prob = XW[ clr * rows + r ] / expSum[ r ];
+ indicator = (clr == (int)(target[ r ] - 1.)) ? 1. : 0.;
+ shmem[ myIdx ] += (class_prob - indicator) * features[ col * rows + r ];
+ }
+
+ wspace[ blockIdx.x * num_classes * cols + clr * cols + col ] = shmem[ myIdx ];
+ }
+ }
+}
+
+GLOBAL void ker_dx_softmax_ind( real *hxw, real *target, int rows, int num_classes, real *result, int threads_per_row)
+{
+ int idx = blockDim.x * blockIdx.x + threadIdx.x;
+ int myClrId = idx % threads_per_row;
+ int myRowId = idx / threads_per_row;
+
+ int r = 0;
+
+ //for (int r = idx; r < rows; r += gridDim.x * blockDim.x){
+
+ if (idx < rows ) {
+ r = idx;
+ for (int clr = 0; clr < num_classes; clr ++ ){
+ result[ clr * rows + r ] = hxw[ clr * rows + r ];
+ if (clr == (int)(target[ r ] - 1.)) result[ clr * rows + r ] -= 1.;
+
+ //result[ clr * rows + r ] = 0;
+ //if (clr == (int)(target[ r ] - 1.)) result[ clr * rows + r ] = 1;
+ }
+ }
+}
+
+//Hessian functions here.
+GLOBAL void ker_hx_Xv ( real *features, real *vector, int rows, int cols, int num_classes, real *A ) {
+
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ extern __shared__ real sh_vec[];
+
+ real dot = 0;
+
+ for (int i = 0; i < num_classes; i ++){
+ if (threadIdx.x < cols) sh_vec[threadIdx.x] = vector [i * cols + threadIdx.x ];
+ __syncthreads ();
+
+ if (idx < rows) {
+ dot = 0;
+ for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ];
+ A[ idx + i * rows ] = dot; // column major format here.
+ }
+ __syncthreads ();
+ }
+}
+
+GLOBAL void ker_hx_ProbabilityTerms ( real *features, real *weights, int rows, int cols, int num_classes, real *B ) {
+
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ extern __shared__ real sh_vec[];
+
+ real dot = 0;
+ real sumexp = 0;
+
+ //probability terms here. 
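+ // B holds per-row class probabilities with an implicit reference class:
+ // B[i][k] = exp(x_i' * w_k) / (1 + sum_j exp(x_i' * w_j)). Unlike the
+ // optimized path, this legacy kernel skips the max-subtraction trick, so
+ // it assumes the dot products stay small enough that exp() cannot overflow.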
+ sumexp = 0; + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + if ( idx < rows ){ + dot = 0; + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ]; + sumexp += exp( dot ); + } + __syncthreads (); + } + + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + if ( idx < rows ){ + dot = 0; + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ]; + B [ idx + i * rows ] = exp(dot) / (1 + sumexp); + } + __syncthreads (); + } +} + +GLOBAL void ker_hx_C_scale (real *A, real *B, real *C, int rows, int cols, int num_classes, real *scale ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + real sum = 0; + if (idx < rows){ + for (int i = 0; i < num_classes; i ++) + sum += A[ idx + i * rows ] * B[ idx + i * rows ]; + + for (int i = 0; i < num_classes; i ++) + C[ i * rows + idx ] = + (1. / scale[ idx ]) * ( A[ idx + i * rows ] * B[ idx + i * rows ] - + B[ idx + i * rows ] * sum ); + } +} + +GLOBAL void ker_hx_C (real *A, real *B, real *C, int rows, int cols, int num_classes ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + real sum = 0; + if (idx < rows){ + for (int i = 0; i < num_classes; i ++) + sum += A[ idx + i * rows ] * B[ idx + i * rows ]; + + for (int i = 0; i < num_classes; i ++) + C[ i * rows + idx ] = + A[ idx + i * rows ] * B[ idx + i * rows ] - + B[ idx + i * rows ] * sum ; + } +} + + +real softmax_multiclass_fx (SparseDataset *spfeatures, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *devPtr, real *hostPtr, real *pageLckPtr){ + +/* + ker_softmax <<< BLOCKS, BLOCK_SIZE, sizeof(real) * cols >>> + (features, target, rows, cols, num_classes, weights, lambda, devPtr); + cudaThreadSynchronize (); + cudaCheckError (); + + reduce <<< 1, BLOCKS_POW_2, BLOCKS_POW_2 * sizeof(real) >>> + ( devPtr, pageLckPtr, BLOCKS ); + cudaThreadSynchronize (); + cudaCheckError (); + + cublasCheckError( cublasDnrm2( cublasHandle, num_classes * cols, weights, 1, &pageLckPtr[1])) ; + return (pageLckPtr[0]) + (lambda/2.0) * pow(pageLckPtr[1], 2.); +*/ + + + //matvec operation here. + int power = 1; + real alpha; + real beta; + real *indicatorVal = devPtr + rows * num_classes; + real *maxdots = indicatorVal + rows + BLOCKS_POW_2; + real *alphax = maxdots + rows + BLOCKS_POW_2; + int NUM_THREADS = 1; + + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + rows, num_classes, cols, + &alpha, features, rows, + weights, cols, &beta, devPtr, rows) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, weights, cols, &beta, devPtr, rows ) + ); + } + //fprintf( stderr, "NUM CLASSES --- >%d \n", num_classes ); + //fprintf( stderr, "Matvec: \n"); + //printVector( devPtr, 20, NULL); + + ker_compute_fx <<< BLOCKS * NUM_THREADS, BLOCK_SIZE, WARP_SIZE * sizeof(real) >>> + ( devPtr, rows, cols, num_classes, target, indicatorVal, NUM_THREADS, maxdots); + cudaThreadSynchronize (); + cudaCheckError (); + //fprintf( stderr, "Exp matvec: ... \n"); + //printVector( devPtr, 20, NULL); + //printVector( maxdots, 20, NULL); + + + //reduce the maxdots here. 
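+ // Stability bookkeeping: the per-row log term log(1 + sum_c exp(x_i . w_c))
+ // is evaluated in shifted form as  m_i + log( exp(-m_i) + sum_c exp(x_i . w_c - m_i) ),
+ // where maxdots holds the per-row maxima m_i = max_c x_i . w_c, mirroring what
+ // hostFunction does on the CPU. The reductions below accumulate sum_i m_i into
+ // pageLckPtr[3], the shifted log terms into pageLckPtr[1], and the true-class
+ // dots sum_i x_i . w_{y_i} into pageLckPtr[0]; the return statement reassembles
+ // the objective from these pieces.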
+ reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE * sizeof (real) >>>
+ (maxdots, maxdots + rows, rows );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //printVector (maxdots + rows, 20, NULL );
+
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof( real ) >>>
+ //reduce <<< 1, WARP_SIZE, WARP_SIZE * sizeof( real ) >>>
+ (maxdots + rows, &pageLckPtr[3], BLOCKS );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, "Maxdot sum: ... %e \n", pageLckPtr[3]);
+
+
+ // final value of the indicator
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>>
+ //reduce <<< 1, WARP_SIZE, WARP_SIZE * sizeof(real) >>>
+ ( indicatorVal, &pageLckPtr[0], BLOCKS );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //fprintf( stderr, "Indicator value: %e \n", pageLckPtr[0] );
+
+ /*
+ power = 1;
+ while (power < num_classes) power *= 2;
+
+ //compute the log part there.
+ reduce_vector_mt <<< THREADS_PER_ROW, WARP_SIZE, WARP_SIZE * sizeof(real) >>>
+ (devPtr, devPtr, rows, 1., num_classes);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ */
+
+ //compute the log part here.
+ int warp_blocks = ((rows * WARP_SIZE) / BLOCK_SIZE) +
+ (((rows * WARP_SIZE) % BLOCK_SIZE == 0) ? 0 : 1);
+
+ //reduce_vector_warp_mt <<< warp_blocks, BLOCK_SIZE >>>
+ reduce_vector_warp <<< BLOCKS, BLOCK_SIZE >>>
+ (devPtr, maxdots, alphax, rows, num_classes );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, " Reduce Warp: ....\n");
+ //printVector( alphax, 20, NULL);
+
+
+ //final log part here.
+ reduce_log <<< BLOCKS, BLOCK_SIZE, WARP_SIZE* sizeof(real) >>>
+ //reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE* sizeof(real) >>>
+ //( devPtr, devPtr, rows );
+ ( alphax, alphax + rows, rows );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //printVector( alphax + rows, 20, NULL);
+
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>>
+ //reduce <<< 1, WARP_SIZE, WARP_SIZE * sizeof(real) >>>
+ //( devPtr, &pageLckPtr[1], BLOCKS);
+ ( alphax + rows, &pageLckPtr[1], BLOCKS);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, "Log part: %e \n", pageLckPtr[1] );
+
+ //return pageLckPtr[1];
+
+ cublasCheckError( cublasDnrm2( cublasHandle, num_classes * cols, weights, 1, &pageLckPtr[2])) ;
+ return (pageLckPtr[3] + pageLckPtr[1]) - pageLckPtr[0] + (lambda/2.0) * pow(pageLckPtr[2], 2.);
+
+}
+
+//the result is a vector here.
+void softmax_multiclass_gx (real *features, real *target, int rows, int cols,
+ int num_classes, real *weights, real lambda, real *gradient,
+ real *devPtr, real *hostPtr, real *pageLckPtr)
+{
+ //the launch configuration was garbled in the source; restored here assuming
+ //the same shared-memory sizing as the ker_softmax call above (cols reals per block).
+ ker_dx_softmax <<< BLOCKS, BLOCK_SIZE, cols * sizeof(real) >>>
+ (features, target, rows, cols, num_classes, weights, lambda, devPtr);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ /*
+ reduce_vector <<<1, BLOCKS_POW_2, (BLOCKS_POW_2/WARP_SIZE) * sizeof (real) >>>
+ (devPtr, gradient, num_classes, cols, 1., BLOCKS );
+ */
+
+ //int maxcomps = num_classes * cols + (num_classes * cols) % THREADS_PER_ROW ;
+ int maxcomps = num_classes * cols ;
+ reduce_vector_mt <<< THREADS_PER_ROW, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>>
+ (devPtr, gradient, maxcomps, 1., BLOCKS );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ if (lambda != 0) {
+ pageLckPtr[0] = lambda ;
+ cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &pageLckPtr[0], weights, 1, gradient, 1) );
+ }
+}
+
+// build the hessian here.
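+//
+// In matrix form, the Hessian-vector product assembled below is
+//     A = X V                                                  (ker_hx_Xv)
+//     B(i,c) = exp(x_i . w_c) / (1 + sum_k exp(x_i . w_k))     (ker_hx_ProbabilityTerms)
+//     C(i,c) = B(i,c) * ( A(i,c) - sum_k B(i,k) A(i,k) )       (ker_hx_C)
+//     Hv = X^T C + lambda * V                                  (cublasDgemm + ker_add_regularizer)
+// hostHessianVector further down in this file is the CPU analogue of the same recipe.
+//
+// Since B depends only on the current weights, not on V, a CG solve at a fixed
+// Newton iterate can compute B once (computeB = 1) and reuse it for all later
+// matvecs (computeB = 0). A sketch of that calling pattern, with illustrative
+// variable names (p, max_cg_iters) that are not from this codebase:
+//
+//   for (int cg_iter = 0; cg_iter < max_cg_iters; cg_iter ++) {
+//       softmax_multiclass_hx( features, rows, cols, num_classes,
+//                              weights, p /* current CG direction */,
+//                              lambda, devPtr, hostPtr, pageLckPtr,
+//                              Hv, B, (cg_iter == 0) ? 1 : 0 );
+//       // ... standard CG updates of p and the residual using Hv ...
+//   }
+//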
+void softmax_multiclass_hx (real *features, int rows, int cols, int num_classes,
+ real *weights, real *vector, real lambda,
+ real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B, int computeB)
+{
+ /*
+ real *A = devPtr;
+ real *B = A + rows * num_classes;
+ real *C = B + rows * num_classes;
+ */
+ real *A = devPtr;
+ real *C = A + rows * num_classes;
+
+ real *alpha = pageLckPtr;
+ real *beta = alpha + 1;
+
+ //compute A here.
+ ker_hx_Xv <<< BLOCKS, BLOCK_SIZE, cols * sizeof(real) >>>
+ (features, vector, rows, cols, num_classes, A);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //Compute B Here.
+ if (computeB >= 1) {
+ //the launch configuration was garbled in the source; restored here to match
+ //the ker_hx_Xv call above (both kernels stage cols weights in shared memory).
+ ker_hx_ProbabilityTerms <<< BLOCKS, BLOCK_SIZE, cols * sizeof(real) >>>
+ (features, weights, rows, cols, num_classes, B);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ }
+
+
+ //Compute C Here.
+ ker_hx_C <<< BLOCKS, BLOCK_SIZE >>>
+ (A, B, C, rows, cols, num_classes);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //Compute the final Matvec Here.
+ *alpha = 1.0;
+ *beta = 0;
+ cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
+ cols, num_classes, rows,
+ alpha, features, rows,
+ C, rows, beta, Hv, cols ) );
+
+/*
+
+ *alpha = 1./(real)(num_classes * rows);
+ cublasCheckError (cublasDscal( cublasHandle, num_classes * cols, alpha, Hv, 1) );
+*/
+
+
+ if (lambda != 0) {
+ int rblocks = ((num_classes * cols) / BLOCK_SIZE) +
+ (((num_classes * cols) % BLOCK_SIZE == 0) ? 0 : 1 );
+
+ //ker_add_regularizer <<< BLOCKS, BLOCK_SIZE >>>
+ //(Hv, vector, lambda, num_classes * cols, 1./ (real)rows );
+ ker_add_regularizer <<< rblocks, BLOCK_SIZE >>>
+ (Hv, vector, lambda, num_classes * cols, 1. );
+ //(Hv, vector, lambda, num_classes * cols, 1./ ((real)rows * num_classes) );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ }
+}
+
+
+///////////////////////
+//OPTIMIZED CODE HERE
+///////////////////////
+int generateNonUniformSample( real *probs, real *scaleTerms, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr)
+{
+ int count = 0;
+ real *devIndices = devPtr + rows;
+
+ getRandomVector( rows, NULL, devPtr);
+
+ ker_compute_probs <<< BLOCKS, BLOCK_SIZE >>>
+ ( probs, rows, sampleSize, devPtr, devIndices );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ copy_host_device( hostPtr, devIndices, sizeof(real) * rows,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST);
+
+ for (int i = 0; i < rows; i ++){
+ if (hostPtr[i] != 0)
+ selIndices[ count ++] = i;
+ }
+
+ //prepare scaleTerms here.
+ cuda_memset( scaleTerms, 0, sizeof(real) * rows, 0x99 );
+ cuda_memset( devIndices, 0, sizeof(real) * rows, 0x99 );
+ copy_host_device( selIndices, devIndices, sizeof(int) * count,
+ cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+
+ int blocks = count / BLOCK_SIZE +
+ ((count % BLOCK_SIZE) == 0 ?
0 : 1 ); + ker_init_scaleTerms <<< blocks, BLOCK_SIZE >>> + ( scaleTerms, count, probs, (int *)devIndices ); + cudaThreadSynchronize (); + cudaCheckError (); + + return count; +} + +void computeRowProbabilities( SparseDataset *spfeatures, real *features, int rows, int cols, int numclasses, + real *dHXW, real *rowNrms, real *probs, real *devPtr ) +{ + ker_compute_dHXW_nrm <<< BLOCKS, BLOCK_SIZE >>> + ( dHXW, rowNrms, rows, numclasses); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduce to compute the sum + reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE * sizeof (real) >>> + (dHXW, devPtr, rows ); + cudaThreadSynchronize (); + cudaCheckError (); + + reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof (real) >>> + (devPtr, devPtr + BLOCK_SIZE, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + ker_normalize <<< BLOCKS, BLOCK_SIZE >>> + (dHXW, rows, devPtr + BLOCK_SIZE, probs ); + cudaThreadSynchronize (); + cudaCheckError (); +} + +void computeRowNorms( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr ) +{ + if (features != NULL) { + ker_row_norms <<< BLOCKS, BLOCK_SIZE >>> + ( features, rows, cols, rowNrms ); + cudaThreadSynchronize (); + cudaCheckError (); + } else { + cudaMemcpy( spfeatures->valPtr, spfeatures->sortedVals, + sizeof(real) * spfeatures->nnz, cudaMemcpyDeviceToDevice ); + + int blocks = spfeatures->nnz / (BLOCK_SIZE) + + ((spfeatures->nnz % (BLOCK_SIZE)) == 0 ? 0 : 1 ); + ker_sqr_elements <<< blocks, BLOCK_SIZE >>> + (spfeatures->valPtr, spfeatures->nnz, 1, devPtr); + cudaThreadSynchronize (); + cudaCheckError (); + + //matvec here. for row sums + real alpha = 1.0; + real beta = 0; + + //init the vector here. + blocks = cols / BLOCK_SIZE + (( cols % BLOCK_SIZE == 0) ? 0 : 1 ); + ker_init_ones <<< blocks, BLOCK_SIZE >>> + ( devPtr , cols ); + cudaThreadSynchronize (); + cudaCheckError (); + + cudaMemset( rowNrms, 0, sizeof(real) * rows ); + cusparseCheckError( + cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->valPtr, spfeatures->rowCsrPtr, + spfeatures->colPtr, devPtr, &beta, rowNrms) + ); + ker_sqrt_elements <<< BLOCKS, BLOCK_SIZE >>> + ( rowNrms, rows); + cudaThreadSynchronize (); + cudaCheckError (); + } +} + + +void computeHXW (SparseDataset *spfeatures, real *features, int rows, int cols, int num_classes, + real *weights, real *XW, int subSampling ) { + + + real alpha; + real beta; + + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + rows, num_classes, cols, + &alpha, features, rows, + weights, cols, &beta, XW, rows) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, weights, cols, &beta, XW, rows ) + ); + } + + + if (subSampling >= 1){ + int blocks = rows / BLOCK_SIZE + (((rows % BLOCK_SIZE) == 0) ? 
0 : 1 ); + ker_compute_HXW <<< blocks, BLOCK_SIZE >>> + ( XW, rows, cols, num_classes, 1); + } else { + ker_compute_HXW <<< BLOCKS, BLOCK_SIZE >>> + ( XW, rows, cols, num_classes, 1); + } + cudaThreadSynchronize (); + cudaCheckError (); + +/* + ker_hx_ProbabilityTerms <<>> + (features, weights, rows, cols, num_classes, XW); + cudaThreadSynchronize (); + cudaCheckError (); +*/ +} + + +void computeExpSum( real *XW, int rows, int cols, int num_classes, real *expSumVec ) +{ + ker_compute_expsum <<< BLOCKS, BLOCK_SIZE >>> + ( XW, rows, cols, num_classes, expSumVec, 1 ); + cudaThreadSynchronize (); + cudaCheckError (); +} + +void softmax_multiclass_gx_subsampled (SparseDataset *spfeatures, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *gradient, real *devPtr, real *hostPtr, real *pageLckPtr, + SparseDataset *spGradientSample, real *gradientDataset, SparseDataset *spSampledGradientTrain, + real *gradientLabels, int sampleSize, int samplingType) +{ + real *HXW = devPtr; + real *hxwInd = HXW + rows * num_classes; + + int blocks; + real alpha; + real beta; + + //computeHXW Here. + computeHXW( spSampledGradientTrain, gradientDataset, sampleSize, cols, num_classes, weights, HXW, 1 ); + + blocks = sampleSize / BLOCK_SIZE + ((( sampleSize % BLOCK_SIZE ) == 0) ? 0 : 1 ); + ker_dx_softmax_ind <<< blocks, BLOCK_SIZE >>> + //(HXW, target, sampleSize, num_classes, hxwInd, 1); + (HXW, gradientLabels, sampleSize, num_classes, hxwInd, 1); + cudaThreadSynchronize (); + cudaCheckError (); + + //compute the gradient here. + alpha = 1.0; + beta = 0; + + //perform the X^T * HXWIND + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, sampleSize, + &alpha, gradientDataset, sampleSize, + hxwInd, sampleSize, &beta, gradient, cols ) ); + } else { + cusparseCheckError( + cusparseDcsrmm ( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + sampleSize, num_classes, cols, + spSampledGradientTrain->nnz, &alpha, + spSampledGradientTrain->descr, spSampledGradientTrain->sortedVals, spSampledGradientTrain->rowCsrPtr, + spSampledGradientTrain->colPtr, hxwInd, sampleSize, &beta, gradient, cols ) ); + } + + + //non-uniform subsampling part here. + if (samplingType == 2) { + alpha = ((real)rows)/((real)sampleSize); + cublasCheckError( cublasDscal( cublasHandle, num_classes * cols, &alpha, gradient, 1) ); + } else if (samplingType == 1){ + alpha = ((real) rows) / ((real) sampleSize ); + cublasCheckError( cublasDscal( cublasHandle, num_classes * cols, &alpha, gradient, 1 )); + } + + //regularizer here. + cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &lambda, weights, 1, gradient, 1) ); +} + + +void softmax_multiclass_gx_optimized (SparseDataset *spfeatures, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *HXW, real *gradient, + real *devPtr, real *hostPtr, real *pageLckPtr) +{ + /* + ker_dx_softmax_mt <<>> + (features, target, rows, cols, num_classes, weights, lambda, + HXW, expSumVec, devPtr); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduce across all blocks here. + int maxcomps = num_classes * cols ; + reduce_vector_mt <<< THREADS_PER_ROW, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>> + (devPtr, gradient, maxcomps, 1., BLOCKS ); + cudaThreadSynchronize (); + cudaCheckError (); + + //regularizer here. 
+ if (lambda != 0) { + pageLckPtr[0] = lambda ; + cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &pageLckPtr[0], weights, 1, gradient, 1) ); + } + */ + + cuda_memset( gradient, 0, sizeof(real) * num_classes * cols, ERROR_MEM_SET ); + + real alpha; + real beta; + real *hxwInd = devPtr; + + real dxnrm; + real gxnrm; + + ker_dx_softmax_ind <<< BLOCKS , BLOCK_SIZE >>> + (HXW, target, rows, num_classes, hxwInd, 1); + + cudaDeviceSynchronize (); + cudaThreadSynchronize (); + cudaCheckError (); + + //cublasCheckError( cublasDnrm2( cublasHandle, rows, hxwInd + rows, 1, &dxnrm)); + //fprintf( stderr, "Norm of the Hxwind matrix is : %f \n", dxnrm * dxnrm ); + //printVector( hxwInd, 100, NULL ); + //printVector( target, 1000, NULL ); + //printVector( target, rows, NULL ); + + //compute the gradient here. + alpha = 1.0; + beta = 0; + + //perform the X^T * HXWIND + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, rows, + &alpha, features, rows, + hxwInd, rows, &beta, gradient, cols ) ); + } else { + //fprintf( stderr, "Spfeatures nnz: %d \n", spfeatures->nnz ); + /* + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->valPtr, spfeatures->rowCsrPtr, + spfeatures->colPtr, HXW, rows, &beta, gradient, cols) ); + */ + + cusparseCheckError( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + rows, num_classes , cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, hxwInd, rows, &beta, gradient, cols ) ); + + //sparse matvec here. + /* + cusparseCheckError( + cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + cols, rows, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->cscValPtr, spfeatures->cscColPtr, + spfeatures->cscRowPtr, hxwInd, &beta, gradient ) + ); + cudaDeviceSynchronize (); + */ + + //writeVector( hxwInd, rows, "first_column.txt", 0 ); + + + //printVector( gradient, 20, NULL ); + //cublasCheckError( cublasDnrm2( cublasHandle, num_classes * cols, gradient, 1, &gxnrm)); + //fprintf ( stderr, "Gx norm: %f \n", gxnrm ); + + } + + //regularizer here. 
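+ // gradient = X^T (P - Y) + lambda * W, where P = HXW holds the class
+ // probabilities and ker_dx_softmax_ind above subtracted the one-hot labels Y
+ // row-wise; the daxpy below adds the lambda * W regularizer term.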
+ cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &lambda, weights, 1, gradient, 1) ); +} + +void softmax_multiclass_hx_subsampled(SparseDataset *spfeatures, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *HXW, + SparseDataset *sampledfeatures, real *sampledDataset, + SparseDataset *spSampledHessianTrainSet, int sampleSize, real *scaleTerms, int samplingType) +{ + real *A = devPtr; + real *B = A + sampleSize * num_classes; + real *C = B + sampleSize * num_classes; + + real alpha,beta; + + //compute A = XV + alpha = 1; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + sampleSize, num_classes, cols, + &alpha, sampledDataset, sampleSize, + vector, cols, &beta, A, sampleSize) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + sampleSize, num_classes, cols, spSampledHessianTrainSet->nnz, + &alpha, spSampledHessianTrainSet->descr, spSampledHessianTrainSet->sortedVals, + spSampledHessianTrainSet->rowCsrPtr, spSampledHessianTrainSet->colPtr, + //vector, cols, &beta, A, rows ) + vector, cols, &beta, A, sampleSize) + ); + + //FIXED-subsampling issue + } + + //compute B here. for sub sample part of the feautre matrix here. + computeHXW( spSampledHessianTrainSet, sampledDataset, sampleSize, cols, num_classes, weights, B, 1 ); + + //Compute C Here. + //ker_hx_C <<< BLOCKS, BLOCK_SIZE >>> + int blocks = sampleSize / BLOCK_SIZE + (((sampleSize % BLOCK_SIZE) == 0) ? 0 : 1); + if (samplingType == 2) { + ker_hx_C_scale <<< blocks, BLOCK_SIZE >>> + (A, B, C, sampleSize, cols, num_classes, scaleTerms); + } else { + ker_hx_C <<< blocks, BLOCK_SIZE >>> + (A, B, C, sampleSize, cols, num_classes); + } + + cudaThreadSynchronize (); + cudaCheckError (); + + //Compute the final Matvec Here. + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, sampleSize, + &alpha, sampledDataset, sampleSize, + C, sampleSize, &beta, Hv, cols ) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + sampleSize, num_classes, cols, spSampledHessianTrainSet->nnz, + &alpha, spSampledHessianTrainSet->descr, spSampledHessianTrainSet->sortedVals, + spSampledHessianTrainSet->rowCsrPtr, spSampledHessianTrainSet->colPtr, + //C, rows, &beta, Hv, cols) + C, sampleSize, &beta, Hv, cols) + ); + + //FIXED subsampling issue + } + + if (samplingType == 1) { + //scale everything here. + alpha = ( ((real) rows) / ((real) sampleSize)); + cublasCheckError( cublasDscal( cublasHandle, num_classes * cols, &alpha, Hv, 1) ); + } + + if (lambda != 0) { + int rblocks = ((num_classes * cols) / BLOCK_SIZE) + + (((num_classes * cols) % BLOCK_SIZE == 0) ? 0 : 1 ); + + ker_add_regularizer <<< rblocks, BLOCK_SIZE >>> + (Hv, vector, lambda, num_classes * cols, 1. 
); + cudaThreadSynchronize (); + cudaCheckError (); + } + +} + +void softmax_multiclass_hx_optimized (SparseDataset *spfeatures, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B ) +{ + real *A = devPtr; + real *C = A + rows * num_classes; + + real alpha,beta; + + //compute A = XV + alpha = 1; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + rows, num_classes, cols, + &alpha, features, rows, + vector, cols, &beta, A, rows) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, vector, cols, &beta, A, rows ) + ); + } + + //Compute C Here. + ker_hx_C <<< BLOCKS, BLOCK_SIZE >>> + (A, B, C, rows, cols, num_classes); + cudaThreadSynchronize (); + cudaCheckError (); + + //Compute the final Matvec Here. + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, rows, + &alpha, features, rows, + C, rows, &beta, Hv, cols ) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, C, rows, &beta, Hv, cols) + ); + } + + if (lambda != 0) { + int rblocks = ((num_classes * cols) / BLOCK_SIZE) + + (((num_classes * cols) % BLOCK_SIZE == 0) ? 0 : 1 ); + + ker_add_regularizer <<< rblocks, BLOCK_SIZE >>> + (Hv, vector, lambda, num_classes * cols, 1. ); + cudaThreadSynchronize (); + cudaCheckError (); + } + +} + +//////////////////// +//DONE HERE +//////////////////// + + + +GLOBAL void ker_softmax_predict( real *test_set, real *weights, + int rows, int cols, int numclasses, real *workspace) +{ + extern __shared__ real sh_vec[]; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + real dot = 0; + real sumexp; + real sumprob; + + //probability terms here. + sumexp = 0; + for (int i = 0; i < numclasses; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + if ( idx < rows ){ + dot = 0; + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * test_set[ j * rows + idx ]; + sumexp += exp( dot ); + } + __syncthreads (); + } + + for (int c = 0; c < numclasses; c ++) { + if (threadIdx.x < cols) sh_vec[ threadIdx.x ] = weights[ c * cols + threadIdx.x ]; + __syncthreads (); + + if (idx < rows){ + dot = 0.; + for (int i = 0; i < cols; i ++) dot += test_set[i * rows + idx] * sh_vec[i]; + workspace[ idx * numclasses + c ] = exp(dot) / (1 + sumexp); + } + __syncthreads (); + } +} + +real softmax_predict(SparseDataset *spTest, real *test_set, real *test_labels, real *weights, int rows, int cols, int numclasses, + real *hostWorkspace, real *devWorkspace, int computeDevice, real *h_test_set) +{ + int pblocks = (rows / BLOCK_SIZE) + + ((rows % BLOCK_SIZE) == 0 ? 
0 : 1 );
+ real pmax = 0;
+ real matches = 0;
+ real nomatches = 0;
+ int pclass = -1;
+ real sumprob;
+ real dot, sumexp, maxdot;
+
+ real *h_weights = hostWorkspace;
+ real *temp = h_weights + numclasses * cols;
+
+// fprintf( stderr, "ROWS -----> %d, COLS --------> %d, CLASSES ------> %d \n", rows, cols, numclasses );
+
+ if (computeDevice == 1) {
+ /*
+ ker_softmax_predict <<< pblocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(real) >>>
+ ( test_set, weights, rows, cols, numclasses, devWorkspace);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ */
+ computeHXW( spTest, test_set, rows, cols, numclasses, weights, devWorkspace, 0 );
+
+ copy_host_device( temp, devWorkspace, sizeof(real) * numclasses * rows,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST );
+ } else {
+
+ copy_host_device( h_weights, weights, sizeof(real) * numclasses * cols,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST );
+
+ for (int i = 0; i < rows; i ++) {
+ sumexp = 0;
+ for (int c = 0; c < numclasses; c ++) {
+ dot = 0;
+ for (int j = 0; j < cols; j ++) dot += h_test_set[ j * rows + i ] * h_weights[ c * cols + j ];
+ sumexp += exp ( dot );
+ }
+ sumexp += 1.;
+
+ for (int c = 0; c < numclasses; c ++) {
+ dot = 0;
+ for (int j = 0; j < cols; j ++) dot += h_test_set[ j * rows + i ] * h_weights[ c * cols + j ];
+ //store column major, matching the device path and the classify loop below.
+ temp[ c * rows + i ] = exp( dot ) / sumexp;
+ }
+ }
+ }
+
+#ifdef __debug__
+// fprintf(stderr, " ---------- Class Probabilities ---------\n");
+#endif
+
+
+ // classify here:
+ // whichever class probability is maximum wins.
+
+/*
+ int counters[numclasses+1], true_counters[numclasses + 1];
+ memset( counters, 0, sizeof(int) * (numclasses + 1) );
+ memset( true_counters, 0, sizeof(int) * (numclasses + 1) );
+*/
+
+
+ for (int i = 0; i < rows; i ++){
+
+ pmax = 0;
+ pclass = -1;
+ sumprob = 0;
+ for (int c = 0; c < numclasses; c ++){
+
+ sumprob += temp[ c * rows + i ];
+ if (pmax < temp[ c * rows + i ]){
+ pmax = temp[c * rows + i];
+ pclass = c + 1;
+ }
+ }
+
+ /*
+ if (pclass < 0) {
+ fprintf( stderr, " Error in predicting classes ..... \n");
+ exit(-1);
+ }
+ */
+
+/*
+ true_counters[ (int)(test_labels[i]-1) ] ++;
+ if (pmax <= (1.- sumprob))
+ counters[numclasses] ++;
+ else
+ counters[ pclass - 1 ] ++;
+*/
+
+
+ /*
+ if ( ((pmax <= (1. - sumprob)) && (test_labels[i] == (numclasses + 1))) ||
+ (pclass == (int)(test_labels[i])) ){
+ matches ++;
+ }
+ */
+ if ((pmax <= (1. - sumprob)) && (test_labels[i] == (numclasses + 1))){
+ matches ++;
+ } else if ((pmax > (1.
- sumprob)) && (pclass == (int)(test_labels[i])) ) { + matches ++; + } else + nomatches ++; + + //for (int c = 0; c < numclasses; c ++) fprintf( stderr, " %e ", temp[ c * rows + i] ); + //fprintf( stderr, "\n"); + } + + +/* + + for (int i = 0; i < numclasses + 1; i ++) + fprintf( stderr, " Class: %d ---> Predicted: %d, TrueCount: %d \n", i + 1, counters[i], true_counters[i] ); + + fprintf( stderr, "Total matches -----> %f, %d, %f \n", matches, rows, nomatches ); +*/ + + //return ((real)matches/(real)rows) * 100.; + return (matches/(matches + nomatches)) * 100.; +} + +void computeErrors ( real *features, real *target, int rows, int cols, int numclasses, + real *devPtr, real *hostPtr, real *pageLckPtr, int numpoints) +{ + int offset = numclasses * cols % 4; + int count; + + real *constPoint = hostPtr; + real *hostPoint = constPoint + numclasses * cols + offset; + real *dx = hostPoint + numclasses * cols + offset; + real *ferror = dx + numclasses * cols + offset; + real *herror = ferror + numpoints; + real *dxs = herror + numpoints; + real *nextHostPtr = dxs + numpoints; + + real *devPoint = devPtr; + real *devDx = devPoint + numclasses * cols + offset; + real *gradient = devDx + numclasses * cols + offset; + real *Hv= gradient + numclasses * cols + offset; + real *devConstPoint = Hv + numclasses * cols + offset; + real *B = devConstPoint + numclasses * cols+ offset; + + //real *nextDevPtr = devConstPoint + numclasses * cols + offset; + real *nextDevPtr = B+ numclasses * rows+ offset; + + real *vv = pageLckPtr; + real *vhv = vv + 1; + real *dxnrm = vhv + 1; + real *nextPagePtr = dxnrm + 1; + + real f; + real f0; + real lambda = 0.; + + real alpha, beta; + + fprintf( stderr, "Number of random numbers to be generated: %d \n", numclasses * cols ); + + memset( constPoint, 0, sizeof(real) * numclasses * cols ); + for (int i = 0; i < numclasses * cols; i ++) constPoint[i] = 0.; + + copy_host_device( constPoint, devPoint, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + copy_host_device( constPoint, devConstPoint, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + getRandomVector( numclasses * cols, dx, nextDevPtr); + //for (int i = 0; i < numclasses * cols; i ++) dx[i] = 1.; + //count = readVector( dx, numclasses * cols, "dx_forest.txt"); + //fprintf( stderr, "Read the random vector from file as: %d \n", count ); + + //printHostVector( dx, numclasses * cols ); + + //f0 + f0 = softmax_multiclass_fx (NULL, features, target, rows, cols, numclasses, + devPoint, lambda, nextDevPtr, nextHostPtr, nextPagePtr); + + //g0 + softmax_multiclass_gx (features, target, rows, cols, + numclasses, devPoint, lambda, gradient, + nextDevPtr, nextHostPtr, nextPagePtr); + fprintf( stderr, "Gradient of the Softmax function is .... \n"); + //printVector( gradient, numclasses * cols, NULL ); + +/* + softmax_multiclass_hx (features, rows, cols, numclasses, + devConstPoint, devConstPoint, 0, nextDevPtr, nextHostPtr, nextPagePtr, Hv ); + printVector( Hv, numclasses * cols, NULL ); +*/ + + fprintf( stderr, "Starting the derivative test .. 
%f\n", f0); + + for (int i = 0; i < numpoints; i ++) { + + for (int j = 0; j < numclasses*cols; j ++) hostPoint[j] = constPoint[j] + dx[j]; + + copy_host_device( hostPoint, devPoint, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_DEVICE_HOST); + copy_host_device( dx, devDx, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + //function evaluation here. + f = softmax_multiclass_fx (NULL, features, target, rows, cols, numclasses, + devPoint, lambda, nextDevPtr, nextHostPtr, nextPagePtr); + + //first order error + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, gradient, 1, devDx, 1, vv) ); + ferror[i] = f - (f0 + *vv); + + //second order error + softmax_multiclass_hx (features, rows, cols, numclasses, + devConstPoint, devDx, 0, nextDevPtr, nextHostPtr, nextPagePtr, Hv, B, 1 ); + *vhv= 0; + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, devDx, 1, Hv, 1, vhv) ); + + //herror[i] = f - (f0 + *vv + (0.5 * (*vhv)) / (real)rows ); + herror[i] = f - (f0 + *vv + 0.5 * (*vhv) ); + + fprintf( stderr, "%d: f --> %e, vv --> %e, vhv--> %e, ferr: %e, herr: %e \n", + i, f, *vv, *vhv, ferror[i], herror[i] ); + +//exit(-1); + //dxs here. + *dxnrm = 0; + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, devDx, 1, dxnrm)); + dxs[i] = *dxnrm; + + for (int j = 0; j < numclasses*cols; j ++) dx[j] = dx[j] / 2.0; + //break; + } + + writeVector( ferror, numpoints, "./ferror.txt", 1 ); //host + writeVector( herror, numpoints, "./herror.txt", 1 ); //host + + //write dx.^2 here + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 2.); + writeVector( constPoint, numpoints, "./dxs_2.txt", 1 ); //host + + //write dx.^3 here + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 3.); + writeVector( constPoint, numpoints, "./dxs_3.txt", 1 ); //host +} + + +//////////////////////// +////HOST Computations Here. +//////////////////////// +real hostFunctionExact( real *features, real *target, real *weights, int numclasses, int rows, int cols) +{ + real logpart = 0; + real classpart = 0; + real dot, sumexp; + + real maxdot = 0; + + for (int i = 0; i < rows; i ++) { + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + + if (maxdot < dot) maxdot = dot; + + sumexp += exp( dot ); + } + logpart += log( 1 + sumexp ); + + + int myclass = (int)(target[ i ] - 1.); + + dot = 0; + if (myclass < numclasses) + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ myclass * cols + j ]; + + classpart += dot; + } + + return (logpart - classpart) / ((real) rows); +} + + + +real hostFunction( real *features, real *target, real *weights, int numclasses, int rows, int cols) +{ + real logpart = 0; + real classpart = 0; + real dot, alphax, maxdot, sumexp; + + for (int i = 0; i < rows; i ++) { + + maxdot = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + + if (dot > maxdot ) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + + sumexp += exp( dot - maxdot ); + } + alphax = exp( -1. 
* (maxdot) ) + sumexp; + logpart += (maxdot + log( alphax )); + + + int myclass = (int)(target[ i ] - 1.); + + dot = 0; + if (myclass < numclasses) + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ myclass * cols + j ]; + + classpart += dot; + } + + //return (logpart - classpart) / ((real) rows); + return (logpart - classpart); +} + +void hostGradientExact( real *features, real *target, int numclasses, int rows, int cols, real *weights, real *gradient) +{ + int myclass = 0; + real dot = 0, sumexp = 0; + real pi; + + memset( gradient, 0, sizeof(real) * numclasses * cols ); + + for (int i = 0; i < rows; i ++) { + myclass = (int)(target[ i ] - 1.); + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot ); + } + + + for (int c = 0; c < numclasses; c ++){ + pi = 0; + for (int j = 0; j < cols; j ++) + pi += features[ j * rows + i ] * weights[ c * cols + j ]; + + pi = exp(pi) / (1 + sumexp); + + for (int j = 0; j < cols; j ++){ + gradient[ c * cols + j ] += (pi - ((myclass == c) ? 1. : 0.)) * features[ j * rows + i ]; + } + } + } + for (int i = 0; i < numclasses * cols; i ++) gradient[i] = gradient[i] / ((real) rows); +} + +void hostGradient( real *features, real *target, int numclasses, int rows, int cols, real *weights, real *gradient) +{ + int myclass = 0; + real dot = 0, maxdot = 0, sumexp = 0, alphax = 0; + real pi; + + memset( gradient, 0, sizeof(real) * numclasses * cols ); + + for (int i = 0; i < rows; i ++) { + sumexp = maxdot = alphax = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + if (dot > maxdot) maxdot = dot; + } + + myclass = (int)(target[ i ] - 1.); + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot - maxdot); + } + alphax = exp( -1. * (maxdot ) ) + sumexp; + + + for (int c = 0; c < numclasses; c ++){ + pi = 0; + for (int j = 0; j < cols; j ++) + pi += features[ j * rows + i ] * weights[ c * cols + j ]; + + pi = exp(pi - maxdot) / alphax; + + for (int j = 0; j < cols; j ++){ + gradient[ c * cols + j ] += (pi - ((myclass == c) ? 1. : 0.)) * features[ j * rows + i ]; + } + } + } + + //for (int i = 0; i < numclasses * cols; i ++) gradient[i] = gradient[i] / ((real) rows); +} + +void computeScale( real *features, real *target, real *weights, int numclasses, int rows, int cols, real *scale, int a, int b) +{ + real sumexp, pa, pb, dot; + real maxdot; + + if (a == b) { + for (int i = 0; i < rows; i ++) { + maxdot = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + if (maxdot < dot) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot - maxdot ); + } + sumexp += exp( -1. * maxdot ); + + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ a * cols + j ]; + + scale [ i ] = (exp(dot - maxdot) / sumexp) * (1. 
- (exp(dot - maxdot)/sumexp)); + } + } + else { + for (int i = 0; i < rows; i ++) { + + maxdot = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + if (maxdot < dot) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot - maxdot); + } + sumexp += exp(-1 * maxdot ); + + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ a * cols + j ]; + + pa = exp( dot - maxdot) / sumexp; + + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ b * cols + j ]; + pb = exp( dot - maxdot) / sumexp; + + scale[ i ] = -1. * pa * pb; + } + } +} + +void computescalex (real *features, real *target, int numclasses, int rows, int cols, real *scale, real *temp ) +{ + for (int i = 0; i < rows; i ++) + for (int j = 0; j < cols; j ++) + temp[ j * rows + i ] = scale[ i ] * features[ j * rows + i ]; + +} +void computextscale (real *features, real *target, int numclasses, int rows, int cols, real *temp, real *block ) +{ + memset( block, 0, sizeof(real) * cols * cols ); + + for (int i = 0; i < cols; i ++){ + for (int j = 0; j < cols; j ++) { + for (int k = 0; k < rows; k ++) { + block[ i * cols + j ] += + features[i * rows + k] * temp[j * rows + k]; + } + //block[ i * cols + j ] /= (real) rows; + } + } + + + //column major * times * column major format here. +/* + for (int i = 0; i < cols; i++) { + for (int j = 0; j < cols; j ++){ + for (int k = 0; k < rows; k ++){ + block[ i * cols + j ] += + features[ i * rows + k ] * temp[ j * rows + k ]; + } + } + } + + for (int i = 0; i < cols; i++) { + for (int j = 0; j < cols; j ++){ + block[ i * cols + j ] = block[ i * cols + j ] / (real) rows; + } + } +*/ +} + + +void hostHessian( real *features, real *target, int numclasses, int rows, int cols, real *weights, real *hessian, real *s ) +{ + real *scale = s; + real *temp = scale + rows; + real *block = temp + rows * cols; + real *offset; + + memset( hessian, 0, sizeof(real) * numclasses * numclasses * cols * cols ); + + for (int i = 0; i < numclasses; i ++){ + for (int j = 0; j < numclasses; j ++){ + computeScale ( features, target, weights, numclasses, rows, cols, scale, i, j ); + //for ( int k = 0; k < rows; k ++) scale[k] = 1.; + + computescalex( features, target, numclasses, rows, cols, scale, temp ); + computextscale( features, target, numclasses, rows, cols, temp, block ); + + offset = hessian + i * (numclasses * cols) * cols + j * cols; + for (int k = 0; k < cols; k ++) + memcpy( offset + k * numclasses * cols, block + k * cols, sizeof(real) * cols ); + } + } +} + +void hostHessianVector( real *features, real *target, real *weights, int numclasses, int rows, int cols, + real *vector, real *result, real *temp) { + real *A = temp; + real *B = A + rows * numclasses; + real *C = B + rows * numclasses; + + real dot, sumexp, maxdot; + real pw, sum; + + memset( result, 0, sizeof(real) * numclasses * cols ); + + //compute A. - stored in column major order + for (int i = 0; i < rows; i ++){ + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++){ + dot += features[ j * rows + i ] * vector[ c * cols + j]; + } + //A[ i * numclasses + c ] = dot; + A[ c * rows + i ] = dot; + } + } + + //compute B here. 
- stored in column major order + for (int i = 0; i < rows; i ++){ + maxdot = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[j * rows + i ] * weights[ c * cols + j ]; + + if (maxdot < dot) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[j * rows + i ] * weights[ c * cols + j ]; + + sumexp += exp( dot - maxdot ); + } + sumexp += exp( -1 * maxdot ); + + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[j * rows + i ] * weights[ c * cols + j ]; + + pw = exp( dot - maxdot ) / sumexp; + //B[ i * numclasses + c ] = pw; + B[ c * rows + i ] = pw; + } + } + + //compute C here. - stored in column major order + for (int i = 0; i < rows; i ++){ + sum = 0; + for (int k = 0; k < numclasses; k ++) + //sum += A[ i * numclasses + k ] * B [ i * numclasses + k ]; + sum += A[ k * rows + i ] * B [ k * rows + i ]; + + for (int j = 0; j < numclasses; j ++){ + /* + C[ i * numclasses + j ] = + A[ i * numclasses + j ] * B [ i * numclasses + j ] - + B[ i * numclasses + j ] * sum; + */ + C[ j * rows + i ] = + A[ j * rows + i ] * B [ j * rows + i ] - + B[ j * rows + i ] * sum; + + } + } + + //compute Hessian * vector here. +/* + for (int i = 0; i < cols; i ++){ + for (int j = 0; j < numclasses; j ++){ + for (int k = 0; k < rows; k ++) { + result[ j * cols + i ] += + features[ i * rows + k ] * C[k * numclasses + j ]; + } + } + } +*/ + + + + //Compute XT * C = stored in column major format + for (int i = 0; i < cols; i ++){ + for (int j = 0; j < numclasses; j ++){ + for (int k = 0; k < rows; k ++){ + result[ j * cols + i ] += features[ i * rows + k ] * C[ j * rows + k ]; + } + } + } +} + +void hostDerivativeTest ( real *features, real *target, int rows, int cols, int numclasses, + real *hostPtr, real *devPtr, int numpoints) +{ + int offset = (numclasses * cols) % 4; + + real *constPoint = hostPtr; + real *hostPoint = constPoint + numclasses * cols + offset; + real *dx = hostPoint + numclasses * cols + offset; + real *ferror = dx + numclasses * cols + offset; + real *dxs = ferror + numpoints; + real *gradient = dxs + numpoints; + real *hessian = gradient + numclasses * cols; + real *herror = hessian + numclasses * numclasses * cols * cols ; + real *Hv = herror + numpoints; + real *hexplicit = Hv + numclasses * cols; + real *nextHostPtr = hexplicit + numpoints; + + real f; + real f0; + real vv = 0; + real vhve, vhv, sum; + real dxnrm = 0; + +/* + for (int i = 0; i < cols; i ++) + for (int j = 0; j < rows; j ++) + features[i * rows + j ] = i+1; + + printHostVector( features, 10 ); + printHostVector( features + rows, 10 ); + printHostVector( features + 2*rows, 10 ); +*/ + + + fprintf( stderr, "Number of random numbers to be generated: %d, %d, %d \n", (numclasses) * cols, rows, cols ); + + memset( constPoint, 0, sizeof(real) * (numclasses) * cols ); + for (int i = 0; i < (numclasses) * cols; i ++ ) constPoint[i] = 1.0; + //getRandomVector((numclasses) * cols, dx, devPtr); + //for (int i = 0; i < (numclasses) * cols; i ++ ) dx[i] = 1.0; + int count = readVector( dx, numclasses * cols, "dx_forest.txt", 0); + fprintf( stderr, "Total Points read from file: %d \n", count ); + + f0 = hostFunction(features, target, constPoint, numclasses, rows, cols ); + hostGradient(features, target, numclasses, rows, cols, constPoint, gradient); + //hostHessian( features, target, numclasses, rows, cols, constPoint, hessian, nextHostPtr); + +/* + 
fprintf( stderr, "Hessian Matrix.... \n"); + for (int i = 0; i < numclasses * cols; i ++){ + for (int j = 0; j < numclasses * cols; j ++) + fprintf (stderr, " %e ", hessian[ i * numclasses * cols + j ] ); + fprintf (stderr, "\n"); + } + + fprintf( stderr, "Explicit Hessian vecotr product \n"); + for (int j = 0; j < numclasses * cols; j ++) { + sum = 0; + for (int k = 0; k < numclasses * cols; k ++) + sum += hessian[ j * numclasses * cols + k ] * dx[k]; + fprintf( stderr, " %e ", sum ); + } + fprintf( stderr, "\n"); + + + + hostHessianVector( features, target, constPoint, numclasses, rows, cols, dx, Hv, nextHostPtr ); + fprintf( stderr, "Hessian vecotr product \n"); + printHostVector( Hv, numclasses * cols ); + + exit (-1); +*/ + + fprintf( stderr, " Function at 0: %f \n", f0); + //printHostVector( gradient, numclasses * cols ); + + + for (int i = 0; i < numpoints; i ++) { + for (int j = 0; j < (numclasses)*cols; j ++) hostPoint[j] = constPoint[j] + dx[j]; + + f = hostFunction(features, target, hostPoint, numclasses, rows, cols ); + + /*first order error*/ + vv = 0; + for (int j = 0; j < (numclasses) * cols; j ++) vv += gradient[j] * dx[j]; + ferror[i] = f - (f0 + vv); + + /* second order error */ + vhv = vhve = 0; + + /* + for (int j = 0; j < numclasses * cols; j ++) { + sum = 0; + for (int k = 0; k < numclasses * cols; k ++) + sum += hessian[ j * numclasses * cols + k ] * dx[k]; + + //fprintf( stderr, " %e ", sum ); + vhve += dx[j] * sum; + } + //fprintf( stderr, "\n"); + */ + + + hostHessianVector( features, target, constPoint, numclasses, rows, cols, dx, Hv, nextHostPtr ); + //printHostVector( Hv, numclasses * cols ); + + //for (int j = 0; j < numclasses * cols ; j ++) vhv += Hv[ j ] * dx[ j ] / (real) rows; + for (int j = 0; j < numclasses * cols ; j ++) vhv += Hv[ j ] * dx[ j ]; + + //hexplicit[i] = f - (f0 + vv + 0.5 * vhve); + herror[i] = f - (f0 + vv + 0.5 * vhv); + + /*dxs here. */ + dxnrm = 0; + for (int j = 0; j < (numclasses) * cols; j ++) dxnrm += dx[j] * dx[j]; + dxs[i] = sqrt( dxnrm ); + + for (int j = 0; j < (numclasses)*cols; j ++) dx[j] = dx[j] / 2.0; + + fprintf( stderr, "%d: f : %e, vv : %e, ferr: %e, dx_2: %e, vhv: %e, herr: %e, dx_3: %e\n", + i, f, vv, ferror[i], pow(dxs[i], 2.0), vhv, herror[i], pow(dxs[i], 3.) ); + //fprintf( stderr, "%d: f : %e, vv : %e, ferr: %e, dx_2: %e, vhve: %e, herr: %e, dx_3: %e\n", + // i, f, vv, ferror[i], pow(dxs[i], 2.0), vhve, hexplicit[i], pow(dxs[i], 3.) 
); + + } + + writeVector( ferror, numpoints, "./ferror.txt", 1 ); /* host */ + writeVector( herror, numpoints, "./herror.txt", 1 ); /* host */ + //writeVector( hexplicit, numpoints, "./hexplicit.txt", 1 ); /* host */ + + /* write dx.^2 here */ + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 2.); + writeVector( hostPtr, numpoints, "./dxs_2.txt", 1 ); /* host */ + + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 3.); + writeVector( hostPtr, numpoints, "./dxs_3.txt", 1 ); /* host */ +} diff --git a/code/cuda/RC-FINAL-5/softmax_multiclass.h b/code/cuda/RC-FINAL-5/softmax_multiclass.h new file mode 100644 index 0000000..a48995e --- /dev/null +++ b/code/cuda/RC-FINAL-5/softmax_multiclass.h @@ -0,0 +1,58 @@ +#ifndef __SOFTMAX_MULTICLASS_H__ +#define __SOFTMAX_MULTICLASS_H__ + +#include "cuda_types.h" +#include "dataset.h" + + +int generateNonUniformSample( real *probs, real *scaleTerm, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr); +void computeRowProbabilities( SparseDataset *spfeatures, real *features, int rows, int cols, int numclasses, + real *dHXW, real *rowNrms, real *probs, real *devPtr ); +void computeRowNorms( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr ); +void computeDiagHXW( real *XW, int rows, int num_classes, real *dXW ); + + +real softmax_multiclass_fx (SparseDataset *, real *, real *, int , int , int, real *, + real , real *, real *, real *); +void softmax_multiclass_gx (real *, real *, int , int , + int , real *, real , real *, + real *, real *, real *); +void softmax_multiclass_hx (real *, int , int , int , + real *, real *, real , + real *, real *, real *, real *, real *, int); + +void computeHXW (SparseDataset *, real *features, int rows, int cols, int num_classes, real *weights, real *XW, int subSampling ); + +void computeExpSum( real *XW, int rows, int cols, int num_classes, real *expSumVec ); + +void softmax_multiclass_gx_optimized (SparseDataset *, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *XW, real *gradient, + real *devPtr, real *hostPtr, real *pageLckPtr); + +void softmax_multiclass_gx_subsampled(SparseDataset *, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *gradient, + real *devPtr, real *hostPtr, real *pageLckPtr, + SparseDataset *, real *, SparseDataset *, real *, int, int ); + +void softmax_multiclass_hx_subsampled(SparseDataset *, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B, + SparseDataset *, real *, SparseDataset *, int, real *, int ); + +void softmax_multiclass_hx_optimized (SparseDataset *, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B ); + +real softmax_predict(SparseDataset *, real *, real *, real *, int , int , int , + real *, real *, int, real *); + +void expTest( real *results, int count, real *host); + +void computeErrors ( real *, real *, int , int , int , + real *, real *, real *, int ); + + +void hostDerivativeTest ( real *, real *, int , int , int , + real *, real *, int); + +#endif diff --git a/code/cuda/RC-FINAL-5/sparse_dataset.cu b/code/cuda/RC-FINAL-5/sparse_dataset.cu new file mode 100644 index 0000000..22d01b7 --- /dev/null +++ b/code/cuda/RC-FINAL-5/sparse_dataset.cu @@ -0,0 +1,178 @@ + 
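+// sparse_dataset.cu: cuSPARSE matrix descriptor setup and COO -> CSR
+// conversion for the train/test sets and the subsampled sampling matrices.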
+#include "cuda_types.h" +#include "cuda_utils.h" +#include "sparse_dataset.h" + +void initMatDescriptors( DeviceDataset *d ) +{ + //Train + cusparseCheckError ( cusparseCreateMatDescr(&(d->spTrain.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spTrain.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spTrain.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); + + //Test + cusparseCheckError ( cusparseCreateMatDescr(&(d->spTest.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spTest.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spTest.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); +} + +void initMatDescriptorsForSampling( DeviceDataset *d ) { + + //SubSampling - Hessian + cusparseCheckError ( cusparseCreateMatDescr(&(d->spHessianSample.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spHessianSample.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spHessianSample.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); + + //gradient + cusparseCheckError ( cusparseCreateMatDescr(&(d->spGradientSample.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spGradientSample.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spGradientSample.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); +} + +void initMatDescriptorsForSparseSampling( DeviceDataset *d ) { + + //SubSampling - Hessian + cusparseCheckError ( cusparseCreateMatDescr(&(d->spSampledHessianTrain.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spSampledHessianTrain.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spSampledHessianTrain.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); + + //gradient + cusparseCheckError ( cusparseCreateMatDescr(&(d->spSampledGradientTrain.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spSampledGradientTrain.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spSampledGradientTrain.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); +} + +void convertGradientSampleToCSR (SparseDataset *spGradientSample, int sampleSize, int cols, real *devPtr) { + + //make sure that the data is sorted here. + size_t pBufferSizeInBytes = 0; + void* pBuffer = (void *)devPtr; + + //Sampled Dataset Here. + cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, sampleSize, cols, spGradientSample->nnz, + spGradientSample->rowPtr, spGradientSample->colPtr, &pBufferSizeInBytes ) ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, spGradientSample->nnz, spGradientSample->P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, sampleSize, cols, spGradientSample->nnz, + spGradientSample->rowPtr, spGradientSample->colPtr, spGradientSample->P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, spGradientSample->nnz, spGradientSample->valPtr, + spGradientSample->sortedVals, spGradientSample->P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, spGradientSample->rowPtr, spGradientSample->nnz, sampleSize, + spGradientSample->rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + //fprintf( stderr, "Converting gradient to CSR .... \n"); +} + + +void convertHessianSampleToCSR (SparseDataset *spHessianSample, int sampleSize, int cols, real *devPtr) { + + //make sure that the data is sorted here. + size_t pBufferSizeInBytes = 0; + void* pBuffer = (void *)devPtr; + + //Sampled Dataset Here. 
+ cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, sampleSize, cols, spHessianSample->nnz, + spHessianSample->rowPtr, spHessianSample->colPtr, &pBufferSizeInBytes ) ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, spHessianSample->nnz, spHessianSample->P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, sampleSize, cols, spHessianSample->nnz, + spHessianSample->rowPtr, spHessianSample->colPtr, spHessianSample->P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, spHessianSample->nnz, spHessianSample->valPtr, + spHessianSample->sortedVals, spHessianSample->P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, spHessianSample->rowPtr, spHessianSample->nnz, sampleSize, + spHessianSample->rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + //fprintf( stderr, "Converting hessian to CSR .... \n"); +} + +void convertToCSR( DeviceDataset *d, real *devPtr ) +{ + //make sure that the data is sorted here. + size_t pBufferSizeInBytes = 0; + void* pBuffer = (void *)devPtr; + + //Train Dataset Here. + cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, d->rows, d->cols, d->spTrain.nnz, + d->spTrain.rowPtr, d->spTrain.colPtr, &pBufferSizeInBytes ) ); + fprintf( stderr, "Memory needed to sort coo data --> %d \n", pBufferSizeInBytes ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, d->spTrain.nnz, d->spTrain.P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, d->rows, d->cols, d->spTrain.nnz, + d->spTrain.rowPtr, d->spTrain.colPtr, d->spTrain.P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, d->spTrain.nnz, d->spTrain.valPtr, + d->spTrain.sortedVals, d->spTrain.P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, d->spTrain.rowPtr, d->spTrain.nnz, d->rows, + d->spTrain.rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + + //Test Dataset here. + cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, d->rows, d->cols, d->spTest.nnz, + d->spTest.rowPtr, d->spTest.colPtr, &pBufferSizeInBytes ) ); + fprintf( stderr, "Memory needed to sort coo data --> %d \n", pBufferSizeInBytes ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, d->spTest.nnz, d->spTest.P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, d->rows, d->cols, d->spTest.nnz, + d->spTest.rowPtr, d->spTest.colPtr, d->spTest.P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, d->spTest.nnz, d->spTest.valPtr, + d->spTest.sortedVals, d->spTest.P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, d->spTest.rowPtr, d->spTest.nnz, d->rows, + d->spTest.rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + +/* + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, d->spTest.rowPtr, d->spTest.nnz, d->testSize, + d->spTest.rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + //convert the csr matrix to csc matrix here. 
+ cusparseCheckError(
+ cusparseDcsr2csc( cusparseHandle, d->rows, d->cols, d->spTrain.nnz,
+ d->spTrain.valPtr, d->spTrain.rowCsrPtr, d->spTrain.colPtr,
+ d->spTrain.cscValPtr, d->spTrain.cscRowPtr, d->spTrain.cscColPtr,
+ CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO ) );
+*/
+}
diff --git a/code/cuda/RC-FINAL-5/sparse_dataset.h b/code/cuda/RC-FINAL-5/sparse_dataset.h new file mode 100644 index 0000000..49b9fa6 --- /dev/null +++ b/code/cuda/RC-FINAL-5/sparse_dataset.h @@ -0,0 +1,14 @@
+#ifndef __H_SPARSE_DATASET__
+#define __H_SPARSE_DATASET__
+
+#include "dataset.h"
+
+void convertToCSR( DeviceDataset *, real * );
+void convertHessianSampleToCSR (SparseDataset *spSampleHessian, int sampleSize, int cols, real *devPtr);
+void convertGradientSampleToCSR (SparseDataset *spSampleHessian, int sampleSize, int cols, real *devPtr);
+
+void initMatDescriptors( DeviceDataset *d );
+void initMatDescriptorsForSampling( DeviceDataset *d );
+void initMatDescriptorsForSparseSampling( DeviceDataset *d );
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/subsampling_helpers.cu b/code/cuda/RC-FINAL-5/subsampling_helpers.cu new file mode 100644 index 0000000..538e982 --- /dev/null +++ b/code/cuda/RC-FINAL-5/subsampling_helpers.cu @@ -0,0 +1,122 @@
+
+#include <stdlib.h> /* the original include target was stripped from the source; stdlib.h assumed, for rand() */
+
+#include "cuda_utils.h"
+#include "print_utils.h"
+#include "gen_random.h"
+
+GLOBAL void kerInitSampleMatrix( int *row, int *col, real *val, real *labels, real *srcLabels, int count, int offset, int maxRows )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (idx < count) {
+ row[ idx ] = idx;
+ val[ idx ] = 1.;
+
+ //reshuffle the labels here.
+ labels[ idx ] = srcLabels[ col[ idx ] ] ;
+ }
+}
+
+GLOBAL void kerInitSampleMatrixNoLabels( int *row, int *col, real *val, int count, int offset, int maxRows )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (idx < count) {
+ row[ idx ] = idx;
+ val[ idx ] = 1.;
+ }
+}
+
+void initSubSampledHessian( int offset, int rows, SparseDataset *sampledSet, real *sampledLabels, real *srcLabels, int sampledSize ){
+
+ int blocks = (sampledSize / BLOCK_SIZE) +
+ (((sampledSize % BLOCK_SIZE) == 0) ? 0 : 1) ;
+
+ if (sampledLabels == NULL && srcLabels == NULL){
+ kerInitSampleMatrixNoLabels <<< blocks, BLOCK_SIZE >>>
+ (sampledSet->rowPtr, sampledSet->colPtr, sampledSet->valPtr, sampledSize, offset, rows );
+ } else {
+ kerInitSampleMatrix <<< blocks, BLOCK_SIZE >>>
+ (sampledSet->rowPtr, sampledSet->colPtr, sampledSet->valPtr, sampledLabels, srcLabels,
+ sampledSize, offset, rows );
+ }
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+void prepareForNonUniformSampling (SparseDataset *samplingMat, int sampleSize, int *indices) {
+
+ copy_host_device( indices, samplingMat->colPtr, sizeof(int) * sampleSize,
+ cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+ initSubSampledHessian( -1, -1, samplingMat, NULL, NULL, sampleSize);
+}
+
+void prepareForSampling (SparseDataset *sampledGradient, real *sampledLabels, real *srcLabels, int rows, int sampleSize, int *hostPtr) {
+
+ int startRow = -1;
+
+ //generate random rows here for sampling.
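+ // Uniform row subsampling via a sparse selection matrix S of size
+ // sampleSize x rows: colPtr receives sampleSize random source-row indices,
+ // kerInitSampleMatrix sets row[i] = i and val[i] = 1, so S * X
+ // (sampleDataset / sampleSparseDataset below) extracts the sampled rows
+ // while the labels are gathered to match.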
+	//genRandomVector( hostPtr, sampleSize, rows );
+	genRandomVector( hostPtr, sampleSize, rows - 1 );
+
+	copy_host_device( hostPtr, sampledGradient->colPtr, sizeof(int) * sampleSize,
+		cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+
+	startRow = rand () % rows;
+	initSubSampledHessian( startRow, rows, sampledGradient, sampledLabels, srcLabels, sampleSize);
+}
+
+void sampleDataset ( SparseDataset *spSampledGradient, real *dataset,
+			int rows, int cols, int num_classes,
+			real *subSampledGradient, int sampleSize )
+{
+	real alpha = 1.0;
+	real beta = 0;
+
+	cusparseCheckError (
+		cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+			sampleSize, cols, rows, spSampledGradient->nnz,
+			&alpha, spSampledGradient->descr, spSampledGradient->sortedVals, spSampledGradient->rowCsrPtr,
+			spSampledGradient->colPtr, dataset, rows, &beta, subSampledGradient, sampleSize)
+		);
+}
+
+void sampleSparseDataset ( SparseDataset *spSampler, SparseDataset *spDataset,
+				int rows, int cols, int num_classes,
+				SparseDataset *spGradientSample, int sampleSize )
+{
+	int *nnzHostPtr = &spGradientSample->nnz;
+	int baseC = 0;
+
+	cusparseCheckError(
+		cusparseSetPointerMode( cusparseHandle, CUSPARSE_POINTER_MODE_HOST) );
+
+	cusparseCheckError (
+		cusparseXcsrgemmNnz( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
+			//sampleSize, cols, sampleSize,
+			sampleSize, cols, rows,
+			spSampler->descr, spSampler->nnz, spSampler->rowCsrPtr, spSampler->colPtr,
+			spDataset->descr, spDataset->nnz, spDataset->rowCsrPtr, spDataset->colPtr,
+			spGradientSample->descr, spGradientSample->rowCsrPtr, nnzHostPtr
+			) );
+
+	// nnzHostPtr aliases spGradientSample->nnz and is never NULL here, so the
+	// first branch always runs; the else branch is the usual cuSPARSE fallback
+	// of reading the total from the last entry of the CSR row pointer.
+	if (nnzHostPtr != NULL){
+		spGradientSample->nnz = *nnzHostPtr;
+	} else {
+		cudaMemcpy( &spGradientSample->nnz, spGradientSample->rowCsrPtr + sampleSize, sizeof(int),
+			cudaMemcpyDeviceToHost );
+		cudaMemcpy( &baseC, spGradientSample->rowCsrPtr, sizeof(int), cudaMemcpyDeviceToHost );
+
+		spGradientSample->nnz -= baseC;
+	}
+
+	cusparseCheckError (
+		cusparseDcsrgemm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
+			//sampleSize, cols, sampleSize,
+			sampleSize, cols, rows,
+			spSampler->descr, spSampler->nnz, spSampler->sortedVals, spSampler->rowCsrPtr, spSampler->colPtr,
+			spDataset->descr, spDataset->nnz, spDataset->sortedVals, spDataset->rowCsrPtr, spDataset->colPtr,
+			spGradientSample->descr, spGradientSample->sortedVals,
+			spGradientSample->rowCsrPtr, spGradientSample->colPtr ) );
+}
diff --git a/code/cuda/RC-FINAL-5/subsampling_helpers.h b/code/cuda/RC-FINAL-5/subsampling_helpers.h
new file mode 100644
index 0000000..8f67208
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/subsampling_helpers.h
@@ -0,0 +1,20 @@
+#ifndef __SUB_SAMPLING_HELPERS_H__
+#define __SUB_SAMPLING_HELPERS_H__
+
+#include "dataset.h"
+#include "cuda_types.h"
+
+void initSubSampledHessian( int offset, int rows, SparseDataset *spSampledHessian, real *sampledLabels, real *srcLabels, int sampledSize );
+void prepareForNonUniformSampling (SparseDataset *samplingMat, int sampleSize, int *indices) ;
+
+
+void prepareForSampling (SparseDataset *sampledHessian, real *sampledLabels, real *srcLabels, int rows, int sampleSize, int *hostPtr);
+void sampleDataset( SparseDataset *spSampledHessian, real *dataset,
+			int rows, int cols, int num_classes,
+			real *subSampledHessian, int sampleSize );
+
+void sampleSparseDataset ( SparseDataset *spSampler, SparseDataset *spDataset,
+				int rows, int cols, int num_classes,
+				SparseDataset *spGradientSample, int sampleSize );
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/utils.c
b/code/cuda/RC-FINAL-5/utils.c
new file mode 100644
index 0000000..0d64302
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/utils.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include "utils.h"
+
+void allocate_memory( void **ptr, size_t s )
+{
+	*ptr = malloc( s );
+	if (*ptr == NULL){
+		fprintf( stderr, "Memory Allocation failed for size: %zu\n", s );
+	}
+}
+
+void release_memory( void **ptr ){
+	free ( *ptr );
+}
+
+real Get_Time( )
+{
+	struct timeval tim;
+
+	gettimeofday(&tim, NULL );
+	return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
+}
+
+
+real Get_Timing_Info( real t_start )
+{
+	struct timeval tim;
+	real t_end;
+
+	gettimeofday(&tim, NULL );
+	t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
+	return (t_end - t_start);
+}
diff --git a/code/cuda/RC-FINAL-5/utils.h b/code/cuda/RC-FINAL-5/utils.h
new file mode 100644
index 0000000..425a783
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/utils.h
@@ -0,0 +1,13 @@
+#ifndef _H_UTILS__
+#define _H_UTILS__
+
+#include <stdlib.h>
+#include "cuda_types.h"	/* assumed source of the `real` typedef */
+
+void allocate_memory( void **, size_t);
+void release_memory( void ** );
+
+
+real Get_Time ();
+real Get_Timing_Info( real t_start );
+#endif
diff --git a/code/tensorflow/cifar/tf_softmax.py b/code/tensorflow/cifar/tf_softmax.py
new file mode 100644
index 0000000..22d86cd
--- /dev/null
+++ b/code/tensorflow/cifar/tf_softmax.py
@@ -0,0 +1,306 @@
+from __future__ import print_function
+
+import sys
+import math
+import tensorflow as tf
+import numpy as np
+
+import cPickle as pickle
+#import pickle
+import time
+import StringIO
+
+import scipy.sparse as sparse
+
+trainmat = 'train_mat.txt'
+trainvec = 'train_vec.txt'
+testmat = 'test_mat.txt'
+testvec = 'test_vec.txt'
+
+curpath = sys.argv[2]
+
+#load the data here.
+X_train = np.loadtxt(curpath + trainmat ,delimiter=',')
+X_train = X_train.astype(np.float64)
+Y_train = np.loadtxt(curpath + trainvec ,delimiter=',')
+Y_train = Y_train.astype(np.float64)
+
+# Test
+X_test = np.loadtxt(curpath + testmat, delimiter=',')
+X_test = X_test.astype(np.float64)
+Y_test = np.loadtxt(curpath + testvec, delimiter=',')
+Y_test = Y_test.astype(np.float64)
+
+# Convert to the usable format here.
+print ("Done loading data..... 
") +print (X_train.shape ) +print (Y_train.shape ) +print (X_test.shape ) +print (Y_test.shape ) + +print (len(X_train[0])) +print (len(X_train)) + + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) + +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. 
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes])
+
+#W= tf.Variable(tf.random_normal([n_input, n_classes-1]))
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64)
+
+## (y==c)*e^
+pre_temp= tf.multiply(T,tf.exp(Matrix_concat))
+
+## 1+sigma(e^)
+pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) )
+
+## probability of the last (reference) class ##
+pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1)
+
+## get the first n_classes-1 class probabilities
+pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1])
+
+## concat them with the last class probability
+pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1)
+pred_labels_tf = tf.argmax(pred,1);
+
+
+## Our cost function
+cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) )
+
+## Regularization Term Here.
+#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+
+prefix = ''
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1)
+    prefix += 'GPU'
+else:
+    config = tf.ConfigProto(device_count={'GPU': 0} )
+    prefix += 'CPU'
+
+
+# Parameters
+training_epochs = 100
+display_step = 1
+index = 1
+
+# Parameters
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-12
+    # this dataset is normalized from the source
+    ll = [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    #rterm = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4 ]
+    #rterm = [1e6]
+    rterm = [1]
+    prefix += '_raw_'
+
+else:
+    #lipschitz constant is 1e-3
+    ll = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4 ]
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    #rterm = [1e-6]
+    #rterm = [1e-4]
+    rterm = [1e-3]
+    prefix += '_norm_'
+
+for lmethod in [ sys.argv[1] ]:
+
+    for r in rterm:
+
+        final_cost = cost + r * regularization
+
+        for learning_rate in ll:
+
+            outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + + if True: + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + #", cost=","{:.9f}".format(avg_cost), \ + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + #pred_labels = sess.run( pred_labels_tf, feed_dict={x: X_test, y: Y_test} ) + + #c = [0, 0, 0, 0, 0, 0, 0] + #for i in range(0, len(pred_labels)): + # c[ pred_labels[i] ] += 1 + + #for i in range(0, len(c)): + # print ("Class: %d --- > %d" % (i, c[ i ]) ) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/covertype/tf_softmax.py b/code/tensorflow/covertype/tf_softmax.py new file mode 100644 index 0000000..b2d4aa6 --- /dev/null +++ b/code/tensorflow/covertype/tf_softmax.py @@ -0,0 +1,265 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_forest_multi_features.txt' +trainvec = 'train_forest_multi_labels.txt' +testmat = 'test_forest_multi_features.txt' +testvec = 'test_forest_multi_labels.txt' + +curpath = sys.argv[2] + +#load the data here. +X_train = np.loadtxt(curpath + trainmat ,delimiter=',') +X_train = X_train.astype(np.float64) +Y_train = np.loadtxt(curpath + trainvec ,delimiter=',') +Y_train = Y_train.astype(np.float64) + +# Test +X_test = np.loadtxt(curpath + testmat, delimiter=',') +X_test = X_test.astype(np.float64) +Y_test = np.loadtxt(curpath + testvec, delimiter=',') +Y_teset = Y_test.astype(np.float64) + +print ("Done loading data..... ") + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1])) + +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. 
+x = tf.placeholder("float64", [None, n_input]) +y = tf.placeholder("float64", [None, n_classes]) + +#W= tf.Variable(tf.random_normal([n_input, n_classes-1])) +W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64) + +Matrix_Mul= tf.matmul(x,W) +Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64) +Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1) +Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1) +Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) ) + +T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64) + +## (y==c)*e^ +pre_temp= tf.multiply(T,tf.exp(Matrix_concat)) + +## 1+sigma(e^) +pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) ) + +## last-th class prob## +pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1) + +## get the first 6 classes prob +pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1]) + +## concat 6 classes prob with last class prob +pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1) +pred_labels_tf = tf.argmax(pred,1); + + +## Our cost function +cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) ) + +## Regularization Term Here. +#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.) +#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.) +regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) + +prefix = '' + +if sys.argv[4] == 'GPU': + config = tf.ConfigProto( intra_op_parallelism_threads=1 ); + prefix = 'GPU' + #config = tf.ConfigProto( ); +else: + config = tf.ConfigProto( device_count={'GPU': 0}) + prefix = 'CPU' + + +# Parameters +training_epochs = 100 +display_step = 1 +index = 1 + +# Parameters +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + + +if sys.argv[2].find("raw-data") != -1: + #lipschitz constant is 1e-13 + # this dataset is normalized from the source + ll = [1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2] + #rterm = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7] + #rterm = [1e-6] + rterm = [1] + prefix += '_raw_' + +else: + #lipschitz constant is 1.923 + ll = [1e-8, 1e-7,1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8 ] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + #rterm = [1e-8] + rterm = [1e-3] + + prefix += '_norm_' + + +for lmethod in [ sys.argv[1] ]: + for r in rterm: + final_cost = cost + r * regularization + for learning_rate in ll: + ''' + if sys.argv[2].find("raw-data") != -1: + outfile = open("raw_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + else: + outfile = open("norm_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + ''' + outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + outfile.write("Begin simulation 
...... \n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + sess.run(init) + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + outfile.close () diff --git a/code/tensorflow/diagnostics/tf_softmax.py b/code/tensorflow/diagnostics/tf_softmax.py new file mode 100644 index 0000000..fc5e837 --- /dev/null +++ b/code/tensorflow/diagnostics/tf_softmax.py @@ -0,0 +1,303 @@ +from __future__ import print_function + +import sys +import tensorflow as tf +import numpy as np +import math + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + +#load the data here. 
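+# (expected on disk: comma-separated dense matrices with one sample per row;
+#  the label files are assumed to hold 1-based class ids, 1..11 for this dataset)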
+X_train = np.loadtxt(curpath + trainmat ,delimiter=',')
+X_train = X_train.astype(np.float64)
+Y_train = np.loadtxt(curpath + trainvec ,delimiter=',')
+Y_train = Y_train.astype(np.float64)
+
+# Test
+X_test = np.loadtxt(curpath + testmat, delimiter=',')
+X_test = X_test.astype(np.float64)
+Y_test = np.loadtxt(curpath + testvec, delimiter=',')
+Y_test = Y_test.astype(np.float64)
+
+print ("Done loading data..... ")
+
+
+# fix random seed for reproducibility
+seed = 7
+np.random.seed(seed)
+
+# one-hot encode the labels (classes are 1..11, so class l maps to row l-1 of the identity)
+eye = np.eye(11, dtype=np.float64)
+Y_train = eye[Y_train.astype(int) - 1]
+Y_test = eye[Y_test.astype(int) - 1]
+
+
+
+## get BATCH_SIZE data points ##
+def get_batch(X,Y):
+    idx = np.random.randint(len(X), size=batch_size)
+    batch_X= X[idx,:]
+    batch_Y = Y[idx]
+    return (batch_X,batch_Y)
+
+# Network Parameters
+n_input = X_train.shape[1]
+n_classes = len( Y_train[0] )
+
+## select specific element by index for each row
+def sel_ele_2d(a,b):
+    b= tf.cast(b, tf.int32)
+    b_2 = tf.expand_dims(b, 1)
+    the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1)
+    ind = tf.concat([the_range, b_2],1)
+    res = tf.gather_nd(a, ind)
+    return res
+
+
+# Create the network, tf variables and cost function here. 
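+# sel_ele_2d (defined above) builds [row, label] index pairs and uses
+# tf.gather_nd to pull one entry per row; the cost below uses it to select
+# each sample's true-class logit from Matrix_concat.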
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes])
+
+#W= tf.Variable(tf.random_normal([n_input, n_classes-1]))
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64)
+
+## (y==c)*e^
+pre_temp= tf.multiply(T,tf.exp(Matrix_concat))
+
+## 1+sigma(e^)
+pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) )
+
+## probability of the last (reference) class ##
+pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1)
+
+## get the first n_classes-1 class probabilities
+pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1])
+
+## concat them with the last class probability
+pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1)
+pred_labels_tf = tf.argmax(pred,1);
+
+
+## Our cost function
+cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) )
+
+## Regularization Term Here.
+#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+
+prefix = ''
+
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1 );
+    prefix = 'GPU'
+    #config = tf.ConfigProto( );
+else:
+    config = tf.ConfigProto( device_count={'GPU': 0})
+    prefix = 'CPU'
+
+
+# Parameters
+training_epochs = 100
+display_step = 1
+index = 0
+
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-7
+    ll = [1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
+    #rlist = [1e-4]
+    rlist = [1]
+    prefix += '_raw_'
+else:
+    #lipschitz constant is 1e-1
+    ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
+    #rlist = [1e-7]
+    #rlist = [1e-6]
+    rlist = [1e-3]
+    prefix += '_norm_'
+
+
+for lmethod in [ sys.argv[1] ]:
+    for r in rlist:
+
+        final_cost = cost + r * regularization
+
+        for learning_rate in ll:
+
+            outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+            outfile.write("Normalization: " + sys.argv[2] + "\n")
+            outfile.write("Batch Size: "+ str( batch_size ) + "\n")
+
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + + if True: + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + #", cost=","{:.9f}".format(avg_cost), \ + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + #pred_labels = sess.run( pred_labels_tf, feed_dict={x: X_test, y: Y_test} ) + + #c = [0, 0, 0, 0, 0, 0, 0] + #for i in range(0, len(pred_labels)): + # c[ pred_labels[i] ] += 1 + + #for i in range(0, len(c)): + # print ("Class: %d --- > %d" % (i, c[ i ]) ) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/gisette/tf_logistic.py b/code/tensorflow/gisette/tf_logistic.py new file mode 100644 index 0000000..21b79b8 --- /dev/null +++ b/code/tensorflow/gisette/tf_logistic.py @@ -0,0 +1,251 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +trainmat = 'gisette_train.data' +trainvec = 'gisette_train.labels01' +testmat = 'gisette_valid.data' +testvec = 'gisette_valid.labels01' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + + +#load the data here. + +# Test +if sys.argv[2].find("raw-data") != -1: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=' ', dtype=None) + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=' ', dtype=None) + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.genfromtxt( curpath + testmat, delimiter=' ', dtype=None) + X_test = X_test.astype(np.float64) + Y_test = np.genfromtxt( curpath + testvec, delimiter=' ', dtype=None) + Y_test = Y_test.astype(np.float64) + Y_test = Y_test.reshape( len( Y_test ), 1 ); +else: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=',') + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',') + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.loadtxt( curpath + testmat, delimiter=',') + X_test = X_test.astype(np.float64) + Y_test = np.loadtxt( curpath + testvec, delimiter=',') + Y_test = Y_test.astype(np.float64) + Y_test = Y_test.reshape( len( Y_test ), 1 ); + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +print () +print () +print(X_train.shape) +print(Y_train.shape) +print(X_test.shape) +print(Y_test.shape) +print () +print () + + + +#for label in y: + +# if (label==0): +# Y_one_hot.append(np.array([1,0])) +# elif (label==1): +# Y_one_hot.append(np.array([0,1])) +#Y_one_hot = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = 2 + +# Create the network, tf variables and cost function here. +x = tf.placeholder("float64", [None, n_input]) +y = tf.placeholder("float64", [None, n_classes - 1]) + +W= tf.Variable(tf.zeros([n_input, n_classes-1],dtype=tf.float64), dtype=tf.float64) + +Matrix_Mul= tf.matmul(x,W) +pred = tf.sigmoid(tf.matmul(x, W)) # predictions +scores = tf.matmul( x, W ); + +## to prevent overfitting ### +Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64) +Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1) +Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1) +Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) ) + +# Minimize error using cross entropy + + +## changed to this to prevent overflow +cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(y, scores))) + +## Regularization Term Here. 
+regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) +prefix = '' +if sys.argv[4] == 'GPU': + config = tf.ConfigProto( intra_op_parallelism_threads=1) + prefix += 'GPU' +else: + config = tf.ConfigProto(device_count={'GPU': 0} ) + prefix += 'CPU' + + +# Parameters +training_epochs = 100 +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + +display_step = 1 +index = 1 + + + +if sys.argv[2].find("raw-data") != -1: + #lipschitz constant is 1e-12 + ll = [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6] # based on lipschitz constant + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6] + #rterm = [1e3] + rterm = [1] + prefix += '_raw_' +else: + #lipschitz constant is 1e-3 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + #rterm = [1e-2] + rterm = [1e-3] + prefix += '_norm_' + + +for lmethod in [ sys.argv[1] ]: + + for r in rterm: + + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + + outfile.write("Begin simulation ...... \n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. 
+ total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/ijcnn1/tf_logistic.py b/code/tensorflow/ijcnn1/tf_logistic.py new file mode 100644 index 0000000..b61c6ab --- /dev/null +++ b/code/tensorflow/ijcnn1/tf_logistic.py @@ -0,0 +1,244 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np +import csv + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + + +# Test +if sys.argv[2].find("raw-data") != -1: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=',',dtype=None ) + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',',dtype=None) + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.genfromtxt( curpath + testmat, delimiter=',') + X_test = X_test.astype(np.float64) + Y_test = np.genfromtxt( curpath + testvec, delimiter=',') + Y_test = Y_test.astype(np.float64) + Y_test = Y_test.reshape( len( Y_test ), 1 ); +else: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=',') + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',') + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.loadtxt( curpath + testmat, delimiter=',') + Y_test = np.loadtxt( curpath + testvec, delimiter=',') + Y_test = Y_test.reshape( len( Y_test ), 1 ); + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +print () +print () +print(X_train.shape) +print(Y_train.shape) +print(X_test.shape) +print(Y_test.shape) +print () + + + +#for label in y: + +# if (label==0): +# Y_one_hot.append(np.array([1,0])) +# elif (label==1): +# Y_one_hot.append(np.array([0,1])) +#Y_one_hot = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return 
(batch_X,batch_Y)
+
+# Network Parameters
+n_input = X_train.shape[1]
+n_classes = 2
+
+# Create the network, tf variables and cost function here.
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes - 1])
+
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+pred = tf.sigmoid(tf.matmul(x, W)) # predictions
+scores = tf.matmul( x, W );
+
+## max-shift to prevent overflow ###
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+# Minimize error using cross entropy
+
+
+## changed to this to prevent overflow
+## (per sample: log(1 + exp(score)) - y*score, computed via the max-shift Mx)
+cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(y, scores)))
+
+## Regularization Term Here.
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+#config = tf.ConfigProto( device_count={'GPU': 0})
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1 );
+    #config = tf.ConfigProto( );
+else:
+    config = tf.ConfigProto( device_count={'GPU': 0})
+
+
+# Parameters
+training_epochs = 100
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+display_step = 1
+index = 1
+
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-4
+    ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2] # based on lipschitz constant
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6]
+    #rterm = [1e-2]
+    rterm = [1e-6]
+else:
+    #lipschitz constant is 10
+    ll = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6]
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    rterm = [1e-7]
+
+
+for lmethod in [ sys.argv[1] ]:
+
+    for r in rterm:
+
+        final_cost = cost + r * regularization
+
+        for learning_rate in ll:
+
+            if sys.argv[2].find("raw-data") != -1:
+                outfile = open("raw_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            else:
+                outfile = open("norm_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+            outfile.write("Path: " + sys.argv[2] + "\n")
+            outfile.write("BatchSize: " + str(batch_size) + "\n");
+
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/mnist/tf_softmax.py b/code/tensorflow/mnist/tf_softmax.py new file mode 100644 index 0000000..559be11 --- /dev/null +++ b/code/tensorflow/mnist/tf_softmax.py @@ -0,0 +1,274 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +#load the data here. +X_train = np.loadtxt(curpath + trainmat ,delimiter=',') +X_train = X_train.astype(np.float64) +Y_train = np.loadtxt(curpath + trainvec ,delimiter=',') +Y_train = Y_train.astype(np.float64) + +Y_train += 1 + +# Test +X_test = np.loadtxt(curpath + testmat, delimiter=',') +X_test = X_test.astype(np.float64) +Y_test = np.loadtxt(curpath + testvec, delimiter=',') +Y_test = Y_test.astype(np.float64) + +Y_test += 1 + +print ("Done loading data..... ") + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. 
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes])
+
+#W= tf.Variable(tf.random_normal([n_input, n_classes-1]))
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64)
+
+## (y==c)*e^
+pre_temp= tf.multiply(T,tf.exp(Matrix_concat))
+
+## 1+sigma(e^)
+pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) )
+
+## probability of the last (reference) class ##
+pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1)
+
+## get the first n_classes-1 class probabilities
+pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1])
+
+## concat them with the last class probability
+pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1)
+pred_labels_tf = tf.argmax(pred,1);
+
+
+## Our cost function
+cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) )
+
+## Regularization Term Here.
+#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+# config = tf.ConfigProto( intra_op_parallelism_threads=1, device_count={'GPU': 1})
+
+prefix = ''
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1)
+    prefix += 'GPU'
+else:
+    config = tf.ConfigProto(device_count={'GPU': 0} )
+    prefix += 'CPU'
+
+
+# Parameters
+training_epochs = 100
+display_step = 1
+index = 1
+
+# Parameters
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-11
+    # this dataset is normalized from the source
+    ll = [1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e1, 1e2, 1e3 ]
+    #rterm = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
+    #rterm = [1e1]
+    #rterm = [1e2]
+    rterm = [1]
+    prefix += '_raw_'
+
+else:
+    #lipschitz constant is 1e-2
+    ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6 ]
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    #rterm = [1e-4]
+    rterm = [1e-3]
+    prefix += '_norm_'
+
+
+
+for lmethod in [ sys.argv[1] ]:
+    for r in rterm:
+        final_cost = cost + r * regularization
+        for learning_rate in ll:
+            outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+            outfile.write("Path: " + sys.argv[2] + "\n")
+            outfile.write("BatchSize: " + str(batch_size) + "\n");
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + sess.run(init) + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + outfile.close () diff --git a/code/tensorflow/newsgroups/tf_softmax.py b/code/tensorflow/newsgroups/tf_softmax.py new file mode 100644 index 0000000..24cdf15 --- /dev/null +++ b/code/tensorflow/newsgroups/tf_softmax.py @@ -0,0 +1,433 @@ +from __future__ import print_function + +import sys +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +import scipy.sparse as sparse +import math + +def getShape( entries ): + + mrow = 0 + mcol = 0 + + for item in entries: + if mrow < item[0]: + mrow = item[0] + if mcol < item[1]: + mcol = item[1] + return mrow, mcol + +def getDenseMatrix(entries, rows, cols): + rowIdx = np.empty([len(entries)], dtype=int) + colIdx = np.empty([len(entries)], dtype=int) + val = np.empty([len(entries)], dtype=np.float64) + + for idx,item in enumerate(entries): + rowIdx[ idx ] = item[0] - 1 + colIdx[ idx ] = item[1] - 1 + val[ idx ] = item[2] + + return sparse.csr_matrix( (val, (rowIdx, colIdx)), shape=(rows, cols) ).toarray () + + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + + +#load the data here. +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + + +X_train = np.loadtxt(curpath + trainmat ,delimiter=',') +X_train = X_train.astype(np.float64) +Y_train = np.loadtxt(curpath + trainvec ,delimiter=',') +Y_train = Y_train.astype(np.float64) + +# Test +X_test = np.loadtxt(curpath + testmat, delimiter=',') +X_test = X_test.astype(np.float64) +Y_test = np.loadtxt(curpath + testvec, delimiter=',') +Y_test = Y_test.astype(np.float64) + +# Convert to the usable format here. + +x_row, x_col = getShape( X_train ) +y_row, y_col = getShape( X_test ) + +shapey = 0 + +if x_col < y_col: + shapey = y_col +else: + shapey = x_col + +print ("Done loading data..... 
") +print (X_train.shape) +print (Y_train.shape) +print (X_test.shape) +print (Y_test.shape) +print (Y_test) +print (Y_train) + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0])) + if (l == 11): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0])) + elif (l ==12): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0])) + elif (l ==13): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0])) + elif (l ==14): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0])) + elif (l ==15): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0])) + elif (l ==16): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0])) + elif (l ==17): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0])) + elif (l ==18): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0])) + elif (l ==19): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0])) + elif (l ==20): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0])) + if (l == 11): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0])) + elif (l ==12): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0])) + elif (l ==13): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0])) + elif (l ==14): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0])) + elif (l ==15): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0])) + elif (l ==16): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0])) + elif (l ==17): + 
Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0])) + elif (l ==18): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0])) + elif (l ==19): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0])) + elif (l ==20): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1])) +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) +## shuffle data points ## +def shuffle(X,Y): + idx = np.random.randint(len(X), size=len(X)) + X = X[idx,:] + Y = Y[idx] + return (X,Y) + +X_train = getDenseMatrix( X_train, x_row, shapey ) +X_test = getDenseMatrix( X_test, y_row, shapey ) + +X_train,Y_train=shuffle(X_train,Y_train) + +#import pdb; pdb.set_trace(); +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. +#x = tf.placeholder("float", [None, n_input]) +#y = tf.placeholder("float", [None, n_classes]) + +x=tf.sparse_placeholder(tf.float64) +y=tf.sparse_placeholder(tf.float64) +W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64) + +#Matrix_Mul= tf.matmul(x,W) +Matrix_Mul= tf.sparse_tensor_dense_matmul(x,W) +Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64) +Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1) +Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1) +Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) ) + +T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64) + +## (y==c)*e^ +pre_temp= tf.multiply(T,tf.exp(Matrix_concat)) + +## 1+sigma(e^) +pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) ) + +## last-th class prob## +pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1) + +## get the first 6 classes prob +pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1]) + +## concat 6 classes prob with last class prob +pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1) +pred_labels_tf = tf.argmax(pred,1); + + +## Our cost function +cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(tf.sparse_tensor_to_dense(y),1) ),1) ) + +## Regularization Term Here. +#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.) +#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.) 
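+# Note on the objective: with per-class scores s_c = x.w_c and the last
+# class pinned to s_K = 0 (the appended Zeros column), each summand of
+# `cost` is logsumexp_c(s_c) - s_{true class}, i.e. multinomial logistic
+# loss, with Mx as the max trick that keeps the exponentials from
+# overflowing. A sketch of an assumed-equivalent built-in form, shown
+# only for clarity and not used by the runs below:
+#   logits = tf.concat([Matrix_Mul, Zeros], 1)
+#   cost_equiv = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
+#       logits=logits, labels=tf.sparse_tensor_to_dense(y)))
+# tf.nn.l2_loss(W) returns sum(W**2)/2, so the swept objective
+# final_cost = cost + r * regularization is cost + (r/2)*||W||^2.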
+regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) + +prefix = '' +if sys.argv[4] == 'GPU': + config = tf.ConfigProto( intra_op_parallelism_threads=1) + prefix += 'GPU' +else: + config = tf.ConfigProto(device_count={'GPU': 0} ) + prefix += 'CPU' + + +# Parameters +training_epochs = 100 +display_step = 1 +index = 0 + +# Parameters +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + + +with tf.Session() as sess: + ### creating sparse_tensor for test x, test y ### + + x_t= tf.placeholder("float64", [None, n_input]) + #x_t = tf.constant(X_test) + idx_x = tf.where(tf.not_equal(x_t, 0)) + sparse_Test_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t, idx_x), tf.cast(tf.shape(x_t),tf.int64)) + + #sparse_X_test=sess.run([sparse_Test_x],feed_dict={x_t:X_test}) + #import pdb;pdb.set_trace(); + y_t= tf.placeholder("float64", [None, n_classes]) + + #y_t = tf.constant(Y_test) + idx_y = tf.where(tf.not_equal(y_t, 0)) + sparse_Test_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t, idx_y),tf.cast(tf.shape(y_t),tf.int64)) + ### creating batch list ### + x_t2 = tf.placeholder("float64", [None, n_input]) #tf.constant(X_train) + idx_x = tf.where(tf.not_equal(x_t2, 0)) + sparse_Train_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t2, idx_x), tf.cast(tf.shape(x_t2),tf.int64) ) + sparse_Train_x_list=tf.sparse_split(sp_input=sparse_Train_x,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + y_t2 = tf.placeholder("float64", [None, n_classes]) #tf.constant(Y_train) + idx_y = tf.where(tf.not_equal(y_t2, 0)) + sparse_Train_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t2, idx_y), tf.cast(tf.shape(y_t2),tf.int64)) + sparse_Train_y_list=tf.sparse_split(sp_input=sparse_Train_y,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + + X_test,Y_test,X_train,Y_train,batch_x_list,batch_y_list = sess.run([sparse_Test_x,sparse_Test_y,sparse_Train_x,sparse_Train_y,sparse_Train_x_list,sparse_Train_y_list ],feed_dict={x_t: X_test, y_t: Y_test,x_t2: X_train,y_t2: Y_train}) + + +if sys.argv[2].find("raw-data") != -1: + #raw + ll = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2 ] + #rlist = [1e-3] + rlist = [1] + prefix += '_raw_' +else: + #normalized + ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2 ] + #rlist = [1e-1] + rlist = [1e-3] + prefix += '_norm_' + #print ("This is NOT Working at the moment..... ") + #exit () + +for lmethod in [ sys.argv[1] ]: + for r in rlist: + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(prefix + lmethod +"_" + str(index) + "_readings.txt", "w", 0) + index += 1 + + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Normalization: " + sys.argv[2] +"\n") + outfile.write("Batch Size: "+ str(batch_size) +"\n") + + outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + + if True: + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(tf.sparse_tensor_to_dense(y),1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. 
+ total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(len(batch_x_list)): + #batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x_list[i],y: batch_y_list[i]}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(tf.sparse_tensor_to_dense(y),1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + #", cost=","{:.9f}".format(avg_cost), \ + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + #pred_labels = sess.run( pred_labels_tf, feed_dict={x: X_test, y: Y_test} ) + + #c = [0, 0, 0, 0, 0, 0, 0] + #for i in range(0, len(pred_labels)): + # c[ pred_labels[i] ] += 1 + + #for i in range(0, len(c)): + # print ("Class: %d --- > %d" % (i, c[ i ]) ) + + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/rcv1/tf_logistic.py b/code/tensorflow/rcv1/tf_logistic.py new file mode 100644 index 0000000..f710191 --- /dev/null +++ b/code/tensorflow/rcv1/tf_logistic.py @@ -0,0 +1,316 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import scipy.sparse as sparse + +import cPickle as pickle +import time +import StringIO + +def getShape( entries ): + mrow = 0 + mcol = 0 + for item in entries: + if mrow < item[0]: + mrow = item[0] + if mcol < item[1]: + mcol = item[1] + return mrow, mcol + +def getDenseMatrix(entries, rows, cols): + rowIdx = np.empty([len(entries)], dtype=int) + colIdx = np.empty([len(entries)], dtype=int) + val = np.empty([len(entries)], dtype=float) + + print( rows ); + print( cols ); + + for idx,item in enumerate(entries): + rowIdx[ idx ] = item[0] - 1 + colIdx[ idx ] = item[1] - 1 + val[ idx ] = item[2] + return sparse.csr_matrix( (val, (rowIdx, colIdx)), shape=(rows, cols) ).toarray () + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + +#load the data here. +X_train = np.genfromtxt( curpath + trainmat ,delimiter=',', dtype=None) +Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',', dtype=None) +Y_train = Y_train.astype(np.float64) +Y_train = Y_train.reshape( len(Y_train), 1); + +# Test +X_test = np.genfromtxt( curpath + testmat, delimiter=',', dtype=None) +Y_test = np.genfromtxt( curpath + testvec, delimiter=',', dtype=None) +Y_test = Y_test.astype(np.float64) +Y_test = Y_test.reshape( len( Y_test ), 1 ); + +# Convert to the usable format here. 
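+# The *_mat.txt files hold COO triplets "row,col,value" with 1-based
+# indices: getShape scans for the largest row/col, and getDenseMatrix
+# shifts to 0-based before building a CSR matrix and densifying it.
+# E.g. a line "3,7,0.5" lands at dense position [2,6].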
+
+x_row, x_col = getShape( X_train )
+y_row, y_col = getShape( X_test )
+
+shapey = 0
+if x_col < y_col:
+ shapey = y_col
+else:
+ shapey = x_col
+
+X_train = getDenseMatrix( X_train, x_row, shapey )
+X_test = getDenseMatrix( X_test, y_row, shapey )
+
+# fix random seed for reproducibility
+seed = 7
+np.random.seed(seed)
+
+print ()
+print ()
+print(X_train.shape)
+print(Y_train.shape)
+print(X_test.shape)
+print(Y_test.shape)
+print ()
+print ()
+
+## get BATCH_SIZE data points ##
+def get_batch(X,Y):
+ idx = np.random.randint(len(X), size=batch_size)
+ batch_X = X[idx,:]
+ batch_Y = Y[idx]
+ return (batch_X,batch_Y)
+
+## shuffle data points ##
+def shuffle(X,Y):
+ idx = np.random.randint(len(X), size=len(X))
+ X = X[idx,:]
+ Y = Y[idx]
+ return (X,Y)
+
+X_train,Y_train = shuffle(X_train,Y_train)
+
+# Network Parameters
+n_input = X_train.shape[1]
+n_classes = 2
+
+# Create the network, tf variables and cost function here.
+#x = tf.placeholder("float", [None, n_input])
+#y = tf.placeholder("float", [None, n_classes - 1])
+
+x = tf.sparse_placeholder(tf.float32)
+y = tf.sparse_placeholder(tf.float32)
+
+W = tf.Variable(tf.zeros([n_input, n_classes-1]))
+
+# single score column s = x.w; reuse it for the sigmoid predictions
+Matrix_Mul = tf.sparse_tensor_dense_matmul(x,W)
+scores = Matrix_Mul
+pred = tf.sigmoid(scores) # predictions
+
+#Matrix_Mul = tf.matmul(x,W)
+#pred = tf.sigmoid(tf.matmul(x, W)) # predictions
+#scores = tf.matmul( x, W )
+
+## log-sum-exp max trick to prevent overflow ###
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ], tf.float32)
+Matrix_concat = tf.concat([Matrix_Mul, Zeros], 1)
+Mx = tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]), 1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx), 1), 1) )
+
+# Minimize error using cross entropy,
+# written in this form to prevent overflow
+cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(tf.sparse_tensor_to_dense(y), scores)))
+
+#cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(tf.sparse_tensor_to_dense(y),1) ),1) )
+## Regularization Term Here.
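+# tf.nn.l2_loss(W) returns sum(W**2)/2, so the swept objective below,
+# final_cost = cost + r * regularization, is cost + (r/2)*||W||^2.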
+regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) +config = tf.ConfigProto( device_count={'GPU': 0}) + + +# Parameters +training_epochs = 100 +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + +display_step = 1 +index = 1 + + +with tf.Session() as sess: + + x_t= tf.placeholder("float", [None, n_input]) + #x_t = tf.constant(X_test) + idx_x = tf.where(tf.not_equal(x_t, 0)) + sparse_Test_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t, idx_x), tf.cast(tf.shape(x_t),tf.int64)) + + #sparse_X_test=sess.run([sparse_Test_x],feed_dict={x_t:X_test}) + #import pdb;pdb.set_trace(); + y_t= tf.placeholder("float", [None, n_classes-1]) + + #y_t = tf.constant(Y_test) + idx_y = tf.where(tf.not_equal(y_t, 0)) + sparse_Test_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t, idx_y),tf.cast(tf.shape(y_t),tf.int64)) + ### creating batch list ### + x_t2 = tf.placeholder("float", [None, n_input]) #tf.constant(X_train) + idx_x = tf.where(tf.not_equal(x_t2, 0)) + sparse_Train_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t2, idx_x), tf.cast(tf.shape(x_t2),tf.int64) ) + sparse_Train_x_list=tf.sparse_split(sp_input=sparse_Train_x,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + y_t2 = tf.placeholder("float", [None, n_classes-1]) #tf.constant(Y_train) + idx_y = tf.where(tf.not_equal(y_t2, 0)) + sparse_Train_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t2, idx_y), tf.cast(tf.shape(y_t2),tf.int64)) + sparse_Train_y_list=tf.sparse_split(sp_input=sparse_Train_y,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + + X_test,Y_test,X_train,Y_train,batch_x_list,batch_y_list = sess.run([sparse_Test_x,sparse_Test_y,sparse_Train_x,sparse_Train_y,sparse_Train_x_list,sparse_Train_y_list ],feed_dict={x_t: X_test, y_t: Y_test,x_t2: X_train,y_t2: Y_train}) + +#import pdb; pdb.set_trace(); + + + + +if sys.argv[2].find("raw-data") != -1: + #lipschitz constant is 1e-12 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + rterm = [1e-1] +else: + #lipschitz constant is 1e-3 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + rterm = [1e-2] + + +for lmethod in [ sys.argv[1] ]: + + for r in rterm: + + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + + outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + #total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(len(batch_x_list )): + #batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x_list[i],y: batch_y_list[i]}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/real-sim/tf_logistic.py b/code/tensorflow/real-sim/tf_logistic.py new file mode 100644 index 0000000..8720f61 --- /dev/null +++ b/code/tensorflow/real-sim/tf_logistic.py @@ -0,0 +1,332 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import scipy.sparse as sparse + +import cPickle as pickle +import time +import StringIO + +def getShape( entries ): + mrow = 0 + mcol = 0 + for item in entries: + if mrow < item[0]: + mrow = item[0] + if mcol < item[1]: + mcol = item[1] + return mrow, mcol + +def getDenseMatrix(entries, rows, cols): + rowIdx = np.empty([len(entries)], dtype=int) + colIdx = np.empty([len(entries)], dtype=int) + val = np.empty([len(entries)], dtype=np.float64) + + print( rows ); + print( cols ); + + for idx,item in enumerate(entries): + rowIdx[ idx ] = item[0] - 1 + colIdx[ idx ] = item[1] - 1 + val[ idx ] = item[2] + return sparse.csr_matrix( (val, (rowIdx, colIdx)), shape=(rows, cols) ).toarray () + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + +#load the data here. +X_train = np.genfromtxt( curpath + trainmat ,delimiter=',', dtype=None) +Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',', dtype=None) +Y_train = Y_train.astype(np.float64) +Y_train = Y_train.reshape( len(Y_train), 1); + +# Test +X_test = np.genfromtxt( curpath + testmat, delimiter=',', dtype=None) +Y_test = np.genfromtxt( curpath + testvec, delimiter=',', dtype=None) +Y_test = Y_test.astype(np.float64) +Y_test = Y_test.reshape( len( Y_test ), 1 ); + +# Convert to the usable format here. + +x_row, x_col = getShape( X_train ) +y_row, y_col = getShape( X_test ) + +shapey = 0 + +if x_col < y_col: + shapey = y_col +else: + shapey = x_col + + +X_train = getDenseMatrix( X_train, x_row, shapey ) +X_test = getDenseMatrix( X_test, y_row, shapey ) + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +print () +print () +print(X_train.shape) +print(Y_train.shape) +print(X_test.shape) +print(Y_test.shape) +print () +print () + + + +#for label in y: + +# if (label==0): +# Y_one_hot.append(np.array([1,0])) +# elif (label==1): +# Y_one_hot.append(np.array([0,1])) +#Y_one_hot = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +## shuffle data points ## +def shuffle(X,Y): + idx = np.random.randint(len(X), size=len(X)) + X = X[idx,:] + Y = Y[idx] + return (X,Y) + +X_train,Y_train=shuffle(X_train,Y_train) + + + +# Network Parameters +n_input = X_train.shape[1] +n_classes = 2 + +# Create the network, tf variables and cost function here. 
+#x = tf.placeholder("float", [None, n_input])
+#y = tf.placeholder("float", [None, n_classes - 1])
+
+x = tf.sparse_placeholder(tf.float64)
+y = tf.sparse_placeholder(tf.float64)
+
+W = tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+# single score column s = x.w; reuse it for the sigmoid predictions
+Matrix_Mul = tf.sparse_tensor_dense_matmul(x,W)
+scores = Matrix_Mul
+pred = tf.sigmoid(scores) # predictions
+
+#Matrix_Mul = tf.matmul(x,W)
+#pred = tf.sigmoid(tf.matmul(x, W)) # predictions
+#scores = tf.matmul( x, W )
+
+## log-sum-exp max trick to prevent overflow ###
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ], tf.float64)
+Matrix_concat = tf.concat([Matrix_Mul, Zeros], 1)
+Mx = tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]), 1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx), 1), 1) )
+
+# Minimize error using cross entropy,
+# written in this form to prevent overflow
+cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(tf.sparse_tensor_to_dense(y), scores)))
+
+#cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(tf.sparse_tensor_to_dense(y),1) ),1) )
+## Regularization Term Here.
+regularization = tf.nn.l2_loss(W)
+
+# Tensorflow built-in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+prefix = ''
+if sys.argv[4] == 'GPU':
+ config = tf.ConfigProto( intra_op_parallelism_threads=1)
+ config.gpu_options.allow_growth = True
+ prefix += 'GPU'
+else:
+ config = tf.ConfigProto(device_count={'GPU': 0} )
+ prefix += 'CPU'
+
+# Parameters
+training_epochs = 100
+if sys.argv[3] == 'fixed' :
+ batch_size = 128
+else:
+ batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+display_step = 1
+index = 1
+
+with tf.Session(config=config) as sess:
+ ### creating sparse tensors for the test set ###
+ x_t = tf.placeholder("float64", [None, n_input])
+ idx_x = tf.where(tf.not_equal(x_t, 0))
+ sparse_Test_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t, idx_x), tf.cast(tf.shape(x_t),tf.int64))
+
+ y_t = tf.placeholder("float64", [None, n_classes-1])
+ idx_y = tf.where(tf.not_equal(y_t, 0))
+ sparse_Test_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t, idx_y), tf.cast(tf.shape(y_t),tf.int64))
+
+ ### creating batch list ###
+ x_t2 = tf.placeholder("float64", [None, n_input])
+ idx_x = tf.where(tf.not_equal(x_t2, 0))
+ sparse_Train_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t2, idx_x), tf.cast(tf.shape(x_t2),tf.int64))
+ sparse_Train_x_list = tf.sparse_split(sp_input=sparse_Train_x, axis=0, num_split=int (np.floor( len(X_train)/batch_size )))
+ y_t2 = tf.placeholder("float64", [None, n_classes-1])
+ idx_y = tf.where(tf.not_equal(y_t2, 0))
+ sparse_Train_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t2, idx_y), tf.cast(tf.shape(y_t2),tf.int64))
+ sparse_Train_y_list = tf.sparse_split(sp_input=sparse_Train_y, axis=0, num_split=int (np.floor( len(X_train)/batch_size )))
+
+ X_test,Y_test,X_train,Y_train,batch_x_list,batch_y_list = sess.run([sparse_Test_x,sparse_Test_y,sparse_Train_x,sparse_Train_y,sparse_Train_x_list,sparse_Train_y_list], feed_dict={x_t: X_test, y_t: Y_test, x_t2: X_train, y_t2: Y_train})
+
+if sys.argv[2].find("raw-data") != -1:
+ #lipschitz constant is 1e-12
+ # this dataset is normalized at the source, so there is
+ # no need to run the raw-data set of runs.
+ #ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] +# print 'This is NOT DEFINED FOR THIS DATASET... ' + exit () +else: + #lipschitz constant is 1e-3 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + #rterm = [1e-2] + rterm = [1e-3] + prefix += '_norm_' + + +for lmethod in [ sys.argv[1] ]: + + for r in rterm: + + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + + outfile.write("Begin simulation ...... \n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. 
+ #total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(len(batch_x_list )): + #batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x_list[i],y: batch_y_list[i]}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close ()
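+
+# Each reading line written above has the format
+# "epoch:time:train_acc:test_acc:train_cost:test_cost".
+# A minimal parsing sketch (hypothetical helper, not part of the runs):
+#   def read_readings(path):
+#       rows = []
+#       for line in open(path):
+#           parts = line.strip().split(':')
+#           if len(parts) == 6:
+#               rows.append([float(p) for p in parts])
+#       return rows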