diff --git a/code/cuda/RC-FINAL-5/Makefile b/code/cuda/RC-FINAL-5/Makefile
new file mode 100644
index 0000000..814bc14
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/Makefile
@@ -0,0 +1,73 @@
+CC = g++
+NVCC = nvcc
+LIBS = -L/usr/local/cuda-8.0/lib64 -lm -lz -lcuda -lcudart -lcublas -lcusparse -lcurand -lpthread -m64
+NVCCFLAGS = -I. -I/usr/local/cuda-8.0/include -arch=sm_60
+CFLAGS = -I. -I/usr/local/cuda-8.0/include -Wall -funroll-loops -fstrict-aliasing -O3
+DEBUGFLAGS = -D__debug__ -D__STATISTICS__
+
+DEFS = $(CFLAGS) $(DEBUGFLAGS)
+NVCCDEFS = $(NVCCFLAGS) $(DEBUGFLAGS)
+FLAG = $(DEFS) $(INCS) $(LIBS)
+NVCCFLAG = $(NVCCDEFS) $(LIBS)
+
+OBJ = utils.o dataset.o cuda_utils.o logistic_fn_indicator.o \
+	mat_functions.o cuda_environment.o linesearch.o \
+	conjugate_gradient.o newton_cg.o newton-driver.o print_utils.o \
+	softmax_multiclass.o gen_random.o sparse_dataset.o \
+	subsampling_helpers.o classification_kernels.o
+
+all: beta
+beta: $(OBJ) Makefile
+	$(NVCC) $(OBJ) -o NewtonCGSolver $(NVCCFLAG)
+
+utils.o: utils.c utils.h
+	$(CC) $(DEFS) -c utils.c
+
+dataset.o: dataset.c dataset.h
+	$(CC) $(DEFS) -c dataset.c
+
+cuda_utils.o: cuda_utils.c cuda_utils.h
+	$(CC) $(DEFS) -c cuda_utils.c
+
+logistic_fn_indicator.o: logistic_fn_indicator.cu logistic_fn_indicator.h
+	$(NVCC) $(NVCCDEFS) -c logistic_fn_indicator.cu
+
+mat_functions.o: mat_functions.cu mat_functions.h
+	$(NVCC) $(NVCCDEFS) -c mat_functions.cu
+
+cuda_environment.o: cuda_environment.c cuda_environment.h
+	$(CC) $(DEFS) -c cuda_environment.c
+
+linesearch.o: linesearch.c linesearch.h
+	$(CC) $(DEFS) -c linesearch.c
+
+conjugate_gradient.o: conjugate_gradient.c conjugate_gradient.h
+	$(CC) $(DEFS) -c conjugate_gradient.c
+
+newton_cg.o: newton_cg.c newton_cg.h
+	$(CC) $(DEFS) -c newton_cg.c
+
+newton-driver.o: newton-driver.c
+	$(CC) $(DEFS) -c newton-driver.c
+
+print_utils.o: print_utils.c
+	$(CC) $(DEFS) -c print_utils.c
+
+softmax_multiclass.o: softmax_multiclass.cu softmax_multiclass.h
+	$(NVCC) $(NVCCDEFS) -c softmax_multiclass.cu
+
+sparse_dataset.o: sparse_dataset.cu sparse_dataset.h
+	$(NVCC) $(NVCCDEFS) -c sparse_dataset.cu
+
+gen_random.o: gen_random.cu gen_random.h
+	$(NVCC) $(NVCCDEFS) -c gen_random.cu
+
+subsampling_helpers.o: subsampling_helpers.cu subsampling_helpers.h
+	$(NVCC) $(NVCCDEFS) -c subsampling_helpers.cu
+
+classification_kernels.o: classification_kernels.cu classification_kernels.h
+	$(NVCC) $(NVCCDEFS) -c classification_kernels.cu
+
+clean:
+	rm -f *.o *~ core
+	rm -f NewtonCGSolver
diff --git a/code/cuda/RC-FINAL-5/classification_kernels.cu b/code/cuda/RC-FINAL-5/classification_kernels.cu
new file mode 100644
index 0000000..2828321
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/classification_kernels.cu
@@ -0,0 +1,143 @@
+
+#include "classification_kernels.h"
+
+__device__ __inline__ double my_shfl(double x, int lane)
+{
+	// Split the double number into 2 32b registers.
+	int lo, hi;
+	asm volatile( "mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(x));
+
+	// Shuffle the two 32b registers.
+	lo = __shfl_xor(lo, lane);
+	hi = __shfl_xor(hi, lane);
+
+	// Recreate the 64b number.
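+	// __shfl_xor() only moves 32-bit values, so the double is exchanged
+	// as two halves and reassembled below with __hiloint2double().
+	// (Pre-CUDA-9 intrinsic; CUDA 9+ toolkits would use __shfl_xor_sync.)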
+	//asm volatile( "mov.b64 %0, {%1,%2};" : "=d(x)" : "r"(lo), "r"(hi));
+	//return x;
+	return __hiloint2double( hi, lo);
+}
+
+// Butterfly reduction: after log2(WARP_SIZE) shuffle steps every lane
+// holds the warp-wide sum.
+__device__ __inline__ double warpSum( double x )
+{
+	for (int offset = WARP_SIZE/2; offset > 0; offset /= 2)
+		x += my_shfl( x, offset);
+	return x;
+}
+
+// Block-level sum reduction: each warp reduces via shuffles, stages its
+// partial sum in shared memory, and warp 0 combines the per-warp partials.
+GLOBAL void reduce(const real *input, real *results, const size_t count) {
+	extern __shared__ real my_results[];
+	unsigned int warpId = threadIdx.x >> 5;	// warp index within the block
+	unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+	real sdata;
+	real x = 0;
+
+	sdata = 0;
+	my_results[ warpId ] = 0;
+	if(idx < count) x = input [idx];
+	sdata = x;
+
+	sdata = warpSum ( sdata );
+	if (threadIdx.x % WARP_SIZE == 0) my_results[warpId] = sdata;
+	__syncthreads ();
+
+	if (blockDim.x/WARP_SIZE == 0)
+		sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0;
+	else
+		sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0;
+	__syncthreads ();
+
+	if (warpId == 0) sdata = warpSum( sdata );
+	if(threadIdx.x == 0) results [ blockIdx.x ] = sdata;
+}
+
+
+GLOBAL void ker_init_scaleTerms ( real *scaleTerms, int sampleSize, real *probs, int *indices )
+{
+	int myRowId = blockIdx.x * blockDim.x + threadIdx.x;
+	if (myRowId < sampleSize){
+		scaleTerms[ myRowId ] = probs[ indices[ myRowId ] ] ;
+	}
+}
+
+
+GLOBAL void ker_compute_probs( real *probs, int rows, int sampleSize, real *randVec, real *indices)
+{
+	int myRowId = blockIdx.x * blockDim.x + threadIdx.x;
+	if (myRowId < rows ){
+		probs[ myRowId ] *= sampleSize;
+		if (probs[ myRowId ] > 1.0) probs[ myRowId ] = 1.0;
+
+		if (randVec[ myRowId ] < probs[ myRowId ] )
+			indices[ myRowId ] = 1;
+		else
+			indices[ myRowId ] = 0;
+	}
+}
+
+GLOBAL void ker_compute_dHXW_nrm_log (real *dHXW, real *rowNrms, int rows)
+{
+	int myRowId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if (myRowId < rows) {
+		dHXW[ myRowId ] = fabs( dHXW[ myRowId ] * (1.
- dHXW[ myRowId ]) ) * rowNrms[ myRowId ]; + } +} + + +GLOBAL void ker_normalize (real *dHXW, int rows, real *nrmConstant, real *probs ){ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < rows){ + probs[ myRowId ] = dHXW[ myRowId ] / nrmConstant[0]; + } +} + +GLOBAL void ker_row_norms( real *features, int rows, int cols, real *nrm ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + real sum = 0; + + if (myRowId < rows) { + i = myRowId; + for (int j = 0; j < cols; j += 1) + sum += pow( features[ j * rows + i ], 2.); + + nrm[ i ] = sqrt( sum ); + } +} + +GLOBAL void ker_sqr_elements ( real *ptr, int nnz, int elems_per_thread, real *results ) +{ + int myID = blockIdx.x * blockDim.x + threadIdx.x ; + int i = 0; + + if (myID < nnz) { + i = myID; + ptr[ i ] *= ptr[ i ]; + } + +} + + +GLOBAL void ker_sqrt_elements (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = sqrt( ptr[ i ] ); + } +} + +GLOBAL void ker_init_ones (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = 1.0; + } +} diff --git a/code/cuda/RC-FINAL-5/classification_kernels.h b/code/cuda/RC-FINAL-5/classification_kernels.h new file mode 100644 index 0000000..488fb09 --- /dev/null +++ b/code/cuda/RC-FINAL-5/classification_kernels.h @@ -0,0 +1,20 @@ +#ifndef __H_CLASSIFICATION_KERNELS__ +#define __H_CLASSIFICATION_KERNELS__ + +#include "cuda_types.h" + +__device__ __inline__ double my_shfl(double x, int lane); +__device__ __inline__ double warpSum( double x ); + +GLOBAL void reduce(const real *input, real *results, const size_t count) ; +GLOBAL void ker_init_scaleTerms ( real *scaleTerms, int sampleSize, real *probs, int *indices ); +GLOBAL void ker_compute_probs( real *probs, int rows, int sampleSize, real *randVec, real *indices); +GLOBAL void ker_compute_dHXW_nrm_log (real *dHXW, real *rowNrms, int rows); +GLOBAL void ker_normalize (real *dHXW, int rows, real *nrmConstant, real *probs ); +GLOBAL void ker_row_norms( real *features, int rows, int cols, real *nrm ); +GLOBAL void ker_sqr_elements ( real *ptr, int nnz, int elems_per_thread, real *results ); +GLOBAL void ker_sqrt_elements (real *ptr, int count ); +GLOBAL void ker_init_ones (real *ptr, int count ); + + +#endif diff --git a/code/cuda/RC-FINAL-5/conjugate_gradient.c b/code/cuda/RC-FINAL-5/conjugate_gradient.c new file mode 100644 index 0000000..982cb2b --- /dev/null +++ b/code/cuda/RC-FINAL-5/conjugate_gradient.c @@ -0,0 +1,309 @@ +#include "cuda_types.h" +#include "conjugate_gradient.h" +#include "cuda_utils.h" +#include "print_utils.h" + +#include "softmax_multiclass.h" +#include "logistic_fn_indicator.h" + +#include "float.h" +#include "time.h" +#include "stdlib.h" +#include "subsampling_helpers.h" +#include "sparse_dataset.h" + +int Cublas_CG_Logistic( DeviceDataset *data, NEWTON_CG_PARAMS *params, + real *p_gradient, real *x, real *x_best, real *rel_residual, + real *devPtr, real *hostPtr, real *pgeLckPtr) +{ + real *Hg, *residual, *p, *gradient; + real *rsold, *rsnew, *alpha, *tmp; + real *nextHostPtr, *nextDevPtr; + real gradient_norm; + real best_rel_residual; + + real *B, *probs, *scaleTerms, *rowNrms; + int *selIndices; + + int i; + + Hg = devPtr; + residual = Hg + data->cols; + p = residual + data->cols; + gradient = p + data->cols; + + B = gradient + data->cols; + probs = B + data->rows; + scaleTerms = probs + 
data->rows;
+	rowNrms = scaleTerms + data->rows;
+	nextDevPtr = rowNrms + data->rows;
+
+
+	rsold = pgeLckPtr;
+	rsnew = &pgeLckPtr[1];
+	alpha = &pgeLckPtr[2];
+	tmp = &pgeLckPtr[3];
+
+	selIndices = (int *)hostPtr;
+	nextHostPtr = hostPtr + data->rows;
+
+
+	//Perform the sampling for Hessian here.
+	if (params->hx_sampling >= 1) {
+
+		data->hessianSampleSize = (HESSIAN_SAMPLING_SIZE * data->rows) / 100;
+
+		prepareForSampling( &data->spHessianSample, NULL, NULL, data->rows, data->hessianSampleSize, (int *)nextHostPtr );
+		data->spHessianSample.nnz = data->hessianSampleSize;
+
+		//sample Hessian Here.
+		if (data->spTrain.valPtr == NULL) {
+			//Dense Case
+			convertHessianSampleToCSR( &data->spHessianSample, data->hessianSampleSize, data->cols, nextDevPtr );
+			sampleDataset (&data->spHessianSample, data->trainSet, data->rows, data->cols, data->numclasses, data->sampledHessianTrainSet, data->hessianSampleSize);
+		} else {
+			//Sparse Case
+			convertHessianSampleToCSR( &data->spHessianSample, data->hessianSampleSize, data->cols, nextDevPtr );
+			sampleSparseDataset( &data->spHessianSample, &data->spTrain, data->rows, data->cols, data->numclasses,
+					&data->spSampledHessianTrain, data->hessianSampleSize );
+		}
+
+		logistic_fn_indicator_hx_matvec( data->sampledHessianTrainSet, &data->spSampledHessianTrain, data->weights, x, params->lambda, data->hessianSampleSize, data->cols, Hg, nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+	}
+	else {
+		logistic_fn_indicator_hx_matvec( data->trainSet, &data->spTrain, data->weights, x, params->lambda, data->rows, data->cols, Hg, nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+	}
+
+	//b = -gradient: CG solves the Newton system H * d = -gradient.
+	*alpha = -1;
+	cublasCheckError (cublasDcopy( cublasHandle, data->cols, p_gradient, 1, gradient, 1) );
+	cublasCheckError (cublasDscal( cublasHandle, data->cols, alpha, gradient, 1) );
+
+
+	// residual = b - H*x (Hg holds H*x for the initial iterate)
+	cublasCheckError (cublasDcopy( cublasHandle, data->cols, gradient, 1, residual, 1) );
+	*alpha = -1;
+	cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, Hg, 1, residual, 1 ) );
+
+	//p = residual;
+	cublasCheckError (cublasDcopy( cublasHandle, data->cols, residual, 1, p, 1) );
+
+	//rsold = Dot( residual, residual, N );
+	cublasCheckError (cublasDdot( cublasHandle, data->cols, residual, 1, residual, 1, rsold ) );
+
+	cublasCheckError( cublasDnrm2( cublasHandle, data->cols, gradient, 1, &gradient_norm) );
+	best_rel_residual = SQRT( *rsold ) / gradient_norm;
+	cudaMemcpy( x_best, x, data->cols * sizeof(real), cudaMemcpyDeviceToDevice );
+
+	for( i = 0; i < params->max_cg_iterations; ++i ) {
+		//hessian vec here
+		if (params->hx_sampling > 0) {
+			logistic_fn_indicator_hx_matvec( data->sampledHessianTrainSet, &data->spSampledHessianTrain,
+					data->weights, p, params->lambda, data->hessianSampleSize, data->cols, Hg,
+					nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+		} else {
+			logistic_fn_indicator_hx_matvec( data->trainSet, &data->spTrain,
+					data->weights, p, params->lambda, data->rows, data->cols, Hg,
+					nextDevPtr, nextHostPtr, params->hx_sampling, scaleTerms, data->rows );
+		}
+
+		//tmp = p' * (H * p)
+		cublasCheckError (cublasDdot( cublasHandle, data->cols, Hg, 1, p, 1, tmp ) );
+		*alpha = -1.
* ((*rsold) / (*tmp)); + + //Vector_Add( residual, -alpha, Hg, N ); //residual = residual - alpha * Hg + cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, Hg, 1, residual, 1 ) ); + + *alpha *= -1.; + //Vector_Add( x, alpha, p ); x = x + alpha * p + cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, p, 1, x, 1 ) ); + + //rsnew = Dot (residual, residual); + cublasCheckError (cublasDdot( cublasHandle, data->cols, residual, 1, residual, 1, rsnew ) ); + + *rel_residual = SQRT( *rsnew ) / gradient_norm; + + if (*rel_residual < best_rel_residual) { + best_rel_residual = *rel_residual; + cudaMemcpy( x_best, x, data->cols * sizeof(real), cudaMemcpyDeviceToDevice ); + } + if (*rel_residual <= params->cg_tolerance) break; + + //p = residual + (rsnew / rsold) * p; + *alpha = (*rsnew/(*rsold)); + cublasCheckError (cublasDscal( cublasHandle, data->cols, alpha, p, 1) ); + + *alpha = 1; + cublasCheckError (cublasDaxpy( cublasHandle, data->cols, alpha, residual, 1, p, 1 ) ); + *rsold = *rsnew; + } + + *rel_residual = best_rel_residual; + + return i; +} + +int Cublas_CG_multi_optimized(SparseDataset *spfeatures, real *features, real *g, real *weights, + real *x, real *x_best, real lambda, int rows, int cols, int numclasses, real *HXW, + real *devPtr, real *hostPtr, real *pgeLckPtr, int MAX_ITERATIONS, + real tolerance, real *rel_residual, real *best_rel_residual, + SparseDataset *spSampledHessian, real *sampledHessian, + SparseDataset *spSampledHessianTrainSet, int hessianSampleSize, int samplingType ) +{ + + //CG local's Here + real *p, *r, *h, *alpha, *pAp; + real rnorm, gradient_norm, tol2, delta, bb, prev_delta; + int iter; + + //Other Locals Here + real *Hg, *B; + real *nextDevPtr, *nextHostPtr, *nextPageLckPtr; + + int *selIndices, nonUniformSampleSize, sampleSize; + real *rowNrms, *probs, *scaleTerms; + + //Device Pointers + Hg = devPtr; + r = Hg + numclasses * cols; + p = r + numclasses * cols; + B = p + numclasses * cols; + probs = B + rows * numclasses; + scaleTerms = probs + rows; + rowNrms = scaleTerms + rows; + h = rowNrms + numclasses * cols; + nextDevPtr = h + rows; + + //PageLock Pointers + alpha = &pgeLckPtr[0]; + pAp = &pgeLckPtr[1]; + nextPageLckPtr = pAp + 1; + + //Host Only Pointers + selIndices = (int *)hostPtr; + nextHostPtr = hostPtr + rows; + + + //Initializations here. + sampleSize = hessianSampleSize; + + if (samplingType >= 1) { + + if (samplingType == 1) { + sampleSize = hessianSampleSize; + prepareForSampling( spSampledHessian, NULL, NULL, rows, sampleSize, (int *)nextHostPtr ); + } else { + computeHXW( spfeatures, features, rows, cols, numclasses, weights, B, 0 ); + computeRowNorms( spfeatures, features, rows, cols, rowNrms, nextDevPtr ); + computeRowProbabilities( spfeatures, features, rows, cols, numclasses, B, rowNrms, probs, nextDevPtr ); + nonUniformSampleSize = generateNonUniformSample( probs, scaleTerms, rows, hessianSampleSize, selIndices, nextDevPtr, nextHostPtr ); + + sampleSize = nonUniformSampleSize; + prepareForNonUniformSampling( spSampledHessian, sampleSize, selIndices ); + } + spSampledHessian->nnz = sampleSize; + + //sample Hessian Here. 
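+		// Dense features: gather the sampled rows into sampledHessian.
+		// Sparse features: extract the sampled rows into a separate CSR matrix.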
+ if (features) { + convertHessianSampleToCSR( spSampledHessian, sampleSize, cols, nextDevPtr ); + sampleDataset (spSampledHessian, features, rows, cols, numclasses, sampledHessian, sampleSize ); + } else { + convertHessianSampleToCSR( spSampledHessian, sampleSize, cols, nextDevPtr ); + sampleSparseDataset( spSampledHessian, spfeatures, rows, cols, numclasses, + spSampledHessianTrainSet, sampleSize ); + } + + softmax_multiclass_hx_subsampled(spfeatures, features, rows, cols, numclasses, + weights, x, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW, + spSampledHessian, sampledHessian, spSampledHessianTrainSet, sampleSize, scaleTerms, samplingType ); + } + else { + softmax_multiclass_hx_optimized(spfeatures, features, rows, cols, numclasses, + weights, x, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW ); + } + + + //tol2 = tol^2 + tol2 = pow( tolerance, 2. ); + + // r = g - H*g; + cublasCheckError (cublasDcopy( cublasHandle, numclasses * cols, g, 1, r, 1) ); + *alpha = -1; + cublasCheckError (cublasDaxpy( cublasHandle, numclasses * cols, alpha, Hg, 1, r, 1 ) ); + + //h = Precondition( P, r) + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, r, 1, h, 1) ); + + //delta = r' * h + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, r, 1, h, 1, &delta ) ); + + //bb = b' * Preconditioned( P, b) + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, g, 1, g, 1, &bb ) ); + + //p = r; + cublasCheckError (cublasDcopy( cublasHandle, numclasses * cols, r, 1, p, 1) ); + + //Store the best result to return + *best_rel_residual = DBL_MAX; + cudaMemcpy( x_best, x, numclasses * cols * sizeof(real), cudaMemcpyDeviceToDevice ); + + iter = 0; + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, g, 1, &gradient_norm) ); + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, r, 1, &rnorm) ); + *rel_residual = rnorm / gradient_norm; + + while ( (delta > tol2 * bb) && (iter < MAX_ITERATIONS) && (*rel_residual > tolerance) ) { + + if (samplingType != 0) { + softmax_multiclass_hx_subsampled(spfeatures, features, rows, cols, numclasses, + weights, p, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW, + spSampledHessian, sampledHessian, spSampledHessianTrainSet, sampleSize, scaleTerms, samplingType ); + } + else { + softmax_multiclass_hx_optimized(spfeatures, features, rows, cols, numclasses, + weights, p, lambda, nextDevPtr, nextHostPtr, nextPageLckPtr, Hg, HXW ); + } + + //pAp = Dot( Hg, p, N ); + cublasCheckError (cublasDdot( cublasHandle, numclasses * cols, Hg, 1, p, 1, pAp ) ); + + //alpha = delta / pAp + *alpha = -1. 
* (delta / (*pAp) );
+
+		//r = r - alpha * Ap
+		cublasCheckError (cublasDaxpy( cublasHandle, numclasses * cols, alpha, Hg, 1, r, 1 ) );
+
+		// x = x + alpha * p
+		*alpha *= -1.;
+		cublasCheckError (cublasDaxpy( cublasHandle, numclasses * cols, alpha, p, 1, x, 1 ) );
+
+		// rel_res = norm(r) / norm(b)
+		cublasCheckError (cublasDnrm2( cublasHandle, numclasses * cols, r, 1, &rnorm) );
+		*rel_residual = rnorm / gradient_norm;
+
+		if (*rel_residual < *best_rel_residual) {
+			*best_rel_residual = *rel_residual;
+			cudaMemcpy( x_best, x, numclasses * cols * sizeof(real), cudaMemcpyDeviceToDevice );
+		}
+
+		//h = r
+		cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, r, 1, h, 1) );
+
+		prev_delta = delta;
+
+		//delta = r' * h
+		cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, r, 1, h, 1, &delta) );
+
+		//p = h + (delta/prev_delta) * p;
+		*alpha = delta / prev_delta;
+		cublasCheckError( cublasDscal( cublasHandle, numclasses * cols, alpha, p, 1) );
+		*alpha = 1;
+		cublasCheckError( cublasDaxpy( cublasHandle, numclasses * cols, alpha, h, 1, p, 1) );
+
+		//increment the iteration count here
+		cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, r, 1, &rnorm) );
+		*rel_residual = rnorm / gradient_norm;
+		iter += 1;
+	}
+
+	return iter;
+}
diff --git a/code/cuda/RC-FINAL-5/conjugate_gradient.h b/code/cuda/RC-FINAL-5/conjugate_gradient.h
new file mode 100644
index 0000000..a14ad92
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/conjugate_gradient.h
@@ -0,0 +1,16 @@
+#ifndef _H_CONJUGATE_GRADIENT__
+#define _H_CONJUGATE_GRADIENT__
+
+#include <cuda_types.h>
+#include <dataset.h>
+#include <newton_cg.h>
+
+
+int Cublas_CG_Logistic( DeviceDataset *data, NEWTON_CG_PARAMS *params,
+		real *g, real *x, real *x_best, real *rel_residual,
+		real *devPtr, real *hostPtr, real *pgeLckPtr);
+int Cublas_CG_multi_optimized (SparseDataset *, real *, real *, real *, real *, real *, real, int , int , int ,
+		real *, real *, real *, real *, int , real, real *, real *,
+		SparseDataset *, real *, SparseDataset *, int, int );
+#endif
+
diff --git a/code/cuda/RC-FINAL-5/cuda_environment.c b/code/cuda/RC-FINAL-5/cuda_environment.c
new file mode 100644
index 0000000..bad24db
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/cuda_environment.c
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <cuda_environment.h>
+
+#include <cuda_utils.h>
+#include <utils.h>
+
+void cuda_env_init(SCRATCH_AREA *scratch, int gpu){
+	//cudaSetDevice (0);
+	cudaSetDevice (gpu);
+	cudaCheckError ();
+
+	cudaDeviceReset ();
+	cudaDeviceSynchronize ();
+
+	allocate_memory( (void **)&scratch->hostWorkspace, (size_t)HOST_WORKSPACE_SIZE );
+	cuda_malloc( (void **)&scratch->devWorkspace, DEVICE_WORKSPACE_SIZE, 1, ERR_MEM_ALLOC );
+	cuda_malloc_host ((void **)&scratch->pageLckWorkspace, PAGE_LOCKED_WORKSPACE_SIZE, 0, ERR_MEM_ALLOC );
+
+	cublasCheckError( cublasCreate( &cublasHandle ) );
+	cusparseCheckError( cusparseCreate( &cusparseHandle ) );
+
+	allocate_memory( (void **)&dscratch, (size_t)DEBUG_SCRATCH_SIZE);
+
+	srand( time(NULL) );
+}
+
+void cuda_env_cleanup (SCRATCH_AREA *scratch){
+	release_memory( (void **)&scratch->hostWorkspace );
+	cuda_free ((void *)scratch->devWorkspace, ERR_MEM_FREE);
+	cuda_free_host ( (void *)scratch->pageLckWorkspace, ERR_MEM_FREE );
+
+	release_memory( (void **)&dscratch);
+}
diff --git a/code/cuda/RC-FINAL-5/cuda_environment.h b/code/cuda/RC-FINAL-5/cuda_environment.h
new file mode 100644
index 0000000..65d1824
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/cuda_environment.h
@@ -0,0 +1,9 @@
+#ifndef _H_CUDA_ENVIRONMENT__
+#define _H_CUDA_ENVIRONMENT__
+
+#include "cuda_types.h"
+
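+// cuda_env_init selects the GPU, allocates the host, device, and page-locked
+// workspaces, and creates the global cuBLAS/cuSPARSE handles;
+// cuda_env_cleanup releases all of them.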
+void cuda_env_init (SCRATCH_AREA *, int); +void cuda_env_cleanup (SCRATCH_AREA *); + +#endif diff --git a/code/cuda/RC-FINAL-5/cuda_types.h b/code/cuda/RC-FINAL-5/cuda_types.h new file mode 100644 index 0000000..c37c355 --- /dev/null +++ b/code/cuda/RC-FINAL-5/cuda_types.h @@ -0,0 +1,76 @@ +#ifndef _H_CUDA_TYPES__ +#define _H_CUDA_TYPES__ + +#include "cuda.h" +#include "cublas_v2.h" +#include "cusparse_v2.h" + +#include "cuda_runtime.h" +#include "cuda_runtime_api.h" +#include "device_launch_parameters.h" +#include "host_defines.h" + +#include +#include +#include + +#define HOST __host__ +#define DEVICE __device__ +#define GLOBAL __global__ +#define HOST_DEVICE __host__ __device__ + + +#define real double +#define SQRT sqrt + +#define HOST_WORKSPACE_SIZE ((1 * 1024 * 1024 * 1024) + (512 * 1024 * 1024)) +//#define DEVICE_WORKSPACE_SIZE ((1 * 1024 * 1024 * 1024) + (512 * 1024 * 1024)) +#define DEVICE_WORKSPACE_SIZE 1 * 1024 * 1024 * 1024 +#define PAGE_LOCKED_WORKSPACE_SIZE 1024 * 1024 + +#define DEBUG_SCRATCH_SIZE 10 * 1024 * 1024 + +#define ERROR_MEM_ALLOC 0x01 +#define ERROR_MEM_CLEANUP 0x02 +#define ERROR_MEMCPY_DEVICE_HOST 0x03 + +#define ERR_MEM_ALLOC 0x04 +#define ERR_MEM_FREE 0x05 + +#define ERROR_MEMCPY_TRAINSET 0x06 +#define ERROR_MEMCPY_TESTSET 0x07 +#define ERROR_MEMCPY_TRAINLABELS 0x08 +#define ERROR_MEMCPY_TESTLABELS 0x09 + +#define ERROR_DEBUG 0x10 +#define ERROR_MEM_SET 0x11 + +#define ERROR_MEMCPY_DEVICE_DEVICE 0x12 +#define ERROR_MEMCPY_HOST_DEVICE 0x13 + +#define CUDA_BLOCK_SIZE 1024 + +#define WARP_SIZE 32 +#define THREADS_PER_ROW 64 + + +//#define HESSIAN_SAMPLING_SIZE 1 +//#define GRADIENT_SAMPLING_SIZE 5 +//#define HESSIAN_SAMPLING_SIZE 25 +//#define GRADIENT_SAMPLING_SIZE 50 + +extern int BLOCKS, BLOCK_SIZE, BLOCKS_POW_2; +extern int HESSIAN_SAMPLING_SIZE, GRADIENT_SAMPLING_SIZE; + + +extern cublasHandle_t cublasHandle; +extern cusparseHandle_t cusparseHandle; +typedef struct scratch_space{ + real *hostWorkspace; + real *devWorkspace; + real *pageLckWorkspace; + } SCRATCH_AREA; + +extern void* dscratch; + +#endif diff --git a/code/cuda/RC-FINAL-5/cuda_utils.c b/code/cuda/RC-FINAL-5/cuda_utils.c new file mode 100644 index 0000000..baf627f --- /dev/null +++ b/code/cuda/RC-FINAL-5/cuda_utils.c @@ -0,0 +1,145 @@ + +#include "cuda_utils.h" +#include "cuda_types.h" + +void cuda_malloc (void **ptr, unsigned int size, int memset, int err_code) { + + cudaError_t retVal = cudaSuccess; + retVal = cudaMalloc (ptr, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to allocate memory on device for the res: %d... exiting with code: %d size: %d, %s \n", + err_code, retVal, size, cudaGetErrorString(retVal)); + exit (err_code); + } + + if (memset) { + retVal = cudaMemset (*ptr, 0, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to memset memory on device... exiting with code %d, %s\n", + err_code, cudaGetErrorString( retVal )); + exit (err_code); + } + } +} + +void cuda_malloc_host (void **ptr, unsigned int size, int memset, int err_code) { + + cudaError_t retVal = cudaSuccess; + retVal = cudaMallocHost (ptr, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to allocate memory on device for the res: %d... exiting with code: %d size: %d, %s \n", + err_code, retVal, size, cudaGetErrorString(retVal) ); + exit (err_code); + } + + if (memset) { + retVal = cudaMemset (*ptr, 0, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to memset memory on device... 
exiting with code %d, %s\n",
+				err_code, cudaGetErrorString( retVal ));
+			exit (err_code);
+		}
+	}
+}
+
+
+
+void cuda_free (void *ptr, int err_code) {
+
+	cudaError_t retVal = cudaSuccess;
+	if (!ptr) return;
+
+	retVal = cudaFree (ptr);
+
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "Failed to release memory on device for res %d... error code %d -- Address %ld, %s\n",
+				err_code, retVal, (long int)ptr, cudaGetErrorString( retVal ));
+		return;
+	}
+}
+
+void cuda_free_host (void *ptr, int err_code) {
+
+	cudaError_t retVal = cudaSuccess;
+	if (!ptr) return;
+
+	retVal = cudaFreeHost (ptr);
+
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "Failed to release memory on host for res %d... error code %d -- Address %ld, %s\n",
+				err_code, retVal, (long int)ptr, cudaGetErrorString( retVal ));
+		return;
+	}
+}
+
+
+void cuda_memset (void *ptr, int data, size_t count, int err_code){
+	cudaError_t retVal = cudaSuccess;
+
+	retVal = cudaMemset (ptr, data, count);
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "ptr passed is %ld \n", (long int)ptr);
+		fprintf (stderr, " size to memset: %ld \n", count);
+		fprintf (stderr, " target data is : %d \n", data);
+		fprintf (stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d, %s\n",
+				err_code, retVal, cudaGetErrorString( retVal ));
+		exit (err_code);
+	}
+}
+
+void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid)
+{
+	cudaError_t retVal = cudaErrorNotReady;
+
+	if (dir == cudaMemcpyHostToDevice)
+		retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
+	else
+		retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
+
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "could not copy resource %d between host and device: reason %d:%s \n",
+				resid, retVal, cudaGetErrorString( retVal ));
+		exit (resid);
+	}
+}
+
+void copy_device (void *dest, void *src, int size, int resid)
+{
+	cudaError_t retVal = cudaErrorNotReady;
+
+	retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
+	if (retVal != cudaSuccess) {
+		fprintf (stderr, "could not copy resource %d from device to device: reason %d \n",
+				resid, retVal);
+		exit (resid);
+	}
+}
+
+void print_device_mem_usage ()
+{
+	size_t total, free;
+	cudaMemGetInfo (&free, &total);
+	if (cudaGetLastError () != cudaSuccess )
+	{
+		fprintf (stderr, "Error on the memory call \n");
+		return;
+	}
+
+	fprintf (stderr, "Total %ld bytes, %ld MB, %ld GB; free %ld bytes, %ld MB, %ld GB \n",
+			total, total/(1024*1024), total/ (1024*1024*1024),
+			free, free/(1024*1024), free/ (1024*1024*1024) );
+}
+
+void compute_blocks ( int *blocks, int *block_size, int count )
+{
+	*block_size = CUDA_BLOCK_SIZE;
+	*blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ?
0 : 1); +} + +void compute_nearest_pow_2 (int blocks, int *result) +{ + int power = 1; + while (power < blocks) power *= 2; + + *result = power; +} diff --git a/code/cuda/RC-FINAL-5/cuda_utils.h b/code/cuda/RC-FINAL-5/cuda_utils.h new file mode 100644 index 0000000..e864e60 --- /dev/null +++ b/code/cuda/RC-FINAL-5/cuda_utils.h @@ -0,0 +1,88 @@ +#ifndef __CUDA_UTILS_H_ +#define __CUDA_UTILS_H_ + +#include "cuda.h" +#include "cuda_runtime_api.h" +#include "cublas_v2.h" +#include "cusparse_v2.h" +#include "stdlib.h" +#include "stdio.h" +#include "curand.h" + + +void cuda_malloc (void **, unsigned int , int , int); +void cuda_malloc_host( void **, unsigned int, int, int ); + +void cuda_free (void *, int); +void cuda_free_host (void *, int); +void cuda_memset (void *, int , size_t , int ); + +void copy_host_device (void *, void *, int , enum cudaMemcpyKind, int); +void copy_device (void *, void *, int , int ); + +void print_device_mem_usage (); + +#define cusparseCheckError(cusparseStatus) __cusparseCheckError (cusparseStatus, __FILE__, __LINE__) +inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line ) +{ +if (cusparseStatus!= CUSPARSE_STATUS_SUCCESS) +{ + //fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cusparseStatus); + fprintf (stderr, "failed .. %s:%d -- error code %d \n", file, line, cusparseStatus); + exit (-1); +} +return; +} + + +#define cublasCheckError(cublasStatus) __cublasCheckError (cublasStatus, __FILE__, __LINE__) +inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line ) +{ +if (cublasStatus!= CUBLAS_STATUS_SUCCESS) +{ + fprintf (stderr, "failed .. %s:%d -- error code %d \n", file, line, cublasStatus); + exit (-1); +} +return; +} + +#define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) +inline void __cudaCheckError( const char *file, const int line ) +{ + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) + { + fprintf (stderr, "Failed .. %s:%d -- gpu erro code %d:%s\n", file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. + /* + err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) + { + exit( -1 ); + } + */ + return; +} + +#define curandCheckError(curandStatus) __curandCheckError (curandStatus, __FILE__, __LINE__) +inline void __curandCheckError( curandStatus_t curandStatus, const char *file, const int line ) +{ + if (curandStatus!= CURAND_STATUS_SUCCESS) + { + fprintf (stderr, "failed .. 
%s:%d -- error code %d \n", file, line, curandStatus);
+		exit (-1);
+	}
+	return;
+}
+
+
+
+void compute_blocks ( int *blocks, int *block_size, int count );
+void compute_nearest_pow_2 (int blocks, int *result);
+
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/dataset.c b/code/cuda/RC-FINAL-5/dataset.c
new file mode 100644
index 0000000..91eca1b
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/dataset.c
@@ -0,0 +1,1217 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <math.h>
+#include <dataset.h>
+#include <cuda_utils.h>
+
+#include <utils.h>
+
+#define SAMPLING_BUFFER_EXTENSION 10
+
+#define MAX_LINE 256 * 1024
+#define MAX_IDX 256 * 1024
+
+#define HEAP_LINE_SIZE 4 * 1024 * 1024
+
+#define CIFAR_LINE_SIZE 3073
+
+void swap (real *a, real *b, real *t){
+	*t = *a;
+	*a = *b;
+	*b = *t;
+}
+
+real findMaxInDataset( real *src, int rows, int cols )
+{
+	real maxval = 0;
+	for (int i = 0; i < rows * cols; i ++)
+		if (maxval < src[ i ]) maxval = src[i];
+	return maxval;
+}
+
+void preprocessDataset( real *src, int rows, int cols, real maxval)
+{
+	if (maxval > 0){
+		for (int i = 0; i < rows * cols; i ++)
+			//src[i] = maxval - src[i];
+			src[i] = src[i] - maxval;
+	}
+}
+
+void convertToColumnMajor (real *src, int rows, int cols, real *tgt ) {
+	for (int i = 0; i < rows; i ++ )
+		for (int j = 0; j < cols; j ++)
+			tgt[j * rows + i] = src[i * cols + j];
+}
+
+void convertRowStochastic( real *src, int rows, int cols ) {
+	real sum = 0;
+	for (int i = 0; i < rows; i ++ ) {
+		sum = 0;
+		for (int j = 0; j < cols; j ++)
+			sum += src[ i * cols + j ];
+		for (int j = 0; j < cols; j ++)
+			src[ i * cols + j ] = src[ i * cols + j] / sum;
+	}
+}
+
+void convertColumnStochastic( real *src, int rows, int cols ){
+	real maxval = 0;
+	for (int c = 0; c < cols; c ++){
+		maxval = src[ c * rows ];
+		for (int r = 1; r < rows; r ++){
+			if (maxval < src[ c * rows + r ])
+				maxval = src[ c * rows + r ];
+		}
+
+		if (maxval > 1) {
+			for (int r = 0; r < rows; r ++){
+				src[ c * rows + r] /= maxval;
+			}
+		}
+		//fprintf( stderr, " Done with Column: %d, maxval: %f \n", c, maxval );
+	}
+}
+
+void columnNormalize( real *src, int rows, int cols, real *train, int tr ){
+	real norm = 0;
+	for (int c = 0; c < cols; c ++){
+		norm = pow( src[ c * rows ], 2. );
+		for (int r = 1; r < rows; r ++) {
+			norm += pow( src[ c * rows + r ], 2. );
+		}
+		for (int r = 0; r < tr; r ++){
+			norm += pow( train[ c * tr + r ], 2. );
+		}
+
+		if (norm > 1e-8) {
+			norm = sqrt( norm );
+			for (int r = 0; r < rows; r ++)
+				src[ c * rows + r ] /= norm;
+
+			for (int r = 0; r < tr; r ++)
+				train[ c * tr + r ] /= norm;
+		}
+	}
+}
+
+real computeMaxValue (real *labels, int count ) {
+	real maxval = 0;
+	for (int i = 0; i < count; i ++ )
+		if (maxval < labels[i] )
+			maxval = labels[i];
+
+	return maxval;
+}
+
+void writeDataset( real *features, real *labels, int rows, int cols, char *filename, char *vectorname)
+{
+	FILE *dataset_file;
+
+	if ( (dataset_file = fopen(filename, "w")) == NULL ) {
+		fprintf( stderr, "Error opening the dataset.... !\n" );
+		exit( -1 );
+	}
+
+	for (int i = 0; i < rows; i ++){
+		fprintf (dataset_file, "%4.6f", features[ i * cols ] );
+		for (int j = 1; j < cols; j ++){
+			fprintf( dataset_file, ",%4.6f", features[ i * cols + j ] );
+		}
+		fprintf( dataset_file, "\n");
+	}
+	fclose (dataset_file);
+
+	if ( (dataset_file = fopen(vectorname, "w")) == NULL ) {
+		fprintf( stderr, "Error opening the labels....
!\n" ); + exit( -1 ); + } + + for (int i = 0; i < rows; i ++){ + fprintf (dataset_file, "%d\n", (int)labels[ i ] ); + } + fclose (dataset_file); +} + +void readBinaryMatFile( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset) +{ + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + int NUM_CLASSES = 20; + size_t output; + int idx = 0; + int i; + real cols[3]; + int max_train_col, max_test_col; + int max_train_row, max_test_row; + + char filename[MAX_LINE]; + + real *scratch = s->hostWorkspace; + + int *train_row_id, *train_col_id; + int *test_row_id, *test_col_id; + + real *train_val, *train_vec; + real *test_val, *test_vec; + + int train_nnz, test_nnz; + + int cur_column; + int *rowPtr, *colPtr; + real *valPtr, *labelPtr; + int rowNNZ; + int minCol = 1000000; + + char *heapLine = (char *)malloc (HEAP_LINE_SIZE); + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + max_train_row = max_train_col = 0; + train_nnz = 0; + while (!feof( dataset_file) ){ + memset( heapLine, 0, HEAP_LINE_SIZE); + + fgets( heapLine, HEAP_LINE_SIZE, dataset_file); + if (heapLine[0] == 0) break; + + cur_column = tokenize_binary_string( heapLine, 0, &train_nnz); + + if (max_train_col < cur_column) max_train_col = cur_column; + if (minCol > cur_column) minCol = cur_column; + + numLines ++; + } + max_train_row = numLines; + + fclose( dataset_file ); + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d), NNZ: %d, %d\n", + numLines, max_train_row, max_train_col, train_nnz, minCol ); + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + max_test_row = max_test_col = numLines = 0; + test_nnz = 0; + minCol = 10000000; + while (!feof( dataset_file) ){ + memset( heapLine, 0, HEAP_LINE_SIZE); + + fgets( heapLine, HEAP_LINE_SIZE, dataset_file); + cur_column = tokenize_binary_string( heapLine, 0, &test_nnz); + + if (max_test_col < cur_column) max_test_col = cur_column; + if (minCol > cur_column) minCol = cur_column; + + if (heapLine[0] == 0) break; + numLines ++; + } + max_test_row = numLines; + fclose( dataset_file ); + + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d ), NNZ: %d, %d \n", + numLines, max_test_row, max_test_col, test_nnz, minCol ); + + if (max_train_col < max_test_col ){ + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_test_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_test_col ); + } else { + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_train_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_train_col ); + } + + //Read the matrices Here. + train_row_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_col_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_val = (real *) malloc ( train_nnz * sizeof (real) ); + train_vec = (real *) malloc ( max_train_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + rowPtr = train_row_id; + colPtr = train_col_id; + valPtr = train_val; + labelPtr = train_vec; + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + + rowNNZ = tokenize_binary_populate( line, 0, rowPtr, colPtr, valPtr, labelPtr, numLines ); + rowPtr += rowNNZ; + colPtr += rowNNZ; + valPtr += rowNNZ; + + numLines ++; + } + fclose( dataset_file ); + + for (int i = 0; i < numLines; i ++) + if (train_vec[i] == -1) train_vec[i] = 2; + + fprintf( stderr, "Done populating the training part ... \n"); + + //Read the test dataset here. + test_row_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_col_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_val = (real *) malloc ( test_nnz * sizeof (real) ); + test_vec = (real *) malloc ( max_test_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + rowPtr = test_row_id; + colPtr = test_col_id; + valPtr = test_val; + labelPtr = test_vec; + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + rowNNZ = tokenize_binary_populate( line, 0, rowPtr, colPtr, valPtr, labelPtr, numLines); + rowPtr += rowNNZ; + colPtr += rowNNZ; + valPtr += rowNNZ; + + if (line[0] == 0) break; + + numLines ++; + } + fclose( dataset_file ); + for (int i = 0; i < numLines; i ++) + if (test_vec[i] == -1) test_vec[i] = 2; + + fprintf( stderr, "Done populating the testing part ... \n"); + + //form the cuSparseMatrix Here. + data->trainRowPtr = train_row_id; + data->trainColPtr = train_col_id; + data->trainValPtr = train_val; + data->trainLabels = train_vec; + + data->testRowPtr = test_row_id; + data->testColPtr = test_col_id; + data->testValPtr = test_val; + data->testLabels = test_vec; + + data->numclasses = 1; + data->rows = max_test_row + max_train_row; + data->trainSize = max_train_row; + data->testSize = max_test_row; + + data->trainNNZ = train_nnz; + data->testNNZ = test_nnz; + + data->trainSet = NULL; + data->testSet = NULL; + + if (max_train_col < max_test_col ) + data->cols = max_test_col; + else + data->cols = max_train_col; + + data->trainSet = NULL; + data->testSet = NULL; + + free(heapLine ); +} + +void readNewsgroupsDataset( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset) +{ + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + int NUM_CLASSES = 20; + size_t output; + int idx = 0; + int i; + real cols[3]; + int max_train_col, max_test_col; + int max_train_row, max_test_row; + + char filename[MAX_LINE]; + + real *scratch = s->hostWorkspace; + + int *train_row_id, *train_col_id; + int *test_row_id, *test_col_id; + + real *train_val, *train_vec; + real *test_val, *test_vec; + + int train_nnz, test_nnz; + + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + max_train_row = max_train_col = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + + tokenize_string( line, cols, 0 ); + + if (max_train_row < cols[0]) max_train_row = cols[0]; + if (max_train_col < cols[1]) max_train_col = cols[1]; + + numLines ++; + } + train_nnz = numLines; + fclose( dataset_file ); + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d) \n", + numLines, max_train_row, max_train_col ); + + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + max_test_row = max_test_col = numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + tokenize_string( line, cols, 0 ); + + if (max_test_row < cols[0]) max_test_row = cols[0]; + if (max_test_col < cols[1]) max_test_col = cols[1]; + + if (line[0] == 0) break; + numLines ++; + } + test_nnz = numLines; + fclose( dataset_file ); + + fprintf( stderr, "Done with reading %d points from the input files ....(%d, %d) \n", + numLines, max_test_row, max_test_col ); + + if (max_train_col < max_test_col ){ + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_test_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_test_col ); + } else { + fprintf (stderr, "Dimensions of Train -- %d, %d \n", max_train_row, max_train_col ); + fprintf (stderr, "Dimensions of Test -- %d, %d \n", max_test_row, max_train_col ); + } + + //Read the matrices Here. + train_row_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_col_id = (int *) malloc ( train_nnz * sizeof (int) ); + train_val = (real *) malloc ( train_nnz * sizeof (real) ); + train_vec = (real *) malloc ( max_train_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + + tokenize_string( line, cols, 0 ); + + train_row_id[ numLines ] = (int)(cols[0] - 1); + train_col_id[ numLines ] = (int)(cols[1] - 1); + train_val[ numLines ] = (real)cols[2]; + + //fprintf( stderr, " %d, %d, %f \n", train_row_id[ numLines ], train_col_id[ numLines ], train_val [numLines ] ); + + numLines ++; + } + fclose( dataset_file ); + + //vector here. + i = readVector( train_vec, max_train_row, f_train_labels, offset ); + fprintf( stderr, "Labels read from file: %d, expected : %d \n", i, max_train_row ); + + //compute the NUM_CLASSES Here. + NUM_CLASSES = computeMaxValue( train_vec, max_train_row); + + //Read the test dataset here. + test_row_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_col_id = (int *) malloc ( test_nnz * sizeof (int) ); + test_val = (real *) malloc ( test_nnz * sizeof (real) ); + test_vec = (real *) malloc ( max_test_row * sizeof (real) ); + + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + cols[0] = cols[1] = cols[2] = 0; + + fgets( line, MAX_LINE, dataset_file); + tokenize_string( line, cols, 0 ); + + if (line[0] == 0) break; + test_row_id[ numLines ] = (int)(cols[0] - 1); + test_col_id[ numLines ] = (int)(cols[1] - 1); + test_val[ numLines ] = (real)cols[2]; + + numLines ++; + } + fclose( dataset_file ); + + //vector here. + i = readVector( test_vec, max_test_row, f_test_labels, offset ); + + //form the cuSparseMatrix Here. + data->trainRowPtr = train_row_id; + data->trainColPtr = train_col_id; + data->trainValPtr = train_val; + data->trainLabels = train_vec; + + data->testRowPtr = test_row_id; + data->testColPtr = test_col_id; + data->testValPtr = test_val; + data->testLabels = test_vec; + + data->numclasses = NUM_CLASSES - 1; + data->rows = max_test_row + max_train_row; + data->trainSize = max_train_row; + data->testSize = max_test_row; + + data->trainNNZ = train_nnz; + data->testNNZ = test_nnz; + + data->trainSet = NULL; + data->testSet = NULL; + + if (max_train_col < max_test_col ) + data->cols = max_test_col; + else + data->cols = max_train_col; + + data->trainSet = NULL; + data->testSet = NULL; + + // preprocess the dataset here. + /* + real train_max = findMaxInDataset( data->trainValPtr, data->trainNNZ, 1 ); + real test_max = findMaxInDataset( data->testValPtr, data->testNNZ, 1 ); + fprintf( stderr, "Train max: %f, Test max: %f \n", train_max, test_max ); + + if (train_max < test_max){ + preprocessDataset ( data->trainValPtr, data->trainNNZ, 1, test_max ); + preprocessDataset ( data->testValPtr, data->testNNZ, 1, test_max ); + } else { + preprocessDataset ( data->trainValPtr, data->trainNNZ, 1, train_max ); + preprocessDataset ( data->testValPtr, data->testNNZ, 1, train_max ); + } + */ +} + +void readCIFARDataset( char *dir, char *train, char *test, ForestDataset *data, SCRATCH_AREA *s, int raw) { + + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + int NUM_CLASSES = 10; + size_t output; + int idx = 0; + int i; + int TRAIN_IMAGES = 50000; + int TRAIN_FILES = 5; + + char filename[MAX_LINE]; + real *train_set, *train_labels, *test_set, *test_labels; + real *scratch = s->hostWorkspace; + + train_set = (real *) malloc( (size_t)TRAIN_IMAGES * (CIFAR_LINE_SIZE-1) * sizeof(real) ); + train_labels = (real *) malloc ( (size_t)TRAIN_IMAGES * sizeof(real) ); + test_set = (real *) malloc( (size_t)10000 * (CIFAR_LINE_SIZE-1) * sizeof(real) ); + test_labels = (real *) malloc ( (size_t)10000 * sizeof(real) ); + + fprintf( stderr, " Allocated memory for the dataset : %lu \n", TRAIN_IMAGES * (CIFAR_LINE_SIZE-1) * sizeof(real)); + fprintf( stderr, " Allocated memory for the dataset (GB): %d \n", (TRAIN_IMAGES * (CIFAR_LINE_SIZE-1) * sizeof(real)) / (1024 * 1024 * 1024)); + + numLines = 0; + for (idx = 1; idx <= TRAIN_FILES; idx ++) { + sprintf( filename, "%s%s%d.bin", dir, train, idx); + fprintf( stderr, "Reading file : %s \n", filename ); + + if ( (dataset_file = fopen(filename, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + output = fread( line, (size_t)1, (size_t)CIFAR_LINE_SIZE, dataset_file); + + if (output <= 0) break; + + train_labels[ numLines ] = line[0] + 1; + for (i = 0; i < CIFAR_LINE_SIZE-1; i ++) + train_set[ numLines * (CIFAR_LINE_SIZE - 1) + i ] = (unsigned char) line[i + 1]; + + numLines ++; + } + } + fprintf( stderr, "Done with reading %d points from the input files .... \n", numLines ); + + //test data here. + numLines = 0; + memset( filename, 0, MAX_LINE ); + sprintf( filename, "%s%s", dir, test); + + if ( (dataset_file = fopen(filename, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + output = fread( line, (size_t)1, (size_t)CIFAR_LINE_SIZE, dataset_file); + if (output <= 0) break; + + test_labels[ numLines ] = line[0] + 1; + for (i = 0; i < CIFAR_LINE_SIZE - 1; i ++) + test_set[ numLines * (CIFAR_LINE_SIZE - 1) + i ] = (unsigned char) line[i + 1]; + + numLines ++; + } + fprintf( stderr, "Done with reading %d points from the input files .... \n", numLines ); + + //inititalize the device data here. + data->trainSize = TRAIN_IMAGES; + data->testSize = 10000; + data->trainSet = train_set; + data->trainLabels = train_labels; + data->testSet = test_set; + data->testLabels = test_labels; + data->rows = data->trainSize + data->testSize; + data->cols = CIFAR_LINE_SIZE - 1; + data->numclasses = NUM_CLASSES - 1; + + data->trainRowPtr = NULL; + data->trainColPtr = NULL; + data->trainValPtr = NULL; + + data->testRowPtr = NULL; + data->testColPtr = NULL; + data->testValPtr = NULL; + +/* + fprintf(stderr, "Preprocessing .... \n"); + real train_max = findMaxInDataset( train_set, data->trainSize, data->cols ); + real test_max = findMaxInDataset( test_set, data->testSize, data->cols ); + fprintf( stderr, "TrainMax %e and TestMax: %e \n", train_max, test_max ); + + if (train_max >= test_max) { + preprocessDataset( train_set, data->trainSize, data->cols, train_max ); + preprocessDataset( test_set, data->testSize, data->cols, train_max ); + } else { + preprocessDataset( train_set, data->trainSize, data->cols, test_max ); + preprocessDataset( test_set, data->testSize, data->cols, test_max ); + } +*/ + + fprintf( stderr, "Converting to column major format here.... \n"); + //train_features + convertToColumnMajor( train_set, data->trainSize, data->cols, scratch); + fprintf( stderr, "Done with conversion... \n"); + memcpy( train_set, scratch, (size_t)(sizeof(real) * data->trainSize * data->cols) ); + + //test_features + convertToColumnMajor( test_set, data->testSize, data->cols, scratch); + fprintf( stderr, "Done with conversion... \n"); + memcpy( test_set, scratch, (size_t)(sizeof(real) * data->testSize * data->cols) ); + + if (raw == 0){ + fprintf( stderr, "Normalizing the data ... "); + columnNormalize( train_set, data->trainSize, data->cols, test_set, data->testSize ); + fprintf( stderr, "Done... \n"); + } + +} + +void readMultiDataset( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, ForestDataset *data, SCRATCH_AREA *s, int offset, int bias) +{ + FILE *dataset_file; + char line[MAX_LINE]; + int numLines = 0; + real temp[MAX_LINE]; + int NUM_CLASSES = -1; + + real *train_set, *train_labels, *test_set, *test_labels; + real *scratch = s->hostWorkspace; + + if ( (dataset_file = fopen(f_train_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... 
!\n" ); + exit( -1 ); + } + + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + + if (line[0] == 0) break; + //data->cols = tokenize_learn_multiclass( line, temp, numLines, NULL, NULL); + data->cols = tokenize_string( line, temp, bias); + numLines ++; + } + + fprintf(stderr, "Number of columns is : %d \n", data->cols ); + fprintf( stderr, "Train Size: %d \n", numLines ); + + //exit (-1); + + data->trainSize = numLines; + /* + train_set = (real *)malloc( (FEATURE_SIZE_MULTI) * data->trainSize); + train_labels = (real *)malloc(sizeof(real) * data->trainSize); + */ + train_set = (real *)malloc( data->cols * data->trainSize * sizeof(real)); + train_labels = (real *)malloc( data->trainSize * sizeof(real)); + + //read the file here and fill the matrix. + rewind( dataset_file ); + numLines = 0; + + while (!feof( dataset_file )){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + tokenize_populate( line, train_set, &numLines, data->cols, bias ); + numLines ++; + } + fclose( dataset_file ); + + //read the train labels here. + fprintf( stderr, " Reading the vector: %s \n", f_train_labels ); + readVector( train_labels, data->trainSize, f_train_labels, offset ); + + //compute the NUM_CLASSES Here. + NUM_CLASSES = computeMaxValue( train_labels, data->trainSize ); + + //read the test dataset here. + fprintf( stderr, " Reading the test Matrix: %s \n", f_test_features ); + if ( (dataset_file = fopen(f_test_features, "r")) == NULL ) { + fprintf( stderr, "Error opening the dataset.... !\n" ); + exit( -1 ); + } + + numLines = 0; + while (!feof( dataset_file) ){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + + if (line[0] == 0) break; + //data->cols = tokenize_learn_multiclass( line, temp, numLines, NULL, NULL); + data->cols = tokenize_string( line, temp, bias ); + numLines ++; + } + + fprintf(stderr, "Test size: %d \n", numLines ); + fprintf( stderr, "Number of features for test set: %d \n", data->cols ); + + data->testSize = numLines; + /* + test_set = (real *)malloc( (FEATURE_SIZE_MULTI) * data->testSize); + test_labels = (real *)malloc(sizeof(real) * data->testSize); + */ + test_set = (real *)malloc( data->cols * data->testSize * sizeof(real)); + test_labels = (real *)malloc(data->testSize * sizeof(real)); + + //read the test set + rewind( dataset_file ); + numLines = 0; + + while (!feof( dataset_file )){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, dataset_file); + if (line[0] == 0) break; + tokenize_populate( line, test_set, &numLines, data->cols, bias ); + numLines ++; + } + fclose( dataset_file ); + + //read the test labels here. + readVector( test_labels, numLines, f_test_labels, offset ); + real testMax = computeMaxValue( test_labels, numLines ); + if (testMax > NUM_CLASSES) + NUM_CLASSES = (int) testMax; + + + //initialization here. + data->trainSet = train_set; + data->trainLabels = train_labels; + data->testSet = test_set; + data->testLabels = test_labels; + data->rows = data->trainSize + data->testSize; + data->numclasses = NUM_CLASSES - 1; + + data->trainRowPtr = NULL; + data->trainColPtr = NULL; + data->trainValPtr = NULL; + + data->testRowPtr = NULL; + data->testColPtr = NULL; + data->testValPtr = NULL; + + //preprocessing step here. 
+ /* + real train_max = findMaxInDataset( train_set, data->trainSize, data->cols ); + real test_max = findMaxInDataset( test_set, data->testSize, data->cols ); + + if (train_max >= test_max) { + preprocessDataset( train_set, data->trainSize, data->cols, train_max ); + preprocessDataset( test_set, data->testSize, data->cols, train_max ); + } else { + preprocessDataset( train_set, data->trainSize, data->cols, test_max ); + preprocessDataset( test_set, data->testSize, data->cols, test_max ); + } + */ + + //train_features + convertToColumnMajor( train_set, data->trainSize, data->cols, scratch); + memcpy( train_set, scratch, sizeof(real) * data->trainSize * data->cols ); + + //test_features + convertToColumnMajor( test_set, data->testSize, data->cols, scratch); + memcpy( test_set, scratch, sizeof(real) * data->testSize * data->cols ); + + //DEBUG HERE. + /* + fprintf (stderr, "Train Set Here \n"); + for (int i = 0; i < data->trainSize; i ++) + fprintf( stderr, " %2.2f ", train_set[ i ] ); + fprintf( stderr, "\n"); + + fprintf( stderr, "Labels here \n"); + for (int i = 0; i < data->trainSize; i ++) + fprintf( stderr, " %2.2f ", train_labels[ i ] ); + fprintf (stderr, "\n"); + */ +} + +int tokenize_binary_populate( char *line, int bias, int *row, int *col, real *val, real *label, int rowNum ) +{ + const char *sep = ", \n"; + char *word, *ptr; + char temp[MAX_LINE]; + int index = 0; + int len = 0; + + char col_str[32]; + + if (bias >= 1){ + *row = rowNum; row ++; + *col = 0; col ++; + *val = 1; val ++; + + index = 1; + } + + strncpy( temp, line, MAX_LINE ); + for( word = strtok(temp, sep); word; word = strtok(NULL, sep) ) + { + memset( col_str, 0, sizeof(char) * 32); + memcpy( col_str, word, 31); + len = 0; + + ptr = col_str; + while (*ptr != 0 && *ptr != ':'){ + ptr ++; + len ++; + } + + if (*ptr == ':') { + *ptr = 0; + + *row = rowNum; row ++; + *col = atoi( col_str) - 1; col ++; + *val = atof( col_str + len + 1); val ++; + index ++; + + } else { + label[rowNum] = atof( word ); + } + } + + return index; +} + +int tokenize_binary_string( char *line, int bias, int *nnz) +{ + const char *sep = ", \n"; + char *word, *ptr; + //char temp[MAX_LINE]; + int index = 0; + int col = 0; + real val = 0; + int len = 0; + + char col_str[32]; + + if (bias >= 1) index = 1; + + for( word = strtok(line, sep); word; word = strtok(NULL, sep) ) + { + col = val = -99; + memset( col_str, 0, 32); + + strncpy( col_str, word, 31 ); + ptr = col_str; + + len = 0; + while (*ptr != 0 && *ptr != ':'){ + ptr ++; + len ++; + } + + if (*ptr == ':') { + *ptr = 0; + col = atoi( col_str ); // to account for zero here. 
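+			// Each token is a "col:val" pair: the text before ':' is the
+			// 1-based column index, the text after it is the feature value.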
+			val = atof( col_str + len + 1 );
+
+			(*nnz) ++;
+		}
+	}
+
+	return col;
+}
+
+
+int tokenize_string( char *line, real *out, int bias )
+{
+	const char *sep = ", \n";
+	char *word;
+	char temp[MAX_LINE];
+	int index = 0;
+
+	if (bias >= 1) index = 1;
+
+	strncpy( temp, line, MAX_LINE );
+	for( word = strtok(temp, sep); word; word = strtok(NULL, sep) ) out[ index ++] = atof( word );
+
+	return index;
+}
+
+void tokenize_populate(char *line, real *train_set, int *count, int size, int bias){
+
+	const char *sep = ", \n";
+	char *word;
+	char temp[MAX_LINE];
+	int index = 0;
+	real cur_row[MAX_LINE];
+
+	if (bias >= 1) cur_row[ index ++ ] = 1;
+
+	strncpy( temp, line, MAX_LINE );
+	for( word = strtok(temp, sep); word; word = strtok(NULL, sep) ) cur_row[ index ++] = atof( word );
+	memcpy( &train_set[ (*count) * (size)], cur_row, sizeof(real) * size);
+}
+
+
+void printDataset( ForestDataset *t)
+{
+	fprintf( stderr, "--------------------");
+	fprintf( stderr, "Train Row 1: ");
+	for (int i = 0; i < 52; i ++)
+		fprintf( stderr, " %f ", t->trainSet[i] );
+	fprintf( stderr, "\n");
+	fprintf( stderr, "Test Row 1: ");
+	for (int i = 0; i < 52; i ++)
+		fprintf( stderr, " %f ", t->testSet[i] );
+	fprintf( stderr, "\n");
+
+	fprintf( stderr, "Train Labels: \n");
+	for (int i = 0; i < t->trainSize; i ++)
+		fprintf (stderr, " %f ", t->trainLabels[i] );
+	fprintf( stderr, "\n");
+
+	fprintf( stderr, "Test Labels: \n");
+	for (int i = 0; i < 200; i ++)
+		fprintf (stderr, " %f ", t->testLabels[i] );
+	fprintf( stderr, "\n");
+	fprintf( stderr, "--------------------\n");
+}
+
+//
+//
+// Device Functions here.
+//
+//
+void initialize_device_data( ForestDataset *s, DeviceDataset *t)
+{
+	t->rows = s->trainSize;
+	t->cols = s->cols;
+	t->testSize = s->testSize;
+	t->numclasses = s->numclasses;
+
+	cuda_malloc( (void **)&t->trainSet, t->rows * t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->trainSet, t->trainSet, t->rows * t->cols * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+	cuda_malloc( (void **)&t->trainLabels, t->rows * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->trainLabels, t->trainLabels, t->rows * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINLABELS );
+
+	cuda_malloc( (void **)&t->testSet, t->testSize * t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->testSet, t->testSet, t->testSize * t->cols * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+	cuda_malloc( (void **)&t->testLabels, t->testSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+	copy_host_device( s->testLabels, t->testLabels, t->testSize * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTLABELS );
+
+	if (t->numclasses > 1)
+		cuda_malloc( (void **)&t->weights, t->numclasses * t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+	else
+		cuda_malloc( (void **)&t->weights, t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+
+#ifdef __debug__
+	fprintf (stderr, " -------------- \n");
+	fprintf( stderr, "Train Set size: %d %d, %d \n", t->rows, t->cols, t->testSize );
+	fprintf (stderr, " -------------- \n");
+#endif
+
+	t->spTrain.rowPtr = NULL;
+	t->spTrain.colPtr = NULL;
+	t->spTrain.valPtr = NULL;
+	t->spTrain.rowCsrPtr = NULL;
+
+	t->spTest.rowPtr = NULL;
+	t->spTest.colPtr = NULL;
+	t->spTest.valPtr = NULL;
+	t->spTest.rowCsrPtr = NULL;
+
+	//printVector (t->testSet, t->testSize, NULL);
+	//printVector( t->trainLabels, t->rows, s->trainLabels );
+
+
+	//sub sampling here.
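+	//The sub-sampled buffers below are sized as a percentage of the training
+	//rows (HESSIAN_SAMPLING_SIZE and GRADIENT_SAMPLING_SIZE are percentages);
+	//the Hessian buffer is over-allocated by a factor of
+	//SAMPLING_BUFFER_EXTENSION so variable-size (non-uniform) samples still fit.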
+ //Hessian part here
+ t->spSampledHessianTrain.nnz = 0;
+ t->spSampledHessianTrain.P = NULL;
+ t->spSampledHessianTrain.sortedVals= NULL;
+ t->spSampledHessianTrain.rowPtr= NULL;
+ t->spSampledHessianTrain.colPtr= NULL;
+ t->spSampledHessianTrain.valPtr= NULL;
+ t->spSampledHessianTrain.rowCsrPtr= NULL;
+
+ t->hessianSampleSize = (SAMPLING_BUFFER_EXTENSION * HESSIAN_SAMPLING_SIZE * t->rows) / 100;
+ t->spHessianSample.nnz = t->hessianSampleSize;
+ cuda_malloc( (void **)&t->sampledHessianTrainSet, t->hessianSampleSize* t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+ //fprintf( stderr, "SubSampled Size for this dataset (Hessian): %d \n", t->hessianSampleSize);
+
+ //spHessianSample
+ cuda_malloc( (void **) &t->spHessianSample.P, t->hessianSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.sortedVals, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.colPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.valPtr, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowCsrPtr, (t->hessianSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //Gradient Sample Here.
+ t->spSampledGradientTrain.nnz = 0;
+ t->spSampledGradientTrain.P = NULL;
+ t->spSampledGradientTrain.sortedVals = NULL;
+ t->spSampledGradientTrain.rowPtr = NULL;
+ t->spSampledGradientTrain.colPtr = NULL;
+ t->spSampledGradientTrain.valPtr = NULL;
+ t->spSampledGradientTrain.rowCsrPtr = NULL;
+
+ t->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * t->rows ) / 100;
+ t->spGradientSample.nnz = t->gradientSampleSize;
+ cuda_malloc( (void **)&t->sampledGradientTrainSet, t->gradientSampleSize* t->cols * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **)&t->sampledGradientTrainLabels, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ fprintf( stderr, "SubSampled Size for this dataset (Gradient): %d \n", t->gradientSampleSize);
+
+ //spGradientSample
+ cuda_malloc( (void **) &t->spGradientSample.P, t->gradientSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.sortedVals, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.colPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.valPtr, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowCsrPtr, (t->gradientSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+}
+
+void initialize_device_data_sparse( ForestDataset *s, DeviceDataset *t )
+{
+ t->trainSet = NULL;
+ t->testSet = NULL;
+
+ t->rows = s->trainSize;
+ t->cols = s->cols;
+ t->testSize = s->testSize;
+ t->numclasses = s->numclasses;
+
+ //t->trainNNZ = s->trainNNZ;
+ //t->testNNZ = s->testNNZ;
+ t->spTrain.nnz = s->trainNNZ;
+ t->spTest.nnz = s->testNNZ;
+
+ fprintf( stderr, "NNZ: %d, %d \n", s->trainNNZ, s->testNNZ );
+
+ //Train Set Here. 
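+ // rowPtr/colPtr/valPtr below hold the uploaded COO triplets; rowCsrPtr
+ // (rows + 1 entries) plus the P / sortedVals buffers are scratch for the
+ // COO -> CSR conversion (convertToCSR in the driver) that the
+ // cusparseDcsrmv calls elsewhere rely on.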
+ cuda_malloc( (void **) &t->spTrain.rowPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainRowPtr, t->spTrain.rowPtr, s->trainNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+ cuda_malloc( (void **) &t->spTrain.colPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainColPtr, t->spTrain.colPtr, s->trainNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+ cuda_malloc( (void **) &t->spTrain.valPtr, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainValPtr, t->spTrain.valPtr, s->trainNNZ * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINSET );
+
+ cuda_malloc( (void **) &t->trainLabels, t->rows * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->trainLabels, t->trainLabels, t->rows* sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TRAINLABELS );
+
+ cuda_malloc( (void **) &t->spTrain.rowCsrPtr, (t->rows + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //allocate the csc format matrix space here.
+ //cuda_malloc( (void **) &t->spTrain.cscRowPtr, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ //cuda_malloc( (void **) &t->spTrain.cscColPtr, (t->cols + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+ //cuda_malloc( (void **) &t->spTrain.cscValPtr, s->trainNNZ * sizeof(double), 1, ERROR_MEM_ALLOC );
+
+ //allocate the data for sorted coo format here.
+ cuda_malloc( (void **) &t->spTrain.P, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spTrain.sortedVals, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ fprintf( stderr, "Done with copying the training set .... \n");
+
+ //TestSet Here.
+ cuda_malloc( (void **) &t->spTest.rowPtr, s->testNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testRowPtr, t->spTest.rowPtr, s->testNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+ cuda_malloc( (void **) &t->spTest.colPtr, s->testNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testColPtr, t->spTest.colPtr, s->testNNZ * sizeof(int), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+ cuda_malloc( (void **) &t->spTest.valPtr, s->testNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testValPtr, t->spTest.valPtr, s->testNNZ * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTSET );
+
+ cuda_malloc( (void **) &t->testLabels, t->testSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ copy_host_device( s->testLabels, t->testLabels, t->testSize * sizeof(real), cudaMemcpyHostToDevice, ERROR_MEMCPY_TESTLABELS );
+
+ cuda_malloc( (void **) &t->spTest.rowCsrPtr, (t->testSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //allocate the data for sorted coo format here.
+ cuda_malloc( (void **) &t->spTest.P, s->testNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spTest.sortedVals, s->testNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ fprintf( stderr, "Done with copying the test set .... \n");
+
+ //Weights Here.
+ if (t->numclasses > 1)
+ cuda_malloc( (void **)&t->weights, t->numclasses * t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+ else
+ cuda_malloc( (void **)&t->weights, t->cols * sizeof(real), 1, ERROR_MEM_ALLOC );
+
+ //sparse sample matrices here.
+ //sub sampling here. 
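+ // The sampled-matrix buffers below are allocated once at the full training
+ // nnz / (rows + 1) sizes -- a worst-case bound -- so a fresh subsample can
+ // be drawn repeatedly without any further allocation.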
+ //Hessian part here
+ t->sampledGradientTrainSet = NULL;
+ t->sampledHessianTrainSet = NULL;
+ t->hessianSampleSize = (SAMPLING_BUFFER_EXTENSION * HESSIAN_SAMPLING_SIZE * t->rows) / 100;
+ t->spHessianSample.nnz = t->hessianSampleSize;
+
+ //spHessianSample
+ cuda_malloc( (void **) &t->spHessianSample.P, t->hessianSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.sortedVals, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.colPtr, t->hessianSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.valPtr, t->hessianSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spHessianSample.rowCsrPtr, (t->hessianSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ //Gradient Sample Here.
+ t->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * t->rows ) / 100;
+ t->spGradientSample.nnz = t->gradientSampleSize;
+ cuda_malloc( (void **)&t->sampledGradientTrainLabels, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ //spGradientSample
+ cuda_malloc( (void **) &t->spGradientSample.P, t->gradientSampleSize * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.sortedVals, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.colPtr, t->gradientSampleSize * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.valPtr, t->gradientSampleSize * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spGradientSample.rowCsrPtr, (t->gradientSampleSize + 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+
+ cuda_malloc( (void **) &t->spSampledGradientTrain.rowPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.colPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.valPtr, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.rowCsrPtr, (t->rows+ 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.P, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledGradientTrain.sortedVals, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ cuda_malloc( (void **) &t->spSampledHessianTrain.rowPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.colPtr, s->trainNNZ * sizeof(int), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.valPtr, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.rowCsrPtr, (t->rows+ 1) * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.P, s->trainNNZ * sizeof(int), 1, ERROR_MEM_ALLOC );
+ cuda_malloc( (void **) &t->spSampledHessianTrain.sortedVals, s->trainNNZ * sizeof(real), 0, ERROR_MEM_ALLOC );
+
+ //Debug print statements here. 
+ fprintf (stderr, " -------------- \n"); + fprintf( stderr, "Train Set size: %d %d, %d \n", t->rows, t->cols, t->testSize ); + fprintf (stderr, " -------------- \n"); +} + +void cleanup_dataset( ForestDataset *s, DeviceDataset *t){ + if (s->trainSet) release_memory( (void **)&s->trainSet ); + if (s->trainLabels ) release_memory( (void **)&s->trainLabels ); + if (s->testSet ) release_memory( (void **)&s->testSet ); + if (s->testLabels ) release_memory( (void **)&s->testLabels ); + + if (t->trainSet) cuda_free ( t->trainSet, ERROR_MEM_CLEANUP ); + if (t->trainLabels ) cuda_free ( t->trainLabels, ERROR_MEM_CLEANUP ); + if (t->testSet) cuda_free ( t->testSet, ERROR_MEM_CLEANUP ); + if (t->testLabels) cuda_free ( t->testLabels, ERROR_MEM_CLEANUP ); + + //sparse functions here. + if (t->spTrain.rowPtr || t->spTrain.colPtr || t->spTrain.valPtr) + cusparseDestroyMatDescr( t->spTrain.descr ); + if (t->spTrain.rowPtr) cuda_free( t->spTrain.rowPtr, ERROR_MEM_CLEANUP ); + if (t->spTrain.colPtr) cuda_free( t->spTrain.colPtr, ERROR_MEM_CLEANUP ); + if (t->spTrain.valPtr) cuda_free( t->spTrain.valPtr, ERROR_MEM_CLEANUP ); + if (t->spTrain.rowCsrPtr) cuda_free( t->spTrain.rowCsrPtr, ERROR_MEM_CLEANUP ); + + if (t->spTest.rowPtr || t->spTest.colPtr || t->spTest.valPtr) + cusparseDestroyMatDescr( t->spTest.descr ); + if (t->spTest.rowPtr) cuda_free( t->spTest.rowPtr, ERROR_MEM_CLEANUP ); + if (t->spTest.colPtr) cuda_free( t->spTest.colPtr, ERROR_MEM_CLEANUP ); + if (t->spTest.valPtr) cuda_free( t->spTest.valPtr, ERROR_MEM_CLEANUP ); + if (t->spTest.rowCsrPtr) cuda_free( t->spTest.rowCsrPtr, ERROR_MEM_CLEANUP ); +} diff --git a/code/cuda/RC-FINAL-5/dataset.h b/code/cuda/RC-FINAL-5/dataset.h new file mode 100644 index 0000000..d22193d --- /dev/null +++ b/code/cuda/RC-FINAL-5/dataset.h @@ -0,0 +1,106 @@ +#ifndef _H_DATASET__ +#define _H_DATASET__ + +#include + +typedef struct dataset{ + real *trainSet; + real *trainLabels; + real *testSet; + real *testLabels; + int trainSize; + int testSize; + + int rows; + int cols; + int numclasses; + + int *trainRowPtr, *trainColPtr, *testRowPtr, *testColPtr; + real *trainValPtr, *testValPtr; + int trainNNZ, testNNZ; +} ForestDataset; + +typedef struct spData { + int *rowPtr, *colPtr, *rowCsrPtr; + real *valPtr; + + int nnz; + + //int *cscRowPtr, *cscColPtr; + //real *cscValPtr; + + real *sortedVals; + int *P; + + cusparseMatDescr_t descr; + +} SparseDataset; + +typedef struct devDataSet{ + real *trainSet; + real *trainLabels; + real *testSet; + real *testLabels; + + real *weights; + int rows; + int cols; + + int testSize; + + int numclasses; + + SparseDataset spTrain; + SparseDataset spTest; + + //subsampling part here. 
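+ // spGradientSample / spHessianSample track the selection itself
+ // (permutation and per-sample scratch, sized by the sample size), while
+ // spSampledGradientTrain / spSampledHessianTrain hold the extracted
+ // submatrix in COO/CSR form (sized by the full training nnz).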
+ real *sampledGradientTrainSet; + real *sampledGradientTrainLabels; + int gradientSampleSize; + SparseDataset spGradientSample; + + real *sampledHessianTrainSet; + int hessianSampleSize; + SparseDataset spHessianSample; + + SparseDataset spSampledGradientTrain; + SparseDataset spSampledHessianTrain; + +}DeviceDataset; + +typedef struct params{ + real *sigma; + real *mu; +}GAUSSIAN_PARAMS; + +void printDataset( ForestDataset *t ); + + +void readMultiDataset( char *f_train_features, char *f_train_labels, + char *f_test_features, char *f_test_labels, ForestDataset *data, SCRATCH_AREA *s, int offset, int bias); +void readCIFARDataset( char *dir, char *train, char *test, ForestDataset *data, SCRATCH_AREA *s, int); +void readNewsgroupsDataset( char *train_features, char *train_labels, + char *test_features, char *test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset); +void readBinaryMatFile( char *train_features, char *train_labels, + char *test_features, char *test_labels, + ForestDataset *data, SCRATCH_AREA *s, int offset); + +//int tokenize_learn_multiclass( char *line, real* t, int curIndex, int *counters, int **idx); +int tokenize_string( char *line, real *out, int bias ); +void tokenize_populate(char *line, real *train_set, int *count, int size, int bias); + +int tokenize_binary_string( char *line, int bias, int *nnz); +int tokenize_binary_populate( char *line, int bias, int *row, int *col, real *val, real *label, int rowNum ); + + +void initialize_device_data( ForestDataset *s, DeviceDataset *t); +void initialize_device_data_sparse( ForestDataset *s, DeviceDataset *t ); +void cleanup_dataset( ForestDataset *s, DeviceDataset *t); + +real findMaxInDataset( real *src, int rows, int cols ); +void preprocessDataset( real *src, int rows, int cols, real maxval); + + + +#endif diff --git a/code/cuda/RC-FINAL-5/gen_random.cu b/code/cuda/RC-FINAL-5/gen_random.cu new file mode 100644 index 0000000..6a3a6bd --- /dev/null +++ b/code/cuda/RC-FINAL-5/gen_random.cu @@ -0,0 +1,75 @@ +#include "gen_random.h" + +#include "cuda_types.h" +#include "cuda_utils.h" + +#include "time.h" + +void getRandomVector (int n, real *hostPtr, real *devPtr) { + + curandGenerator_t gen ; + int m = n + n % 2; + + /* Create pseudo - random number generator */ + curandCheckError ( curandCreateGenerator (&gen , CURAND_RNG_PSEUDO_DEFAULT ) ); + + /* Set seed */ + //curandCheckError ( curandSetPseudoRandomGeneratorSeed ( gen , 1234ULL )) ; + curandCheckError ( curandSetPseudoRandomGeneratorSeed ( gen , time(NULL) )) ; + + /* Generate n floats on device */ + //curandCheckError ( curandGenerateNormalDouble ( gen , devPtr , m, 0, 1.)) ; + curandCheckError ( curandGenerateUniformDouble ( gen , devPtr , m)) ; + + /* Copy device memory to host */ + //copy_host_device( hostPtr, devPtr, sizeof(real) * n, cudaMemcpyDeviceToHost, + // ERROR_MEMCPY_DEVICE_HOST ); + /* Cleanup */ + curandCheckError ( curandDestroyGenerator ( gen ) ); +} + +/* +Random Shuffle Here. +https://stackoverflow.com/questions/15961119/how-to-create-a-random-permutation-of-an-array +*/ +void randomShuffle( int *idx, int n) +{ + int j, temp; + for (int i = n - 1; i >= 0; i --){ + j = rand () % (i+1); + + temp = idx[i]; + idx[i] = idx[j]; + idx[j] = temp; + } +} + + +/* +Floyd's algorithm Here. 
+https://stackoverflow.com/questions/1608181/unique-random-numbers-in-an-integer-array-in-the-c-programming-language +*/ + +void genRandomVector( int *idx, int m, int n ) { + + int in, im; + int rn, rm; + im = 0; + + for (in = 0; in < n && im < m; ++in ){ + rn = n - in; + rm = m - im; + + if (rand () % rn < rm ){ + idx[ im ++] = in + 1; + } + } + + if ( im != m ){ + fprintf( stderr, "Failed to generate required number of random numbers ... "); + exit (-1); + } + + randomShuffle( idx, m ); +} + diff --git a/code/cuda/RC-FINAL-5/gen_random.h b/code/cuda/RC-FINAL-5/gen_random.h new file mode 100644 index 0000000..6182652 --- /dev/null +++ b/code/cuda/RC-FINAL-5/gen_random.h @@ -0,0 +1,12 @@ +#ifndef __H_GEN_RANDOM__ +#define __H_GEN_RANDOM__ + +#include "cuda_types.h" + +void getRandomVector (int n, real *hostPtr, real *devPtr); + +void randomShuffle( int *idx, int m ); +void genRandomVector( int *idx, int m, int n ); + + +#endif diff --git a/code/cuda/RC-FINAL-5/linesearch.c b/code/cuda/RC-FINAL-5/linesearch.c new file mode 100644 index 0000000..a5e7835 --- /dev/null +++ b/code/cuda/RC-FINAL-5/linesearch.c @@ -0,0 +1,69 @@ +#include "linesearch.h" +#include "logistic_fn_indicator.h" +#include "cuda_utils.h" +#include "print_utils.h" + +#include "softmax_multiclass.h" + +real cg_linesearch (real *d, real *weights, real rho, real c, SparseDataset *spfeatures, real *features, real *target, + real lambda, int rows, int cols, int numclasses, real *gk, real *xx, real *devPtr, real *hostPtr, real *pageLocked) +{ + real alphak = 1.; + real temp; + real *fk = &pageLocked[0]; + real *fk1 = &pageLocked[1]; + real *nextPagePtr = pageLocked + 2; + + real *x = devPtr; + real *nextDevPtr = x + numclasses * cols; + int iterations = 0; + + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, weights, 1, x, 1) ); + + /* + if (numclasses == 1) + logistic_fn_indicator( features, spfeatures, target, x, lambda, rows, cols, fk, nextDevPtr, hostPtr); + else + */ + *fk = softmax_multiclass_fx (spfeatures, features, target, rows, cols, numclasses, x, + lambda, nextDevPtr, hostPtr, nextPagePtr); +//fprintf (stderr, "%e, %d, %d, %d\n", *fk, rows, cols, numclasses ); + + //xx = x; + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, x, 1, xx, 1) ); + + //x = x + alphak*d + cublasCheckError( cublasDaxpy( cublasHandle, numclasses * cols, &alphak, d, 1, x, 1) ); + + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, d, 1, &temp )) ; + + /* + if (numclasses == 1) + logistic_fn_indicator( features, spfeatures, target, x, lambda, rows, cols, fk1, nextDevPtr, hostPtr); + else + */ + *fk1 = softmax_multiclass_fx (spfeatures, features, target, rows, cols, numclasses, x, + lambda, nextDevPtr, hostPtr, nextPagePtr); +//fprintf (stderr, "%e, %d, %d, %d\n", *fk1, rows, cols, numclasses ); + + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, gk, 1, d, 1, &temp) ); + while (((*fk1) > ((*fk) + c * alphak * temp)) && (iterations < 50)){ + alphak *= rho; + + cublasCheckError( cublasDcopy( cublasHandle, numclasses * cols, xx, 1, x, 1) ); + cublasCheckError( cublasDaxpy( cublasHandle, numclasses * cols, &alphak, d, 1, x, 1) ); + + /* + if (numclasses == 1) + logistic_fn_indicator( features, spfeatures, target, x, lambda, rows, cols, fk1, nextDevPtr, hostPtr); + else + */ + *fk1 = softmax_multiclass_fx (spfeatures, features, target, rows, cols, numclasses, x, + lambda, nextDevPtr, hostPtr, nextPagePtr); + + iterations ++; +//fprintf (stderr, "%e, %d, %d, %d\n", *fk1, rows, cols, numclasses 
);
+ }
+ //fprintf( stderr, "..... line search iterations.... %d ( %2.6e, %2.6e) \n", iterations, *fk1, (*fk + c * alphak * temp) );
+ return alphak;
+}
diff --git a/code/cuda/RC-FINAL-5/linesearch.h b/code/cuda/RC-FINAL-5/linesearch.h
new file mode 100644
index 0000000..7c387cf
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/linesearch.h
@@ -0,0 +1,10 @@
+#ifndef __H_LINESEARCH__
+#define __H_LINESEARCH__
+
+#include
+#include
+
+real cg_linesearch (real *, real *, real , real , SparseDataset *, real *, real *,
+ real , int , int , int, real *, real *, real *, real *, real * );
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/logistic-driver.c b/code/cuda/RC-FINAL-5/logistic-driver.c
new file mode 100644
index 0000000..2490851
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/logistic-driver.c
@@ -0,0 +1,281 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dataset.h"
+#include "sparse_dataset.h"
+
+#include "cuda_environment.h"
+#include "newton_cg.h"
+#include "utils.h"
+#include "cuda_utils.h"
+#include "logistic_fn_indicator.h"
+
+#include "softmax_multiclass.h"
+
+cublasHandle_t cublasHandle;
+cusparseHandle_t cusparseHandle;
+int BLOCKS, BLOCK_SIZE, BLOCKS_POW_2;
+void *dscratch;
+
+int main(int argc, char **argv){
+
+ // Data variables.
+ ForestDataset forestData;
+ DeviceDataset devData;
+ SCRATCH_AREA scratch;
+ NEWTON_CG_PARAMS params;
+
+ real trainingTime_s, classificationTime_s;
+ real trainingTime_t, classificationTime_t;
+ int test_case_no = 1;
+ int nConIterations;
+
+ int DATASET_TYPE = 1;
+ double l = 1e-6;
+ int max_cg_iterations = -1;
+ double cg_tolerance = 0;
+
+ if (argc <= 4) {
+ fprintf( stderr, "usage: <dataset-id> <lambda> <max-cg-iterations> <cg-tolerance>\n");
+ exit (-1);
+ }
+
+ DATASET_TYPE = atoi( argv[1] );
+ l = atof ( argv[2] );
+ max_cg_iterations = atoi (argv[3] );
+ cg_tolerance = atof( argv[4] );
+
+ // Create the CUDA Environment Here.
+ // Memory and device settings here.
+ cuda_env_init (&scratch);
+ #ifdef __debug__
+ fprintf( stderr, "Scratch Area initialized ... 
\n"); + #endif + + + switch( DATASET_TYPE ) { + + case 1: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_labels.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 11: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_labels.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_features.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 2: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 12: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 3: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mnist/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 13: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/mnist/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 4: + readCIFARDataset( + "/mnt/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 1 ); + break; + + case 14: + readCIFARDataset( + "/mnt/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 0 ); + break; + + case 5: + readNewsgroupsDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/newsgroups/test_vec.txt", + &forestData, &scratch ); + break; + + case 15: + readNewsgroupsDataset ( + "/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_mat.txt", + 
"/mnt/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_vec.txt", + &forestData, &scratch ); + break; + + //Logistic Datasets Here + case 6: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/mushrooms/test_vec.txt", + &forestData, &scratch, 1, 1 ); + break; + + case 7: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_vec.txt", + &forestData, &scratch, 1, 1 ); + break; + case 8: + readMultiDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/gisette/test_vec.txt", + &forestData, &scratch, 1, 1 ); + break; + + //Sparse Logistic Datasets Here + case 9: + readNewsgroupsDataset( + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/rcv1/test_vec.txt", + &forestData, &scratch ); + break; + case 10: + readNewsgroupsDataset ( + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/train_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/train_vec.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/test_mat.txt", + "/mnt/home/skylasa/solvers/dataset/raw-data/real-sim/test_vec.txt", + &forestData, &scratch ); + break; + + } + #ifdef __debug__ + fprintf( stderr, "Done with initialization of the dataset .... \n"); + fprintf( stderr, "Blocks for %d data points... \n", forestData.rows); + #endif + + compute_blocks (&BLOCKS, &BLOCK_SIZE, forestData.trainSize); + compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); + if (BLOCKS_POW_2 < 32) BLOCKS_POW_2 = 32; + #ifdef __debug__ + fprintf ( stderr, "Blocks: %d, BlockSize: %d, Power_2: %d\n", BLOCKS, BLOCK_SIZE, BLOCKS_POW_2); + #endif + + + // Move the data to the Device. + if ((forestData.trainSet == NULL) && (forestData.testSet == NULL)) + { + initialize_device_data_sparse( &forestData, &devData ); + initMatDescriptors ( &devData ); + convertToCSR ( &devData, scratch.devWorkspace ); + + initMatDescriptorsForSampling( &devData ); + initMatDescriptorsForSparseSampling( &devData ); + } else { + initialize_device_data( &forestData, &devData ); + initMatDescriptorsForSampling( &devData ); + } + + #ifdef __debug__ + fprintf( stderr, "Inittialized the Device with the dataset ... \n"); + #endif + + //Train the dataset here. + params.max_iterations = 10; + params.tolerance = 1e-5; + params.iflag = 0; + + params.lambda = l; + params.max_cg_iterations = max_cg_iterations; + params.cg_tolerance = cg_tolerance; + + params.gx_sampling = 0; + params.hx_sampling = 0; + + fprintf( stderr, "Start of TestCase: %d\n", test_case_no); + trainingTime_s = Get_Time (); + nConIterations = newton_cg_multi_optimized( &forestData, &devData, ¶ms, &scratch, params.gx_sampling); + trainingTime_t = Get_Timing_Info( trainingTime_s ); + #ifdef __debug__ + fprintf( stderr, "Done with training .... 
\n"); + #endif + + //exit (-1); + + //Predict the testing set here. + real accuracy = 0; + classificationTime_s = Get_Time (); + accuracy = softmax_predict(&devData.spTest, devData.testSet, forestData.testLabels, devData.weights, devData.testSize, + devData.cols, devData.numclasses, scratch.hostWorkspace, scratch.devWorkspace, + 1, forestData.testSet); + classificationTime_t = Get_Timing_Info( classificationTime_s ); + //fprintf( stderr, "Start of TestCase: %d\n", test_case_no); + fprintf( stderr, "Dataset: %d \n", DATASET_TYPE ); + //fprintf( stderr, "Column Selected : %d\n", col ); + fprintf( stderr, "NumClasses: %d\n", devData.numclasses ); + fprintf( stderr, "Lambda: %e\n", params.lambda ); + fprintf( stderr, "NewtonIterations: %d\n", params.max_iterations); + fprintf( stderr, "NewtonTolerance: %e\n", params.tolerance); + fprintf( stderr, "CGIterations: %d\n", params.max_cg_iterations); + fprintf( stderr, "CGTolerance: %e\n", params.cg_tolerance ); + fprintf( stderr, "DataSetSize: %d\n", forestData.rows ); + //fprintf( stderr, "TrainingPer: %3.2f\n", d * 100.); + fprintf( stderr, "TrainingSize: %d\n", forestData.trainSize); + fprintf( stderr, "Features: %d\n", forestData.cols ); + fprintf( stderr, "TrainingTime: %d\n", (unsigned int)(trainingTime_t * 1000) ); + fprintf( stderr, "TestingSize: %d\n", forestData.testSize ); + fprintf( stderr, "ClassificationTime: %d\n", (unsigned int)(classificationTime_t*1000) ); + fprintf( stderr, "TestAccuracy: %3.2f\n", accuracy ); + fprintf( stderr, "NewtonIterationsCon: %d\n", nConIterations ); + fprintf( stderr, "NewtonConvergence: %d\n", (int)params.iflag ); + fprintf( stderr, "End of TestCase: %d\n", test_case_no); + fprintf( stderr, "\n\n\n"); + + //cleanup the dataset pointers here. + cleanup_dataset(&forestData, &devData ); + + test_case_no ++; + + //Cleanup host/device Here. 
+ cuda_env_cleanup(&scratch);
+
+ return 0;
+}
diff --git a/code/cuda/RC-FINAL-5/logistic_fn_indicator.cu b/code/cuda/RC-FINAL-5/logistic_fn_indicator.cu
new file mode 100644
index 0000000..332e02e
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/logistic_fn_indicator.cu
@@ -0,0 +1,660 @@
+#include "logistic_fn_indicator.h"
+#include "cuda_utils.h"
+
+#include "mat_functions.h"
+
+#include "gen_random.h"
+#include "print_utils.h"
+
+#include "classification_kernels.h"
+
+void logistic_fn_indicator (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *fn, real *devPtr, real *hostPtr)
+{
+ //host
+ real *alpha = hostPtr;
+ real *beta = alpha + 1;
+ real *nrm_weights = beta + 1;
+
+ //device
+ real *t = devPtr;
+ real *out = t + rows;
+ real *redResult = out + rows;
+
+ //features * weights
+ *alpha = 1;
+ *beta = 0;
+
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ alpha, features, rows,
+ weights, 1,
+ beta, t, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, beta, t ) );
+ }
+
+/*
+ fprintf( stderr, "printing t " );
+ printVector( t + rows - 1, 1, NULL);
+ fprintf( stderr, "printing target " );
+ printVector( target + rows - 1, 1, NULL );
+ fprintf( stderr, "printing out " );
+ printVector( out + rows - 1,1, NULL );
+*/
+
+ ker_log_sum <<< BLOCKS, BLOCK_SIZE >>> ( t, target, rows, out);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ ker_reduction <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * sizeof(real) >>> (out, redResult, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ ker_reduction <<< 1, BLOCKS_POW_2, BLOCKS_POW_2 * sizeof(real) >>> (redResult, fn, BLOCKS);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //add the regularization term here.
+ cublasCheckError( cublasDnrm2( cublasHandle, cols, weights, 1, nrm_weights) );
+
+ //since we are minimizing this function here.
+ (*fn) += pow(*nrm_weights, 2.) * (lambda/2.0);
+}
+
+// sigma_i( x_ij * ( y_i - g(z_i) ) )
+// g(z_i) = sigmoid( x_ij * w_i )
+
+void logistic_fn_indicator_gx (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *gn, real *devPtr, real *hostPtr, int samplingType, int numFeatures)
+{
+ //device
+ real *t = devPtr;
+
+ //host
+ real *alpha = hostPtr;
+ real *beta = alpha + 1;
+
+ //blocks
+ int numBlocks = BLOCKS;
+ if (samplingType != 0)
+ numBlocks = rows / BLOCK_SIZE + ((rows % BLOCK_SIZE) == 0 ? 0 : 1);
+
+ *alpha = 1;
+ *beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ alpha, features, rows,
+ weights, 1,
+ beta, t, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, beta, t ) );
+ }
+
+ ker_sigmoid_target <<< numBlocks, BLOCK_SIZE >>> (t, target, rows, t);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ *alpha = 1;
+ *beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_T, rows, cols,
+ alpha, features, rows,
+ t, 1,
+ beta, gn, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ t, beta, gn ) );
+ }
+
+ // sampling scaling: uniform (samplingType == 1) and non uniform
+ // (samplingType == 2) apply the same factor to the sampled gradient.
+ *alpha = ((real)numFeatures)/((real)rows);
+ if ((samplingType == 1) || (samplingType == 2)) {
+ cublasCheckError( cublasDscal( cublasHandle, cols, alpha, gn, 1) );
+ }
+
+ //regularization here.
+ *alpha = lambda;
+ cublasCheckError( cublasDaxpy( cublasHandle, cols, alpha, weights, 1, gn, 1 ) );
+}
+
+GLOBAL void ker_hx_C_scale (real *A, real *B, real *C, int rows, real *scale )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < rows){
+ C[ idx ] = (1. / scale[ idx ]) * ( A[ idx ] * B[ idx ] - B[ idx ] * ( A[ idx ] * B[ idx ] ) );
+ }
+}
+
+GLOBAL void ker_hx_C (real *A, real *B, real *C, int rows )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < rows){
+ C[ idx ] = A[ idx ] * B[ idx ] - B[ idx ] * ( A[ idx ] * B[ idx ] );
+ }
+}
+
+void logistic_fn_indicator_hx_matvec (real *features, SparseDataset *spFeatures, real *weights, real *vector,
+ real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr, int samplingType, real *scaleTerms, int numFeatures)
+{
+ real *A = devPtr;
+ real *B = A + rows;
+ real *C = B + rows;
+
+ real alpha, beta;
+
+ //blocks
+ int numBlocks = BLOCKS;
+ if (samplingType != 0)
+ numBlocks = rows / BLOCK_SIZE + ((rows % BLOCK_SIZE) == 0 ? 0 : 1);
+
+ //compute A = matrix * vector
+ alpha = 1;
+ beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ vector, 1,
+ &beta, A, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ vector, &beta, A ) );
+ }
+
+ //compute B = probability vector here: matrix * weights
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ weights, 1,
+ &beta, B, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, &beta, B ) );
+ }
+
+ ker_sigmoid <<< numBlocks, BLOCK_SIZE >>> (B, rows, B);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //Compute C = A.B - B.(A.B)
+ if (samplingType == 2) {
+ ker_hx_C_scale <<< numBlocks, BLOCK_SIZE >>> (A, B, C, rows, scaleTerms);
+ } else {
+ ker_hx_C<<< numBlocks, BLOCK_SIZE >>> (A, B, C, rows);
+ }
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //compute X^T * C = matvec
+ if (spFeatures->valPtr == NULL) {
+ alpha = 1.0;
+ beta = 0;
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_T, rows, cols,
+ &alpha, features, rows,
+ C, 1,
+ &beta, hx, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ C, &beta, hx ) );
+ }
+
+ //appropriate scaling
+ if (samplingType == 1){
+ alpha = ((real)numFeatures/ ((real) rows));
+ cublasCheckError ( cublasDscal( cublasHandle, cols, &alpha, hx, 1 ) );
+ }
+
+ //regularization here.
+ //this is a matrix operation.
+ int colBlockSize = BLOCK_SIZE;
+ int colBlocks = (cols % colBlockSize) == 0 ? (cols/colBlockSize) : (cols/colBlockSize + 1);
+ ker_hx_matvec_reg <<< colBlocks, colBlockSize >>> (hx, lambda, vector, cols);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+}
+
+
+void logistic_fn_indicator_hx (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr)
+{
+ //device
+ real *t = devPtr;
+ real *t_minus = t + rows;
+ real *C = t_minus + rows;
+
+ //host
+ real *alpha = hostPtr;
+ real *beta = alpha + 1;
+
+ *alpha = 1;
+ *beta = 0;
+ if ( spFeatures == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ alpha, features, rows,
+ weights, 1,
+ beta, t, 1 ) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, beta, t ) );
+ }
+ cublasCheckError( cublasDcopy( cublasHandle, rows, t, 1, t_minus, 1 ) );
+
+ //apply sigmoid here.
+ ker_sigmoid <<< BLOCKS, BLOCK_SIZE >>> (t, rows, t);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //fprintf( stderr, "Output from the sigmoid function \n");
+ //printVector( t, rows, NULL);
+
+ *alpha = -1;
+ cublasCheckError ( cublasDscal( cublasHandle, rows, alpha, t_minus, 1 ) );
+ ker_sigmoid <<< BLOCKS, BLOCK_SIZE >>> (t_minus, rows, t_minus);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //fprintf( stderr, "Output from the sigmoid function -t \n");
+ //printVector( t_minus, rows, NULL);
+
+ //element wise product of two vectors here.
+ ker_ele_vec_product <<< BLOCKS, BLOCK_SIZE >>>
+ ( t, t_minus, rows, t );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, "Output from the ele vector product\n");
+ //printVector( t, rows, NULL);
+
+ // perform the final mat * mat product here.
+ // perform diag(s * neg_s) * features.
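+ // In matrix form, the dense branch below builds the full logistic Hessian
+ // H = X^T * diag( s .* (1 - s) ) * X + lambda * I, with s = sigmoid(X * w);
+ // at this point t already holds s .* (1 - s) = sigmoid(Xw) .* sigmoid(-Xw).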
+ if (spFeatures == NULL ){
+ cublasCheckError (cublasDdgmm( cublasHandle, CUBLAS_SIDE_LEFT,
+ rows, cols, features, rows,
+ t, 1,
+ C, rows) );
+ //perform the first. product( features^T x above_result);
+ *alpha = 1;
+ *beta = 0;
+ cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
+ cols, cols, rows,
+ alpha, features, rows,
+ C, rows, beta, hx, cols ) );
+ } else {
+ //Not implemented here.
+ //since we are using matvec here for Hessian
+ ;
+ }
+
+ //regularization here.
+ //this is a matrix operation.
+ int colBlockSize = BLOCK_SIZE;
+ int colBlocks = ((cols % colBlockSize) == 0) ? (cols/colBlockSize) : (cols/colBlockSize + 1);
+ //fprintf ( stderr, "Regularization BLOCKS --> %d and BlockSize -- > %d \n", colBlocks, colBlockSize );
+ ker_mat_identity <<< colBlocks, colBlockSize >>>
+ (hx, lambda, cols);
+ //(hx, 2 * (lambda), cols);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+///////////////////////////////////
+//Non uniform subsampling code here.
+///////////////////////////////////
+
+int generateNonUniformSample_log( real *probs, real *scaleTerms, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr)
+{
+ int count = 0;
+ real *devIndices = devPtr + rows;
+
+ getRandomVector( rows, NULL, devPtr);
+
+ ker_compute_probs <<< BLOCKS, BLOCK_SIZE >>>
+ ( probs, rows, sampleSize, devPtr, devIndices );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ copy_host_device( hostPtr, devIndices, sizeof(real) * rows,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST);
+
+ for (int i = 0; i < rows; i ++){
+ if (hostPtr[i] != 0)
+ selIndices[ count ++] = i;
+ }
+
+//fprintf( stderr, "selected points for non uniform sampling is %d \n", count );
+
+ //prepare scaleTerms here.
+ cuda_memset( scaleTerms, 0, sizeof(real) * rows, 0x99 );
+ cuda_memset( devIndices, 0, sizeof(real) * rows, 0x99 );
+ copy_host_device( selIndices, devIndices, sizeof(int) * count,
+ cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+
+ int blocks = count / BLOCK_SIZE +
+ ((count % BLOCK_SIZE) == 0 ? 0 : 1 );
+ ker_init_scaleTerms <<< blocks, BLOCK_SIZE >>>
+ ( scaleTerms, count, probs, (int *)devIndices );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ return count;
+}
+
+void computeRowProbabilities_log( SparseDataset *spfeatures, real *features, int rows, int cols,
+ real *dHXW, real *rowNrms, real *probs, real *devPtr )
+{
+ ker_compute_dHXW_nrm_log <<< BLOCKS, BLOCK_SIZE >>>
+ ( dHXW, rowNrms, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //reduce to compute the sum
+ reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE * sizeof (real) >>>
+ (dHXW, devPtr, rows );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof (real) >>>
+ (devPtr, devPtr + BLOCK_SIZE, BLOCKS);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ ker_normalize <<< BLOCKS, BLOCK_SIZE >>>
+ (dHXW, rows, devPtr + BLOCK_SIZE, probs );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+
+void computeRowNorms_log( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr )
+{
+ if (features != NULL) {
+ ker_row_norms <<< BLOCKS, BLOCK_SIZE >>>
+ ( features, rows, cols, rowNrms );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ } else {
+ cudaMemcpy( spfeatures->valPtr, spfeatures->sortedVals,
+ sizeof(real) * spfeatures->nnz, cudaMemcpyDeviceToDevice );
+
+ int blocks = spfeatures->nnz / (BLOCK_SIZE) +
+ ((spfeatures->nnz % (BLOCK_SIZE)) == 0 ? 0 : 1 );
+ ker_sqr_elements <<< blocks, BLOCK_SIZE >>>
+ (spfeatures->valPtr, spfeatures->nnz, 1, devPtr);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //matvec here. for row sums
+ real alpha = 1.0;
+ real beta = 0;
+
+ //init the vector here.
+ blocks = cols / BLOCK_SIZE + (( cols % BLOCK_SIZE == 0) ? 0 : 1 );
+ ker_init_ones <<< blocks, BLOCK_SIZE >>>
+ ( devPtr , cols );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spfeatures->nnz,
+ &alpha, spfeatures->descr, spfeatures->valPtr, spfeatures->rowCsrPtr,
+ spfeatures->colPtr, devPtr, &beta, rowNrms)
+ );
+ ker_sqrt_elements <<< BLOCKS, BLOCK_SIZE >>>
+ ( rowNrms, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ }
+}
+
+void computeHXW_log (SparseDataset *spfeatures, real *features, int rows, int cols, real *weights, real *B) {
+ real alpha;
+ real beta;
+
+ alpha = 1.0;
+ beta = 0;
+
+ if (spfeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ weights, 1,
+ &beta, B, 1) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spfeatures->nnz,
+ &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, spfeatures->colPtr,
+ weights, &beta, B ) );
+ }
+
+ ker_sigmoid <<< BLOCKS, BLOCK_SIZE >>> (B, rows, B);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+
+//
+//
+// PREDICTION HERE. For the Logistic Regression with Indicator random variable
+// as the class label
+//
+void logistic_regression_predict( real *features, SparseDataset *spFeatures, real *weights, real *labels, real *hostLabels, int rows, int cols, real *accuracy, real *devPtr, real *hostPtr )
+{
+ real alpha, beta;
+ real *sigmoid_predictions = devPtr;
+ real nrm;
+ int counter0, counter1;
+
+ alpha = 1;
+ beta = 0;
+ if (spFeatures->valPtr == NULL) {
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, rows, cols,
+ &alpha, features, rows,
+ weights, 1,
+ &beta, sigmoid_predictions, 1 ) );
+ } else {
+ cusparseCheckError(
+ cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+ rows, cols, spFeatures->nnz,
+ &alpha, spFeatures->descr, spFeatures->sortedVals, spFeatures->rowCsrPtr, spFeatures->colPtr,
+ weights, &beta, sigmoid_predictions ) );
+ }
+
+ //apply the sigmoid function here.
+ int tblocks;
+ if (rows <= BLOCK_SIZE)
+ tblocks = 1;
+ else
+ tblocks = (rows % BLOCK_SIZE) == 0 ? rows / BLOCK_SIZE : (rows/BLOCK_SIZE) + 1;
+
+ ker_sigmoid_classify <<< tblocks, BLOCK_SIZE >>> (sigmoid_predictions, rows);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ copy_host_device( hostPtr, sigmoid_predictions, sizeof(real) * rows, cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST );
+
+ *accuracy = 0;
+ counter0 = counter1 = 0;
+ for (int i = 0; i < rows; i ++)
+ {
+ if (hostPtr[i] == (hostLabels[i] - 1.0)) (*accuracy) ++;
+ if (hostPtr[i] == 1.) counter1 ++;
+ if (hostPtr[i] == 0.) 
counter0 ++; + } + //fprintf( stderr, "0: %d, 1: %d \n", counter0, counter1 ); + + *accuracy = ((*accuracy) / rows) * 100.; +} + + + +//////////////////////////////////////////////////////// +//Derivative Test +//////////////////////////////////////////////////////// +/* +void getRandomVectorLogistic (int n, real *hostPtr, real *devPtr) { + + curandGenerator_t gen ; + int m = n + n % 2; + + curandCheckError ( curandCreateGenerator (&gen , CURAND_RNG_PSEUDO_DEFAULT ) ); + + curandCheckError ( curandSetPseudoRandomGeneratorSeed ( gen , 1234ULL )) ; + + curandCheckError ( curandGenerateNormalDouble ( gen , devPtr , m, 0, .25)) ; + //curandCheckError ( curandGenerateUniformDouble ( gen , devPtr , m)) ; + + copy_host_device( hostPtr, devPtr, sizeof(real) * m, cudaMemcpyDeviceToHost, + ERROR_MEMCPY_DEVICE_HOST ); + + curandCheckError ( curandDestroyGenerator ( gen ) ); +} +*/ + + +void logisticRegDerivativeTest ( real *features, real *target, int rows, int cols, + real *devPtr, real *hostPtr, real *pageLckPtr, int numpoints) +{ + int offset = cols % 4; + + real *constPoint = hostPtr; + real *hostPoint = constPoint + cols + offset; + real *dx = hostPoint + cols + offset; + real *ferror = dx + cols + offset; + real *herror = ferror + numpoints; + real *dxs = herror + numpoints; + real *nextHostPtr = dxs + numpoints; + + real *devPoint = devPtr; + real *devDx = devPoint + cols + offset; + real *gradient = devDx + cols + offset; + real *hessian = gradient + cols + offset; + real *nextDevPtr = hessian + cols * cols + offset; + + real *vv = pageLckPtr; + real *vhv = vv + 1; + real *dxnrm = vhv + 1; + real *f = dxnrm + 1; + real *f0 = f + 1; + real *nextPagePtr = f0 + 1; + + real alpha, beta; + + fprintf( stderr, "Number of random numbers to be generated: %d \n", cols ); + + memset( constPoint, 0, sizeof(real) * cols ); + for (int i = 0; i < cols; i ++) constPoint[i] = 0.; + + copy_host_device( constPoint, devPoint, sizeof(real) * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + //getRandomVectorLogistic( cols, dx, nextDevPtr); + getRandomVector( cols, dx, nextDevPtr); + //for (int i = 0; i < cols; i ++) dx[i] = 0; + + //printHostVector( dx, cols ); + + //f0 + //logistic_fn_indicator( features, target, devPoint, 0, rows, cols, f0, nextDevPtr, nextHostPtr); + + //g0 + //logistic_fn_indicator_gx( features, NULL, target, devPoint, 0, rows, cols, gradient, nextDevPtr, nextHostPtr); + //printVector( gradient, 5, NULL ); + + //h0 + //logistic_fn_indicator_hx( features, target, devPoint, 0, rows, cols, hessian, nextDevPtr, nextHostPtr ); + + fprintf( stderr, "Starting the derivative test .. %f\n", *f0); + + for (int i = 0; i < numpoints; i ++) { + + for (int j = 0; j < cols; j ++) hostPoint[j] = constPoint[j] + dx[j]; + + copy_host_device( hostPoint, devPoint, sizeof(real) * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_DEVICE_HOST); + copy_host_device( dx, devDx, sizeof(real) * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + //function evaluation here. 
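+ // Taylor-expansion check: with f evaluated at constPoint + dx, the
+ // first-order residual f - (f0 + g'dx) should decay like ||dx||^2 and the
+ // second-order residual f - (f0 + g'dx + 0.5 * dx'H dx) like ||dx||^3 as
+ // dx is halved each pass; the dxs_2.txt / dxs_3.txt files written below
+ // are the reference slopes.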
+ //logistic_fn_indicator( features, target, devPoint, 0, rows, cols, f, nextDevPtr, nextHostPtr);
+
+ //first order error
+ //printVector( gradient, 5, NULL );
+ //printVector( devPoint, 5, NULL );
+ //fprintf( stderr, "Gradient sum: %e \n", computeWeightSum( gradient, cols ));
+ cublasCheckError( cublasDdot( cublasHandle, cols, gradient, 1, devDx, 1, vv) );
+ ferror[i] = (*f - (*f0 + *vv)) / (real)rows;
+
+ //second order error
+ alpha = 1;
+ beta = 0;
+ cublasCheckError( cublasDgemv( cublasHandle, CUBLAS_OP_N, cols, cols,
+ &alpha, hessian, cols,
+ devDx, 1,
+ &beta, nextDevPtr, 1) );
+ *vhv= 0;
+ cublasCheckError( cublasDdot( cublasHandle, cols, devDx, 1, nextDevPtr, 1, vhv) );
+
+ herror[i] = (*f - (*f0 + *vv + 0.5 * (*vhv) )) / (real) rows;
+
+ fprintf( stderr, "%d: f --> %e, vv --> %e, vhv--> %e, ferr: %e, herr: %e \n",
+ i, *f, *vv, *vhv, ferror[i], herror[i] );
+
+ //dxs here.
+ *dxnrm = 0;
+ cublasCheckError( cublasDnrm2( cublasHandle, cols, devDx, 1, dxnrm));
+ dxs[i] = *dxnrm;
+ //printVector( devDx, 10, NULL);
+ //fprintf( stderr, "DevDx norm is ----> %e, %e, %e \n", *dxnrm, pow( *dxnrm, 2.), pow(*dxnrm, 3.) );
+
+ for (int j = 0; j < cols; j ++) dx[j] = dx[j] / 2.0;
+ //break;
+ }
+
+ writeVector( ferror, numpoints, "./ferror.txt", 1 ); //host
+ writeVector( herror, numpoints, "./herror.txt", 1 ); //host
+
+ //write dx.^2 here
+ for (int j = 0; j < numpoints; j ++) constPoint[j] = pow(dxs[j], 2.);
+ writeVector( constPoint, numpoints, "./dxs_2.txt", 1 ); //host
+
+ //write dx.^3 here
+ for (int j = 0; j < numpoints; j ++) constPoint[j] = pow(dxs[j], 3.);
+ writeVector( constPoint, numpoints, "./dxs_3.txt", 1 ); //host
+}
+
diff --git a/code/cuda/RC-FINAL-5/logistic_fn_indicator.h b/code/cuda/RC-FINAL-5/logistic_fn_indicator.h
new file mode 100644
index 0000000..e6f9663
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/logistic_fn_indicator.h
@@ -0,0 +1,28 @@
+
+#ifndef __H_LOGISTIC_FN_INDICATOR__
+#define __H_LOGISTIC_FN_INDICATOR__
+
+#include "cuda_types.h"
+#include "dataset.h"
+
+void logistic_fn_indicator (real *features, SparseDataset *spfeatures, real *target, real *weights, real lambda, int rows, int cols, real *fx, real *devPtr, real *hostPtr);
+void logistic_fn_indicator_gx (real *features, SparseDataset *spfeatures, real *target, real *weights, real lambda, int rows, int cols, real *gx, real *devPtr, real *hostPtr, int samplingType, int numFeatures);
+void logistic_fn_indicator_hx_matvec (real *features, SparseDataset *spFeatures, real *weights, real *vector,
+ real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr, int type, real *scale, int numFeatures);
+void logistic_fn_indicator_hx (real *features, SparseDataset *spFeatures, real *target, real *weights, real lambda, int rows, int cols, real *hx, real *devPtr, real *hostPtr);
+void logistic_regression_predict( real *, SparseDataset *, real *, real *, real *, int , int , real *, real *, real *);
+
+void logisticRegDerivativeTest ( real *features, real *target, int rows, int cols,
+ real *devPtr, real *hostPtr, real *pageLckPtr, int numpoints);
+
+
+//Non uniform functions
+int generateNonUniformSample_log( real *probs, real *scaleTerms, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr);
+void computeRowProbabilities_log( SparseDataset *spfeatures, real *features, int rows, int cols,
+ real *dHXW, real *rowNrms, real *probs, real *devPtr );
+void computeRowNorms_log( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr );
+void computeHXW_log (SparseDataset *spfeatures, 
real *features, int rows, int cols, real *weights, real *B ); + + + +#endif diff --git a/code/cuda/RC-FINAL-5/mat_functions.cu b/code/cuda/RC-FINAL-5/mat_functions.cu new file mode 100644 index 0000000..99e939a --- /dev/null +++ b/code/cuda/RC-FINAL-5/mat_functions.cu @@ -0,0 +1,126 @@ +#include "mat_functions.h" +#include "cuda.h" +#include "cuda_runtime.h" + +GLOBAL void ker_log_sum( real *t, real *target, int N, real *out) +{ + //extern __shared__ real sdata[]; + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if (idx < N) { + x = t[ idx ]; + if (x <= 0) + out[ idx ] = log( 1. + exp(x) ) - ((target[idx] - 1.) * t[ idx ]); + else + out[ idx ] = ( x + log( exp(-x) + 1.) ) - ((target[idx] - 1.) * t[ idx] ); + } +} + +GLOBAL void ker_sigmoid( real *s, int N, real *out) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + real alpha = 0; + + if (idx < N) { + x = s[ idx ]; + if ( x < 0 ) + out[ idx ] = exp( x ) / (1. + exp(x) ); + else + out[ idx ] = 1. / (1. + exp(-x) ); + } +} + +GLOBAL void ker_sigmoid_classify( real *s, int N ) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < N) { + if (s[ idx ] <= 0 ){ + if (exp(s[idx])/ ( (1. + exp(s[idx]) )) > 0.5) + s[idx] = 1.; + else + s[idx] = 0.; + } else { + if (1. / (1. + exp(-s[idx]) ) > 0.5) + s[idx] = 1.; + else + s[idx] = 0.; + } + } +} + +GLOBAL void ker_sigmoid_target( real *t, real *target, int N, real *out) +{ + real x = 0; + real alpha = 0; + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < N) { + x = t[ idx ]; + if (x < 0 ) + out[idx] = ( exp(x)/ ( 1. + exp(x) )) - (target[ idx ] - 1.); + else + out[idx] = ( 1./ ( 1. + exp(-x) )) - (target[ idx ] - 1.); + } +} + +GLOBAL void ker_ele_vec_product( real *t1, real *t2, int N, real *out) +{ + //extern __shared__ real sdata[]; + //real x = 0; + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < N) out[ idx ] = t1[ idx ] * t2[ idx ]; + //sdata[ threadIdx.x ] = x; + //if (idx < N) out[idx] = sdata[threadIdx.x] ; +} + +GLOBAL void ker_mat_identity( real *matrix, real gamma, int M) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < M) + matrix[ idx * M + idx ] += gamma; +} + +GLOBAL void ker_hx_matvec_reg ( real *hx, real gamma, real *vec, int c) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < c) { + hx[ idx ]+= gamma * vec[ idx ]; + } +} + + +GLOBAL void ker_reduction(const real *input, real *per_block_results, int n) +{ + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = input[i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } +} + diff --git a/code/cuda/RC-FINAL-5/mat_functions.h b/code/cuda/RC-FINAL-5/mat_functions.h new file mode 100644 index 0000000..9956166 --- /dev/null +++ b/code/cuda/RC-FINAL-5/mat_functions.h @@ -0,0 +1,15 @@ +#ifndef __H_MAT_FUNCTIONS__ +#define __H_MAT_FUNCTIONS__ + +#include "cuda_types.h" + +GLOBAL void ker_log_sum( real *t, real *target, int N, real *out); +GLOBAL void ker_sigmoid( real *target, int N, real *out); +GLOBAL void ker_sigmoid_classify( real *target, int N ); +GLOBAL void ker_sigmoid_target( real *t, real *target, int N, real *out); +GLOBAL 
void ker_ele_vec_product( real *t1, real *t2, int N, real *out);
+GLOBAL void ker_mat_identity (real *h, real reg_term, int M);
+GLOBAL void ker_hx_matvec_reg ( real *hx, real gamma, real *vec, int c);
+GLOBAL void ker_reduction(const real *h, real *out, int dim);
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/newton-driver.c b/code/cuda/RC-FINAL-5/newton-driver.c
new file mode 100644
index 0000000..6115730
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/newton-driver.c
@@ -0,0 +1,345 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dataset.h"
+#include "sparse_dataset.h"
+
+#include "cuda_environment.h"
+#include "newton_cg.h"
+#include "utils.h"
+#include "cuda_utils.h"
+#include "logistic_fn_indicator.h"
+
+#include "softmax_multiclass.h"
+
+cublasHandle_t cublasHandle;
+cusparseHandle_t cusparseHandle;
+int BLOCKS, BLOCK_SIZE, BLOCKS_POW_2;
+int HESSIAN_SAMPLING_SIZE, GRADIENT_SAMPLING_SIZE;
+void *dscratch;
+
+int main(int argc, char **argv){
+
+ // Data variables.
+ ForestDataset forestData;
+ DeviceDataset devData;
+ SCRATCH_AREA scratch;
+ NEWTON_CG_PARAMS params;
+
+ real trainingTime_s, classificationTime_s;
+ real trainingTime_t, classificationTime_t;
+ int test_case_no = 1;
+ int nConIterations;
+ int DATASET_TYPE = 1;
+
+ double l = 1e-6;
+ int max_cg_iterations = -1;
+ double cg_tolerance = 0;
+ int sampling_flag = 0;
+ int gpu = -1;
+
+ if (argc <= 8) {
+ fprintf( stderr, "usage: <dataset-id> <lambda> <max-cg-iterations> <cg-tolerance> <sampling-flag> <gpu-id> <hessian-sample-pct> <gradient-sample-pct>\n");
+ exit (-1);
+ }
+
+ DATASET_TYPE = atoi( argv[1] );
+ l = atof ( argv[2] );
+ max_cg_iterations = atoi (argv[3] );
+ cg_tolerance = atof( argv[4] );
+ sampling_flag = atoi( argv[5] );
+ gpu = atoi( argv[6] );
+ HESSIAN_SAMPLING_SIZE = atoi( argv[7] );
+ GRADIENT_SAMPLING_SIZE = atoi (argv[8] );
+
+ fprintf( stderr, "Dataset: %d, Lambda: %e, CGIterations: %d, CGTolerance: %e, SubSampling: %d, GPU: %d, HSample: %d, GSample: %d \n",
+ DATASET_TYPE, l, max_cg_iterations, cg_tolerance, sampling_flag, gpu, HESSIAN_SAMPLING_SIZE, GRADIENT_SAMPLING_SIZE );
+
+
+ // Create the CUDA Environment Here.
+ // Memory and device settings here.
+ cuda_env_init (&scratch, gpu);
+ #ifdef __debug__
+ fprintf( stderr, "Scratch Area initialized ... 
\n"); + #endif + + + switch( DATASET_TYPE ) { + + case 1: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/train_forest_multi_labels.txt", + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/raw-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 11: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/train_forest_multi_labels.txt", + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_features.txt", + "/home/skylasa/solvers/dataset/normalized-data/uci-covertype/test_forest_multi_labels.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 2: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 12: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/drive-diagnostics/test_vec.txt", + &forestData, &scratch, 0, 0 ); + break; + + case 3: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/mnist/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mnist/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/mnist/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mnist/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + + case 13: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/mnist/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mnist/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/mnist/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mnist/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + + case 4: + readCIFARDataset( + "/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 1 ); + break; + + case 14: + readCIFARDataset( + "/home/skylasa/solvers/dataset/raw-data/cifar-10/cifar-10-batches-bin/", + "data_batch_", "test_batch.bin", + &forestData, &scratch, 0 ); + break; + + case 5: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/raw-data/newsgroups/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/newsgroups/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/newsgroups/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/newsgroups/test_vec.txt", + &forestData, &scratch, 0 ); + break; + + case 15: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/newsgroups/test_vec.txt", + &forestData, &scratch, 0 ); + break; + + //Logistic Datasets Here + case 6: + readMultiDataset 
( + "/home/skylasa/solvers/dataset/raw-data/mushrooms/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mushrooms/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/mushrooms/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/mushrooms/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + case 16: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/mushrooms/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + + case 7: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/ijcnn1/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + case 17: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/ijcnn1/test_vec.txt", + &forestData, &scratch, 1, 0 ); + break; + case 8: + readMultiDataset ( + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_train.data", + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_train.labels01", + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_valid.data", + "/home/skylasa/solvers/dataset/raw-data/gisette/gisette_valid.labels01", + &forestData, &scratch, 1, 0 ); + break; + case 18: + readMultiDataset ( + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_train.data", + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_train.labels01", + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_valid.data", + "/home/skylasa/solvers/dataset/normalized-data/gisette/gisette_valid.labels01", + &forestData, &scratch, 1, 0 ); + break; + case 9: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/raw-data/rcv1/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/rcv1/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/rcv1/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/rcv1/test_vec.txt", + &forestData, &scratch, 1 ); + break; + + //Sparse Logistic Datasets Here + case 10: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/raw-data/real-sim/train_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/real-sim/train_vec.txt", + "/home/skylasa/solvers/dataset/raw-data/real-sim/test_mat.txt", + "/home/skylasa/solvers/dataset/raw-data/real-sim/test_vec.txt", + &forestData, &scratch, 1 ); + break; + case 20: + readNewsgroupsDataset ( + "/home/skylasa/solvers/dataset/normalized-data/real-sim/train_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/real-sim/train_vec.txt", + "/home/skylasa/solvers/dataset/normalized-data/real-sim/test_mat.txt", + "/home/skylasa/solvers/dataset/normalized-data/real-sim/test_vec.txt", + &forestData, &scratch, 1 ); + break; + } + + #ifdef __debug__ + fprintf( stderr, "Done with initialization of the dataset .... \n"); + fprintf( stderr, "Blocks for %d data points... 
\n", forestData.rows); + #endif + + compute_blocks (&BLOCKS, &BLOCK_SIZE, forestData.trainSize); + compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); + if (BLOCKS_POW_2 < 32) BLOCKS_POW_2 = 32; + #ifdef __debug__ + fprintf ( stderr, "Blocks: %d, BlockSize: %d, Power_2: %d\n", BLOCKS, BLOCK_SIZE, BLOCKS_POW_2); + #endif + + + // Move the data to the Device. + if ((forestData.trainSet == NULL) && (forestData.testSet == NULL)) + { + initialize_device_data_sparse( &forestData, &devData ); + initMatDescriptors ( &devData ); + convertToCSR ( &devData, scratch.devWorkspace ); + + initMatDescriptorsForSampling( &devData ); + initMatDescriptorsForSparseSampling( &devData ); + } else { + initialize_device_data( &forestData, &devData ); + initMatDescriptorsForSampling( &devData ); + } + + #ifdef __debug__ + fprintf( stderr, "Inittialized the Device with the dataset ... \n"); + #endif + + //Train the dataset here. + params.max_iterations = 100; + params.tolerance = 1e-5; + params.iflag = 0; + + params.lambda = l; + params.max_cg_iterations = max_cg_iterations; + params.cg_tolerance = cg_tolerance; + + if (GRADIENT_SAMPLING_SIZE == 100) + params.gx_sampling = 0; + else + params.gx_sampling = sampling_flag; + params.hx_sampling = sampling_flag; + +fprintf( stderr, " Gradient Sample: %d, Hessian Sample: %d \n", devData.gradientSampleSize, devData.hessianSampleSize ); + if (sampling_flag == 0) { + + devData.gradientSampleSize = 0; + devData.hessianSampleSize = 0; + } + + fprintf( stderr, "Start of TestCase: %d\n", test_case_no); + trainingTime_s = Get_Time (); + /* + if (forestData.numclasses == 1) + nConIterations = newton_cg( &forestData, &devData, ¶ms, &scratch ); + else + */ + nConIterations = newton_cg_multi_optimized( &forestData, &devData, ¶ms, &scratch); + trainingTime_t = Get_Timing_Info( trainingTime_s ); + #ifdef __debug__ + fprintf( stderr, "Done with training .... \n"); + #endif + + //exit (-1); + + //Predict the testing set here. 
+ real accuracy = 0;
+ classificationTime_s = Get_Time ();
+ /*
+ if (forestData.numclasses == 1) {
+ logistic_regression_predict( devData.testSet, &devData.spTest, devData.weights, devData.testLabels,
+ forestData.testLabels, forestData.testSize, forestData.cols,
+ &accuracy, scratch.devWorkspace, scratch.hostWorkspace );
+ } else {
+ */
+ accuracy = softmax_predict(&devData.spTest, devData.testSet, forestData.testLabels,
+ devData.weights, devData.testSize, devData.cols, devData.numclasses,
+ scratch.hostWorkspace, scratch.devWorkspace, 1, forestData.testSet);
+ //}
+ classificationTime_t = Get_Timing_Info( classificationTime_s );
+ //fprintf( stderr, "Start of TestCase: %d\n", test_case_no);
+ fprintf( stderr, "Dataset: %d \n", DATASET_TYPE );
+ fprintf( stderr, "NumClasses: %d\n", devData.numclasses );
+ fprintf( stderr, "Lambda: %e\n", params.lambda );
+ fprintf( stderr, "NewtonIterations: %d\n", params.max_iterations);
+ fprintf( stderr, "NewtonTolerance: %e\n", params.tolerance);
+ fprintf( stderr, "CGIterations: %d\n", params.max_cg_iterations);
+ fprintf( stderr, "CGTolerance: %e\n", params.cg_tolerance );
+ fprintf( stderr, "DataSetSize: %d\n", forestData.rows );
+ fprintf( stderr, "TrainingSize: %d\n", forestData.trainSize);
+ fprintf( stderr, "Features: %d\n", forestData.cols );
+ fprintf( stderr, "TrainingTime: %d\n", (unsigned int)(trainingTime_t * 1000) );
+ fprintf( stderr, "TestingSize: %d\n", forestData.testSize );
+ fprintf( stderr, "ClassificationTime: %d\n", (unsigned int)(classificationTime_t*1000) );
+ fprintf( stderr, "TestAccuracy: %3.2f\n", accuracy );
+ fprintf( stderr, "NewtonIterationsCon: %d\n", nConIterations );
+ fprintf( stderr, "NewtonConvergence: %d\n", (int)params.iflag );
+ fprintf( stderr, "End of TestCase: %d\n", test_case_no);
+ fprintf( stderr, "\n\n\n");
+
+ //cleanup the dataset pointers here.
+ cleanup_dataset(&forestData, &devData );
+
+ test_case_no ++;
+
+ //Cleanup host/device Here.
+ cuda_env_cleanup(&scratch);
+
+ return 0;
+}
diff --git a/code/cuda/RC-FINAL-5/newton_cg.c b/code/cuda/RC-FINAL-5/newton_cg.c
new file mode 100644
index 0000000..15835fa
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/newton_cg.c
@@ -0,0 +1,425 @@
+#include <stdio.h>
+
+#include "logistic_fn_indicator.h"
+#include "cuda_utils.h"
+#include "conjugate_gradient.h"
+#include "linesearch.h"
+
+#include "print_utils.h"
+#include "utils.h"
+
+#include "softmax_multiclass.h"
+#include "subsampling_helpers.h"
+#include "sparse_dataset.h"
+
+#define ALLOTED_TIME (120 * 60)
+
+int newton_cg( ForestDataset *host, DeviceDataset *data, NEWTON_CG_PARAMS *params, SCRATCH_AREA *scratch){
+
+ int iterations, cg_iterations;
+ real snorm, gxnorm, rel_residual, best_rel_residual;
+ real alpha, alphak;
+
+ real train_accuracy, test_accuracy;
+ real iteration_start, iteration_total, simulation_total;
+
+ //device
+ real *devPtr = (real *)scratch->devWorkspace;
+ real *xx = devPtr;
+ real *s = xx + data->cols;
+ real *s_best = s + data->cols;
+ real *gradient = s_best + data->cols;
+ //real *hessian = gradient+ data->cols;
+ //real *nextDevPtr = hessian + (data->cols * data->cols);
+ real *nextDevPtr = gradient + data->cols;
+
+ real *nextHostPtr = (real *)scratch->hostWorkspace;
+
+ //pageLock
+ real *train_function, *test_function;
+ train_function = scratch->pageLckWorkspace;
+ test_function = & (scratch->pageLckWorkspace[1] );
+
+ //Subsampling here.
+ //extract the subsampled gradient here.
+ fprintf( stderr, "Running the Logistic Regression..... 
solver, %d, %d, %d, %d \n", data->rows, data->cols, data->numclasses, data->testSize); + + //1. get the hessian and gradient. + if (params->gx_sampling >= 1) { + + data->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * data->rows) / 100; + + if (data->trainSet != NULL && data->testSet != NULL) { + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols, + data->numclasses, data->sampledGradientTrainSet, data->gradientSampleSize); + } else { + //handle sparse datasets here. + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, data->numclasses, + &data->spSampledGradientTrain, data->gradientSampleSize ); + fprintf( stderr, "Done extracting the sparse dataset ..... \n"); + } + logistic_fn_indicator_gx( data->sampledGradientTrainSet, &data->spSampledGradientTrain, data->sampledGradientTrainLabels, + data->weights, params->lambda, data->gradientSampleSize, data->cols, gradient, + nextDevPtr, nextHostPtr, params->gx_sampling, data->rows ); + + } else { + logistic_fn_indicator_gx( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, + data->rows, data->cols, gradient, nextDevPtr, nextHostPtr, params->gx_sampling, data->rows); + } + + //norm of gradient. + cublasCheckError( cublasDnrm2( cublasHandle, data->cols, gradient, 1, &gxnorm )); + + iterations = 0; + snorm = 100; + //gxnorm = 100; + + rel_residual = 0; + best_rel_residual = 0; + train_accuracy = 0; + *train_function = 0; + test_accuracy = 0; + *test_function = 0; + iteration_total = 0; + simulation_total = 0; + +#ifdef __debug__ + fprintf( stderr, "iteration \t norm(gradient) \t Rel_Residual \t CG-ITERATIONS \t Train_Accu \t Obj_Val_Train \t Test_Accu \t Obj_Val_Test \n"); + + logistic_regression_predict( data->trainSet, &data->spTrain, data->weights, data->trainLabels, + host->trainLabels, host->trainSize, host->cols, + &train_accuracy, nextDevPtr, nextHostPtr ); + logistic_regression_predict( data->testSet, &data->spTest, data->weights, data->testLabels, + host->testLabels, host->testSize, host->cols, + &test_accuracy, nextDevPtr, nextHostPtr ); + + logistic_fn_indicator( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, data->rows, data->cols, train_function, nextDevPtr, nextHostPtr); + logistic_fn_indicator( data->testSet, &data->spTest, data->testLabels, data->weights, params->lambda, data->testSize, data->cols, test_function, nextDevPtr, nextHostPtr); + + fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n", + iterations, gxnorm, rel_residual, 0, train_accuracy, *train_function, + test_accuracy, *test_function, (unsigned int)(iteration_total * 1000) ); +#endif + + while (iterations < params->max_iterations){ + + iteration_start = Get_Time( ); + + //alpha = -1.; + //cublasCheckError ( cublasDscal( cublasHandle, data->cols, &alpha, gradient, 1) ); + + //conjugate gradient to solve Hx = gradient here. 
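+ // One Newton-CG step: CG approximately solves the Newton system H * s = -g
+ // for the step s using only Hessian-vector products, so H is never formed
+ // explicitly (the sign convention is presumably handled inside
+ // Cublas_CG_Logistic); cg_linesearch then backtracks to a step size alphak
+ // (the 0.5 and 1e-6 arguments are presumably the shrink factor and the
+ // sufficient-decrease constant), and the update is w <- w + alphak * s_best.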
+ cuda_memset( s_best, 0, data->cols, ERROR_MEM_SET ); + cuda_memset( s, 0, data->cols, ERROR_MEM_SET ); + cg_iterations = Cublas_CG_Logistic( data, params, gradient, s, s_best, &rel_residual, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace ); + + alphak = cg_linesearch( s_best, data->weights, 0.5, 1e-6, &data->spTrain, + (real *)data->trainSet, (real *)data->trainLabels, + params->lambda, data->rows, data->cols, data->numclasses, + gradient, xx, nextDevPtr, nextHostPtr, (real *)scratch->pageLckWorkspace); + +//fprintf( stderr, "alphaK --> %e \n", alphak ); + + alpha = alphak; + cublasCheckError( cublasDaxpy( cublasHandle, data->cols, &alpha, s_best, 1, data->weights, 1) ); + + if (params->gx_sampling >= 1) { + if (data->trainSet != NULL && data->testSet != NULL) { + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols, + data->numclasses, data->sampledGradientTrainSet, data->gradientSampleSize); + } else { + //handle sparse datasets here. + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, data->numclasses, + &data->spSampledGradientTrain, data->gradientSampleSize ); + } + logistic_fn_indicator_gx( data->sampledGradientTrainSet, &data->spSampledGradientTrain, data->sampledGradientTrainLabels, + data->weights, params->lambda, data->gradientSampleSize, data->cols, gradient, + nextDevPtr, nextHostPtr, params->gx_sampling, data->rows ); + + } else { + logistic_fn_indicator_gx( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, + data->rows, data->cols, gradient, nextDevPtr, nextHostPtr, params->gx_sampling, data->rows); + } + + cublasCheckError( cublasDnrm2( cublasHandle, data->cols, gradient, 1, &gxnorm )); + cublasCheckError( cublasDnrm2( cublasHandle, data->cols, s_best, 1, &snorm)); + +#ifdef __debug__ + + iteration_total = Get_Timing_Info( iteration_start ); + simulation_total += iteration_total; + + logistic_regression_predict( data->trainSet, &data->spTrain, data->weights, data->trainLabels, + host->trainLabels, host->trainSize, host->cols, + &train_accuracy, nextDevPtr, nextHostPtr ); + logistic_regression_predict( data->testSet, &data->spTest, data->weights, data->testLabels, + host->testLabels, host->testSize, host->cols, + &test_accuracy, nextDevPtr, nextHostPtr ); + + logistic_fn_indicator( data->trainSet, &data->spTrain, data->trainLabels, data->weights, params->lambda, + data->rows, data->cols, train_function, nextDevPtr, nextHostPtr); + logistic_fn_indicator( data->testSet, &data->spTest, data->testLabels, data->weights, params->lambda, + data->testSize, data->cols, test_function, nextDevPtr, nextHostPtr); + + fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n", + iterations+1, gxnorm, rel_residual, cg_iterations, train_accuracy, *train_function, + test_accuracy, *test_function, (unsigned int)(iteration_total * 1000) ); + +#endif + + iterations ++; + if (gxnorm <= params->tolerance) break; + + if (((unsigned 
int)(simulation_total)) >= ALLOTED_TIME ) {
+ fprintf( stderr, "Exceeded the time allotted for the simulation: %d of %d seconds \n", ((unsigned int)(simulation_total )), ALLOTED_TIME );
+ break;
+ }
+ }
+
+ if (gxnorm >= params->tolerance)
+ params->iflag = 1;
+
+ return iterations;
+}
+
+
+
+int newton_cg_multi_optimized( ForestDataset *host, DeviceDataset *data, NEWTON_CG_PARAMS *params, SCRATCH_AREA *scratch){
+
+ int iterations, cg_iterations;
+ real snorm, gxnorm, rel_residual;
+ real alpha, alphak;
+
+ real best_rel_residual;
+
+#ifdef __STATISTICS__
+ //statistics here.
+ real train_accuracy, train_function;
+ real test_accuracy, test_function;
+ real iteration_start, iteration_total, simulation_total;
+#endif
+
+ int classes_to_solve = data->numclasses;
+
+ //device
+ real *xx = (real *)scratch->devWorkspace;
+ real *s = xx + data->cols * classes_to_solve;
+ real *s_best = s + data->cols * classes_to_solve;
+
+ //auxiliary storage
+ real *gradient = s_best + data->cols * classes_to_solve;
+ real *Hv = gradient + data->cols * classes_to_solve;
+ real *HXW = Hv + classes_to_solve * data->cols;
+ //real *expSumVec = XW + rows * classes_to_solve;
+
+ //scratch area
+ real *nextDevPtr = HXW + data->rows* classes_to_solve;
+ real *nextHostPtr = (real *)scratch->hostWorkspace;
+ real *nextPageLckPtr = (real *) scratch->pageLckWorkspace;
+
+
+ //1. get the hessian and gradient.
+ if (params->hx_sampling >= 1)
+ data->hessianSampleSize = (HESSIAN_SAMPLING_SIZE * data->rows)/100;
+
+ if (params->gx_sampling >= 1) {
+
+ data->gradientSampleSize = (GRADIENT_SAMPLING_SIZE * data->rows) / 100;
+ data->spGradientSample.nnz = data->gradientSampleSize;
+
+
+ if (data->trainSet != NULL && data->testSet != NULL) {
+ prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels,
+ data->rows, data->gradientSampleSize, (int *)nextHostPtr);
+ convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr );
+ sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols,
+ classes_to_solve, data->sampledGradientTrainSet, data->gradientSampleSize);
+ } else {
+ //handle sparse datasets here. 
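+ // Sparse variant of the same scheme: a CSR row-selection matrix picks
+ // gradientSampleSize rows of the training set, so the sampled gradient
+ // approximates the full one, g_S(w) ~= sum over sampled i of grad f_i(w)
+ // + lambda * w (any 1/|S| rescaling is assumed to live inside the
+ // *_gx_subsampled routines).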
+ prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, classes_to_solve, + &data->spSampledGradientTrain, data->gradientSampleSize ); + } + + softmax_multiclass_gx_subsampled(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace, + &data->spGradientSample, data->sampledGradientTrainSet, &data->spSampledGradientTrain, + data->sampledGradientTrainLabels, data->gradientSampleSize, params->gx_sampling); + printVector( gradient, 10, NULL ); + + } else { + computeHXW(&data->spTrain, data->trainSet, data->rows, data->cols, classes_to_solve, data->weights, HXW, 0 ); + + softmax_multiclass_gx_optimized(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, HXW, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace); + } + //printVector( gradient, 20, NULL ); + /* + softmax_multiclass_gx(data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace); + */ + + //2. Initialization Here. + iterations = 0; + snorm = 100; + gxnorm = 100; + rel_residual = 100; + + cublasCheckError( cublasDnrm2( cublasHandle, classes_to_solve * data->cols, gradient, 1, &gxnorm )); + +#ifdef __STATISTICS__ + iteration_total = 0; + simulation_total = 0; + + test_function = softmax_multiclass_fx (&data->spTest, data->testSet, data->testLabels, data->testSize, data->cols, + classes_to_solve, data->weights, params->lambda, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace ); + train_function = softmax_multiclass_fx (&data->spTrain, data->trainSet, data->trainLabels, host->trainSize, data->cols, + classes_to_solve, data->weights, params->lambda, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace ); + + + test_accuracy = softmax_predict(&data->spTest, data->testSet, host->testLabels, data->weights, data->testSize, + data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL); + train_accuracy = softmax_predict( &data->spTrain, data->trainSet, host->trainLabels, data->weights, host->trainSize, + data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL ); + + + fprintf( stderr, "iteration \t norm(gradient) \t Rel_Residual \t CG-ITERATIONS \t Train_Accu \t Obj_Val_Train \t Test_Accu \t Obj_Val_Test \n"); + fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n", + iterations, gxnorm, rel_residual, 0, train_accuracy, train_function, + test_accuracy, test_function, (unsigned int)(iteration_total * 1000) ); + +#endif + + while (iterations < params->max_iterations){ + +#ifdef __STATISTICS__ + //statistics Here. 
+ iteration_start = Get_Time( ); +#endif + //negative gradient + alpha = -1.; + cublasCheckError ( cublasDscal( cublasHandle, classes_to_solve * data->cols, &alpha, gradient, 1) ); + + cuda_memset( s, 0, classes_to_solve * data->cols * sizeof(real), ERROR_MEM_SET ); + cuda_memset( s_best, 0, classes_to_solve * data->cols * sizeof(real), ERROR_MEM_SET ); + + cg_iterations = Cublas_CG_multi_optimized( &data->spTrain, data->trainSet, gradient, data->weights, s, s_best, params->lambda, + data->rows, data->cols, classes_to_solve, HXW, + nextDevPtr, nextHostPtr, scratch->pageLckWorkspace, + params->max_cg_iterations, params->cg_tolerance, &rel_residual, &best_rel_residual, + &data->spHessianSample, data->sampledHessianTrainSet, + &data->spSampledHessianTrain, data->hessianSampleSize, params->hx_sampling); + + //compute the relative residual here. + // || H*x - g || / || g || + cublasCheckError( cublasDnrm2( cublasHandle, classes_to_solve * data->cols, gradient, 1, &gxnorm )); + + //change gradient back + alpha = -1.; + cublasCheckError ( cublasDscal( cublasHandle, classes_to_solve * data->cols, &alpha, gradient, 1) ); + alphak = cg_linesearch( s_best, data->weights, 0.5, 1e-6, &data->spTrain, (real *)data->trainSet, (real *)data->trainLabels, + params->lambda, data->rows, data->cols, classes_to_solve, gradient, xx, + nextDevPtr, nextHostPtr, (real *)scratch->pageLckWorkspace); + + alpha = alphak; + cublasCheckError( cublasDaxpy( cublasHandle, classes_to_solve * data->cols, &alpha, s_best, 1, data->weights, 1) ); + + + + if (params->gx_sampling >= 1) { + + if (data->trainSet != NULL && data->testSet != NULL) { + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + sampleDataset(&data->spGradientSample, data->trainSet, data->rows, data->cols, + classes_to_solve, data->sampledGradientTrainSet, data->gradientSampleSize); + } else { + //handle sparse datasets here. + prepareForSampling( &data->spGradientSample, data->sampledGradientTrainLabels, data->trainLabels, + data->rows, data->gradientSampleSize, (int *)nextHostPtr); + convertGradientSampleToCSR( &data->spGradientSample, data->gradientSampleSize, data->cols, nextDevPtr ); + + sampleSparseDataset( &data->spGradientSample, &data->spTrain, + data->rows, data->cols, classes_to_solve, + &data->spSampledGradientTrain, data->gradientSampleSize ); + } + + softmax_multiclass_gx_subsampled(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols, + classes_to_solve, data->weights, params->lambda, + gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace, + &data->spGradientSample, data->sampledGradientTrainSet, &data->spSampledGradientTrain, + data->sampledGradientTrainLabels, data->gradientSampleSize, params->gx_sampling); + + } else { + //update here. 
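+ // Full-gradient path: computeHXW materializes the row-wise class
+ // probabilities P = softmax(X * W), from which the softmax gradient is
+ // assembled; per class k it takes the standard form
+ // g_k = X' * (P_k - 1{y == k}) + lambda * w_k,
+ // which the optimized routine below is assumed to follow.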
+ computeHXW( &data->spTrain, data->trainSet, data->rows, data->cols, classes_to_solve, data->weights, HXW, 0 );
+
+ softmax_multiclass_gx_optimized(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols,
+ classes_to_solve, data->weights, params->lambda, HXW,
+ gradient, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace);
+ }
+
+#ifdef __STATISTICS__
+ iteration_total = Get_Timing_Info( iteration_start );
+ simulation_total += iteration_total;
+ //fprintf( stderr, "Total time per iteration ---- > %f \n", iteration_total );
+
+ //per iteration statistics here.
+ test_accuracy = softmax_predict(&data->spTest, data->testSet, host->testLabels, data->weights, data->testSize,
+ data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL);
+ train_accuracy = softmax_predict( &data->spTrain, data->trainSet, host->trainLabels, data->weights, host->trainSize,
+ data->cols, classes_to_solve, nextHostPtr, nextDevPtr, 1, NULL );
+ test_function = softmax_multiclass_fx(&data->spTest, data->testSet, data->testLabels, data->testSize, data->cols,
+ classes_to_solve, data->weights, params->lambda, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace );
+ train_function = softmax_multiclass_fx(&data->spTrain, data->trainSet, data->trainLabels, data->rows, data->cols,
+ classes_to_solve, data->weights, params->lambda, nextDevPtr, nextHostPtr, scratch->pageLckWorkspace );
+
+ fprintf( stderr, "%9d \t %e \t %e \t %d \t %3.2f \t %e \t %3.2f \t %e \t %d\n",
+ iterations+1, gxnorm, rel_residual, cg_iterations,
+ train_accuracy, train_function, test_accuracy, test_function, (unsigned int)(iteration_total * 1000) );
+#endif
+
+ iterations ++;
+ if (gxnorm <= params->tolerance) break;
+
+ if (((unsigned int)(simulation_total )) >= ALLOTED_TIME ) {
+ fprintf( stderr, "Exceeded the time allotted for the simulation: %d of %d seconds \n", ((unsigned int)(simulation_total )), ALLOTED_TIME );
+ break;
+ }
+ }
+
+ if (gxnorm >= params->tolerance)
+ params->iflag = 1;
+
+ return iterations;
+}
diff --git a/code/cuda/RC-FINAL-5/newton_cg.h b/code/cuda/RC-FINAL-5/newton_cg.h
new file mode 100644
index 0000000..d3af91f
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/newton_cg.h
@@ -0,0 +1,25 @@
+#ifndef __H_NEWTON_CG__
+#define __H_NEWTON_CG__
+
+#include "cuda_types.h"
+#include "dataset.h"
+
+typedef struct cg_params{
+ int max_iterations;
+ int max_cg_iterations;
+ real tolerance;
+ real cg_tolerance;
+ real iflag;
+ real lambda;
+
+ //Subsampling
+ int gx_sampling;
+ int hx_sampling;
+
+} NEWTON_CG_PARAMS;
+
+int newton_cg( ForestDataset *, DeviceDataset *, NEWTON_CG_PARAMS *, SCRATCH_AREA *);
+int newton_cg_multi_optimized( ForestDataset *host, DeviceDataset *data, NEWTON_CG_PARAMS *params, SCRATCH_AREA *scratch );
+
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/notes.txt b/code/cuda/RC-FINAL-5/notes.txt
new file mode 100644
index 0000000..5932e09
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/notes.txt
@@ -0,0 +1,6 @@
+This is branched from rc-beta.
+rc-beta is brought in from the newton.cs.purdue.edu machine.
+rc-beta is the first release version, which fixed issues with the subsampling bug.
+rc-beta and rc-alpha suffer from the same problem on the mushrooms dataset, where,
+with subsampling/nonuniform sampling for 100 iterations, the cost function actually
+increases. This needs to be looked into. The MATLAB version works fine. 
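+A plausible first check for the increasing-cost issue (untested suggestion):
+evaluate the full and the sub-sampled gradient at the same fixed weight vector
+and compare them; a systematic mismatch beyond sampling noise would point at
+the scaling of the sampled sum (1/|S| vs 1/n) or at the CSR row selection.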
diff --git a/code/cuda/RC-FINAL-5/print_utils.c b/code/cuda/RC-FINAL-5/print_utils.c new file mode 100644 index 0000000..19c093b --- /dev/null +++ b/code/cuda/RC-FINAL-5/print_utils.c @@ -0,0 +1,206 @@ +#include "print_utils.h" +#include "cuda_utils.h" + +#include "string.h" + +real computeWeightSum( real *src, int len){ + real *t = (real *)dscratch; + copy_host_device( t, src, len * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + real s = 0; + + for (int i=0 ; i < len; i ++) s += t[i]; + return s; +} + +void printVector( real *src, int c, real *r){ + real *t = (real *)dscratch; + int count = c;// > 20 ? 20 : c; + copy_host_device( t, src, c * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + + for (int i = 0; i < count; i ++){ + if ((i % 20 == 0) && (i != 0)) fprintf (stderr, "\n"); + fprintf( stderr, " %e ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void printCustomVector( real *src, int c, int jump){ + real *t = (real *)dscratch; + int count = c; + copy_host_device( t, src, c * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + + for (int i = 0; i < count; i += jump){ + fprintf( stderr, " %f ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void printIntVector( int *src, int c, int *r){ + int *t = (int *)dscratch; + int count = c;// > 20 ? 20 : c; + copy_host_device( t, src, c * sizeof(int), cudaMemcpyDeviceToHost, ERROR_DEBUG); + + for (int i = 0; i < count; i ++){ + if ((i % 20 == 0) && (i != 0)) fprintf (stderr, "\n"); + fprintf( stderr, " %d ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void printHostVector( real *src, int c ){ + real *t = src; + int count = c;// > 20 ? 20 : c; + + for (int i = 0; i < count; i ++){ + if ((i % 20 == 0) && (i != 0)) fprintf (stderr, "\n"); + fprintf( stderr, " %e ", t[i] ); + } + fprintf (stderr, "\n"); +} + +void writeMatrix ( real *mat, int rows ) +{ + FILE *dataset_file; + real *t = (real *) dscratch; + + if ( (dataset_file = fopen("./hessian.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... !\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, mat, rows * rows * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < rows; i ++){ + fprintf (dataset_file, "%6.2f", t[ i * rows ] ); + for (int j = 1; j < rows; j ++){ + fprintf( dataset_file, ",%6.2f", t[ i * rows + j ] ); + } + fprintf( dataset_file, "\n"); + } + fclose (dataset_file); +} + +void writeSparseMatrix (real *dataPtr, int *rowIndex, int *colIndex, int m, int n, int nnz ) +{ + FILE *dataset_file; + int *t = (int *) dscratch; + real *t1 = (real *) dscratch; + + if ( (dataset_file = fopen("./rowindex.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... !\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, rowIndex, sizeof(int) * (m + 1), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < m + 1; i ++){ + fprintf( dataset_file, "%d\n", t[ i ] ); + } + fclose (dataset_file); + + if ( (dataset_file = fopen("./colindex.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... 
!\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, colIndex, sizeof(int) * (nnz), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < nnz; i ++){ + fprintf( dataset_file, "%d\n", t[ i ] ); + } + fclose (dataset_file); + + if ( (dataset_file = fopen("./data.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the hessian.... !\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t1, dataPtr, sizeof(real) * (nnz), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < nnz; i ++){ + fprintf( dataset_file, "%6.10f\n", t1[ i ] ); + } + fclose (dataset_file); +} + +void writeVector ( real *mat, int rows, char *file, int hostData ) +{ + FILE *dataset_file; + real *t = (real *) dscratch; + + if ( (dataset_file = fopen( file, "w")) == NULL ) { + fprintf( stderr, "Error opening the path .... !\n" ); + exit( -1 ); + } + + if (hostData == 1) { + t = mat; + } else { + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, mat, rows * sizeof(real), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + } + + for (int i = 0; i < rows; i ++){ + fprintf( dataset_file, "%e\n", t[ i ] ); + } + //fprintf( dataset_file, "\n"); + fclose (dataset_file); +} + +int readVector( real *vec, int rows, char *file, int offset ){ + FILE *handle; + char line[1024]; + int index = 0; + char *word; + + if ( (handle= fopen( file, "r" )) == NULL ) { + fprintf( stderr, "Error opening the path... \n"); + exit(-1); + } + + index = 0; + while (!feof( handle )){ + memset( line, 0, 1024); + fgets( line, 1024, handle); + if (line[0] == 0) break; + + word = strtok( line, "\n"); + vec[ index ++ ] = atof( word ) + offset; + + if (index >= rows) break; + } + fclose( handle ); + + return index; +} + +void writeIntVector ( int *mat, int rows ) +{ + FILE *dataset_file; + int *t = (int *) dscratch; + + if ( (dataset_file = fopen( "./vector.txt", "w")) == NULL ) { + fprintf( stderr, "Error opening the path .... 
!\n" ); + exit( -1 ); + } + + fprintf (stderr, "Copying data to host \n"); + copy_host_device( t, mat, rows * sizeof(int), cudaMemcpyDeviceToHost, ERROR_DEBUG); + fprintf (stderr, "Done Copying data to host \n"); + + for (int i = 0; i < rows; i ++){ + fprintf( dataset_file, "%d\n", t[ i ] ); + } + //fprintf( dataset_file, "\n"); + fclose (dataset_file); +} + diff --git a/code/cuda/RC-FINAL-5/print_utils.h b/code/cuda/RC-FINAL-5/print_utils.h new file mode 100644 index 0000000..8080f86 --- /dev/null +++ b/code/cuda/RC-FINAL-5/print_utils.h @@ -0,0 +1,21 @@ +#ifndef __H_PRINT_UTILS__ +#define __H_PRINT_UTILS__ + +#include "cuda_types.h" + +void printVector( real *src, int s, real *t ); +void printCustomVector( real *src, int s, int jump ); +void printIntVector( int *src, int s, int *t ); +void printHostVector( real *src, int s ); +void writeMatrix (real *mat, int c); +void writeVector (real *mat, int c, char *file, int ); +void writeIntVector (int *mat, int c ); +void writeSparseMatrix (real *dataPtr, int *rowIndex, int *colIndex, int m, int n, int nnz ); + +real computeWeightSum( real *weights, int len ); + +int readVector( real *vec, int rows, char *file, int offset ); + + + +#endif diff --git a/code/cuda/RC-FINAL-5/readMatVec.cc b/code/cuda/RC-FINAL-5/readMatVec.cc new file mode 100644 index 0000000..575cb78 --- /dev/null +++ b/code/cuda/RC-FINAL-5/readMatVec.cc @@ -0,0 +1,130 @@ +#ifndef __H_MATVEC__ +#define __H_MATVEC__ + +#include "readMatVec.h" +#include + +#define MAX_LINE 1024 + +void readMatVec( char *matrixPath, char *vectorPath, double **matrix, double **vector, int *N){ + + //read the CSV file here and create + //matrix and vector files and pass + //them back to the main file + FILE *matFile; + char line[MAX_LINE]; + int numLines = 0; + int index = 0; + double *fileMatrix; + double *fileVector; + + if ( (matFile = fopen(matrixPath, "r")) == NULL ) { + fprintf( stderr, "Error opening the pdb file!\n" ); + exit( -1 ); + } + + while (!feof( matFile ) ){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, matFile ); + if (line[0] == 0) break; + numLines ++; + } + fprintf( stderr, " Number of lines read: %d \n", numLines ); + + *N = numLines; + fileMatrix = (double *) malloc( sizeof(double) * (numLines) * (numLines) ); + + //read the file here and fill the matrix. + rewind( matFile ); + while (!feof( matFile )){ + memset( line, 0, MAX_LINE ); + fgets( line, MAX_LINE, matFile); + if (line[0] == 0) break; + tokenize( line, fileMatrix, &index ); + } + + fclose( matFile ); + fprintf( stderr, "Number of elements: %d\n", index ); + + //read teh vector here. 
+ fileVector = (double *) malloc (sizeof(double) * (numLines) );
+ if ( (matFile = fopen(vectorPath, "r")) == NULL ) {
+ fprintf( stderr, "Error opening the vector file!\n" );
+ exit( -1 );
+ }
+
+ index = 0;
+ while (!feof( matFile )){
+ memset( line, 0, MAX_LINE );
+ fgets( line, MAX_LINE, matFile);
+ if (line[0] == 0) break;
+ fileVector[index ++] = atof( line );
+ //fprintf (stderr, "%s --> %f\n", line, atof(line) );
+ }
+ fprintf( stderr, "------------------\n");
+ fclose( matFile );
+ fprintf( stderr, "Number of elements: %d\n", index );
+
+ *matrix = fileMatrix;
+ *vector = fileVector;
+}
+
+void tokenize( char *line, double *matrix, int* index){
+ char *sep = ", \n";
+ char *word;
+ char temp[MAX_LINE];
+
+ strncpy( temp, line, MAX_LINE );
+ for( word = strtok(temp, sep); word; word = strtok(NULL, sep) )
+ matrix[ (*index) ++ ] = atof( word );
+}
+
+void tokenize_count( char *line, int* index){
+ char *sep = ", \n";
+ char *word;
+ char temp[MAX_LINE];
+
+ strncpy( temp, line, MAX_LINE );
+ for( word = strtok(temp, sep); word; word = strtok(NULL, sep) )
+ (*index) ++;
+}
+
+void readVec( char *vectorPath, double **vector, int *N){
+
+ FILE *matFile;
+ char line[MAX_LINE];
+ int numLines = 0;
+ int index = 0;
+ double *fileVector;
+
+ if ( (matFile = fopen(vectorPath, "r")) == NULL ) {
+ fprintf( stderr, "Error opening the vector file!\n" );
+ exit( -1 );
+ }
+
+ while (!feof( matFile ) ){
+ memset( line, 0, MAX_LINE );
+ fgets( line, MAX_LINE, matFile );
+ if (line[0] == 0) break;
+ tokenize_count( line, &numLines );
+ break;
+ }
+ fprintf( stderr, " Number of entries in the first line: %d \n", numLines );
+
+ *N = numLines;
+ fileVector = (double *) malloc( sizeof(double) * (numLines) );
+
+ //read the first line here and fill the vector.
+ rewind( matFile );
+ while (!feof( matFile )){
+ memset( line, 0, MAX_LINE );
+ fgets( line, MAX_LINE, matFile);
+ if (line[0] == 0) break;
+ tokenize( line, fileVector, &index );
+ break;
+ }
+
+ fclose( matFile );
+ fprintf( stderr, "Number of elements: %d\n", index );
+
+ *vector = fileVector;
+}
+#endif
diff --git a/code/cuda/RC-FINAL-5/readMatVec.h b/code/cuda/RC-FINAL-5/readMatVec.h
new file mode 100644
index 0000000..b299c33
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/readMatVec.h
@@ -0,0 +1,13 @@
+#ifndef __H_READ_MATRIX__
+#define __H_READ_MATRIX__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+void tokenize( char *, double *, int* );
+void readMatVec( char *, char *, double **, double **, int *);
+void readVec( char *, double **, int *);
+
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/softmax_multiclass.cu b/code/cuda/RC-FINAL-5/softmax_multiclass.cu
new file mode 100644
index 0000000..b588c30
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/softmax_multiclass.cu
@@ -0,0 +1,2208 @@
+#include "softmax_multiclass.h"
+#include "cuda_utils.h"
+
+#include "gen_random.h"
+#include "cuda_types.h"
+#include "print_utils.h"
+
+#include "classification_kernels.h"
+
+GLOBAL void ker_exp( real *results, int count)
+{
+ int idx = blockDim.x * blockIdx.x + threadIdx.x;
+ if (idx < count)
+ results[idx] = exp( (real)idx );
+}
+
+void expTest( real *results, int count, real *host){
+
+ ker_exp <<< 1, count>>> (results, count);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+__device__ __inline__ double my_shfl(double x, int lane)
+{
+ // Split the double number into 2 32b registers.
+ int lo, hi;
+ asm volatile( "mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(x));
+
+ // Shuffle the two 32b registers.
+ lo = __shfl_xor(lo, lane);
+ hi = __shfl_xor(hi, lane);
+
+ // Recreate the 64b number. 
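+ // (__shfl_xor moves 32-bit values on this architecture, hence the two
+ // halves; __hiloint2double below reassembles them. Paired with the offset
+ // loop in warpSum this is a butterfly reduction: after log2(32) = 5
+ // xor-shuffles every lane of the warp holds the full 32-lane sum.)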
+ //asm volatile( "mov.b64 %0, {%1,%2};" : "=d(x)" : "r"(lo), "r"(hi)); + //return x; + return __hiloint2double( hi, lo); +} + +__device__ __inline__ double warpSum( double x ) +{ + for (int offset = WARP_SIZE/2; offset > 0; offset /= 2) + x += my_shfl( x, offset); + return x; +} + + +GLOBAL void ker_add_regularizer ( real *input, real *vector, real lambda, int count, real normalizer) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) input[ idx ] += lambda * vector[ idx ] ; +} + + +/* +GLOBAL void reduce(const real *input, real *results, const size_t count) { + extern __shared__ real my_results[]; + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + real x = 0; + + sdata = 0; + my_results[ lane ] = 0; + if(idx < count) x = input [idx]; + sdata = x; + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) results [ blockIdx.x ] = sdata; +} + +*/ + +GLOBAL void reduce_vector_warp( const real *input, const real *maxdots, real *results, const size_t numcomps, int numblocks ) +{ + extern __shared__ real my_results[]; + + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + sdata = 0.; + + if (idx < numcomps ){ + for (int c = 0; c < numblocks; c ++) sdata += input [ c * numcomps + idx ]; + results[ idx ] = sdata + exp( -1. * maxdots[ idx ] ); + } +} + + +GLOBAL void reduce_vector_warp_mt( const real *input, const real *maxdots, real *results, const size_t numcomps, int numblocks ) +{ + unsigned int col = threadIdx.x >> 5; + unsigned int myRowId = (blockDim.x * blockIdx.x + threadIdx.x) / WARP_SIZE; + + real sdata; + real x = 0.; + + sdata = 0.; + x = 0.0; + if ((col < numblocks) && (myRowId < numcomps)) x = input[(col * numcomps) + myRowId ]; + sdata = x; + __syncthreads (); + + sdata = warpSum ( sdata ); + if ((col == 0) && (myRowId < numcomps)) + results [ myRowId ] = sdata + exp( -1 * maxdots[myRowId] ); +} + + +GLOBAL void reduce_vector_mt( const real *input, real *results, const size_t numcomps, const real normalizer, int numblocks ) +{ + extern __shared__ real my_results[]; + + unsigned int idx = threadIdx.x; + unsigned int lane = threadIdx.x >> 5; + unsigned int compOffset = blockIdx.x; + + real sdata; + real x = 0.; + + for (int i = compOffset; i < numcomps; i += gridDim.x){ + + sdata = 0.; + my_results[ lane ] = 0.; + x = 0.0; + if ((idx < numblocks) && (i < numcomps)) x = input[(idx * numcomps) + i ]; + sdata = x; + __syncthreads (); + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? 
my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if((threadIdx.x == 0) && (i < numcomps)) results [ i ] = sdata * normalizer; + __syncthreads (); + } +} + +GLOBAL void reduce_vector(const real *input, real *results, const size_t numclasses, const size_t cols, const real normalizer, int numblocks) +{ + extern __shared__ real my_results[]; + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + real x = 0.; + + for (int i = 0; i < numclasses * cols; i ++){ + sdata = 0.; + my_results[ lane ] = 0.; + x = 0.0; + if (idx < numblocks) x = input[idx * numclasses * cols + i]; + sdata = x; + __syncthreads (); + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) results [ i ] = sdata * normalizer; + __syncthreads (); + } +} + +GLOBAL void reduce_log(const real *input, real *results, const size_t count) { + extern __shared__ real my_results[]; + unsigned int lane = threadIdx.x >> 5; + unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; + + real sdata; + real x = 0; + + sdata = 0; + my_results[ lane ] = 0; + if(idx < count) x = log(input [idx] ); + sdata = x; + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) results [ blockIdx.x ] = sdata; +} + +GLOBAL void ker_compute_expsum( real *XW, int rows, int cols, int numclasses, + real *expSumVec, int threads_per_col) +{ + int myColId = ( blockIdx.x * blockDim.x + threadIdx.x ) % threads_per_col; + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ) / threads_per_col; + + //local Data. + real sdata = 0; + + for (int i = myRowId; i < rows; i += gridDim.x * blockDim.x ) + { + sdata = 0; + + for (int j = myColId; j < cols; j ++ ) sdata += exp ( XW[ j * rows + i ] ); + + //warp sum here. + for (int offset = threads_per_col/2; offset > 0; offset /= 2) + sdata += my_shfl( sdata, offset); + + if (myColId == 0) expSumVec[ i ] = sdata; + } +} + +/* + +GLOBAL void ker_init_scaleTerms ( real *scaleTerms, int sampleSize, real *probs, int *indices ) +{ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < sampleSize){ + scaleTerms[ myRowId ] = probs[ indices[ myRowId ] ] ; + } +} + + +GLOBAL void ker_compute_probs( real *probs, int rows, int sampleSize, real *randVec, real *indices) +{ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < rows ){ + probs[ myRowId ] *= sampleSize; + if (probs[ myRowId ] > 1.0) probs[ myRowId ] = 1.0; + + if (randVec[ myRowId ] < probs[ myRowId ] ) + indices[ myRowId ] = 1; + else + indices[ myRowId ] = 0; + } +} + +*/ + +GLOBAL void ker_compute_dHXW_nrm (real *dHXW, real *rowNrms, int rows, int numclasses) +{ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + + if (myRowId < rows) + { + for (int j = 0; j < numclasses; j += 1 ){ + dHXW[ j * rows + myRowId ] = abs( dHXW[ j * rows + myRowId ] * (1. 
- dHXW[ j * rows + myRowId ]) ) * rowNrms[ myRowId ]; + } + for (int j = 1; j < numclasses; j += 1 ){ + dHXW[ myRowId ] += dHXW[ j * rows + myRowId ]; + } + } +} + +/* + +GLOBAL void ker_normalize (real *dHXW, int rows, real *nrmConstant, real *probs ){ + int myRowId = blockIdx.x * blockDim.x + threadIdx.x; + if (myRowId < rows){ + probs[ myRowId ] = dHXW[ myRowId ] / nrmConstant[0]; + } +} + +GLOBAL void ker_row_norms( real *features, int rows, int numclasses, real *nrm ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + real sum = 0; + + if (myRowId < rows) { + i = myRowId; + for (int j = 0; j < numclasses; j += 1) + sum += pow( features[ j * rows + i ], 2.); + + nrm[ i ] = sqrt( sum ); + } +} + + +GLOBAL void ker_sqr_elements ( real *ptr, int nnz, int elems_per_thread, real *results ) +{ + int myID = blockIdx.x * blockDim.x + threadIdx.x ; + int i = 0; + + if (myID < nnz) { + i = myID; + //results[ i ] = ptr[ i ] * ptr[ i ]; + ptr[ i ] *= ptr[ i ]; + } + +} + +GLOBAL void ker_sqrt_elements (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = sqrt( ptr[ i ] ); + } +} + +GLOBAL void ker_init_ones (real *ptr, int count ) +{ + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ); + int i = 0; + + if (myRowId < count ){ + i = myRowId; + ptr[ i ] = 1.0; + } +} + +*/ + + +GLOBAL void ker_compute_HXW( real *XW, int rows, int cols, int numclasses, int threads_per_col ) +{ + int myColId = ( blockIdx.x * blockDim.x + threadIdx.x ) % threads_per_col; + int myRowId = ( blockIdx.x * blockDim.x + threadIdx.x ) / threads_per_col; + int myWarpId = (blockIdx.x * blockDim.x + threadIdx.x ) % WARP_SIZE; + + real sdata = 0; + int i = 0; + + real maxdot = 0; + + //for (int i = myRowId; i < rows; i += gridDim.x * blockDim.x){ + if (myRowId < rows) { + i = myRowId; + + maxdot = 0; + for (int j = 0; j < numclasses; j += threads_per_col ) { + if (maxdot < XW[ j * rows + i ]) maxdot = XW[ j * rows + i ]; + } + + sdata = 0; + for (int j = 0; j < numclasses; j += threads_per_col ) sdata += exp ( XW[ j * rows + i ] - maxdot ); + + //for (int offset = threads_per_col/2; offset > 0; offset /= 2) sdata += my_shfl( sdata, myWarpId + offset ); + + for (int j = 0; j < numclasses; j += threads_per_col ) + XW[ j * rows + i ] = exp( XW[ j * rows + i ] - maxdot ) / (exp(-1. * maxdot) + sdata); + } +} + + +GLOBAL void ker_compute_fx (real *matvec, int rows, int cols, int numclasses, + real *target, real *indicatorVal, int NUM_THREADS, real *maxdots ) +{ + extern __shared__ real my_results[]; + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int myClrId = idx % NUM_THREADS; + int myRowId = idx / NUM_THREADS; + unsigned int lane = threadIdx.x >> 5; + + real sdata = 0; + real maxdot = 0; + + //if (myRowId < rows) { + for (int r = myRowId; r < rows; r += gridDim.x * blockDim.x ) { + maxdot = 0; + for (int i = myClrId; i < numclasses; i += NUM_THREADS){ + if (maxdot < matvec[ i * rows + r ]) maxdot = matvec[ i * rows + r]; + } + + maxdots[ r ] = maxdot; + + for (int i = myClrId; i < numclasses; i += NUM_THREADS){ + if ((int)target[ r ] == (i + 1)) sdata += matvec[ i * rows + r ]; + matvec[ i * rows + r ] = exp( matvec[ i * rows + r ] - maxdot); + } + } + __syncthreads (); + + sdata = warpSum ( sdata ); + if (threadIdx.x % WARP_SIZE == 0) my_results[lane] = sdata ; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + sdata = (threadIdx.x < 1) ? 
my_results[threadIdx.x] : 0; + else + sdata = (threadIdx.x < (blockDim.x/WARP_SIZE)) ? my_results[threadIdx.x] : 0; + __syncthreads (); + + if (lane == 0) sdata = warpSum( sdata ); + if(threadIdx.x == 0) indicatorVal [ blockIdx.x ] = sdata; +} + +GLOBAL void ker_softmax (real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *wspace ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int lane = threadIdx.x >> 5; + + extern __shared__ real sh_vec[]; + real dot = 0; + int myclass = 0; + + real blk_sum = 0; + real psum = 0; + + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + dot = 0; + if (idx < rows ) { + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ]; + psum += exp (dot); + } + __syncthreads (); + } + + // subtract the weights * feature for the class it belongs. + if (idx < rows){ + psum = log( 1 + psum ); + myclass = (int)(target[ idx ] - 1); + } + + if ( idx < rows && myclass < num_classes) { + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + idx ] * weights[ myclass * cols + j ]; + psum = psum - dot; + } + __syncthreads (); + + // block reduction here. + blk_sum = warpSum( psum ); + if (threadIdx.x % WARP_SIZE == 0) sh_vec[lane] = blk_sum; + __syncthreads (); + + if (blockDim.x/WARP_SIZE == 0) + blk_sum = (threadIdx.x < 1) ? sh_vec[threadIdx.x] : 0; + else + blk_sum = (threadIdx.x < (blockDim.x / WARP_SIZE) ) ? sh_vec[ threadIdx.x ] : 0; + __syncthreads (); + + if (lane == 0) blk_sum = warpSum( blk_sum ); + if (threadIdx.x == 0) wspace[ blockIdx.x ] = blk_sum; +} + +GLOBAL void ker_dx_softmax (real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *wspace ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int lane = threadIdx.x >> 5; + extern __shared__ real sh_vec[]; + + real numerator = 0.; + real denominator = 0.; + int indicator = 0; + real multiplier = 0.; + real blk_sum = 0.; + real p_i = 0.; + + real maxdot = 0.; + + if (idx < rows) indicator = (int)(target[ idx ] - 1.); + __syncthreads (); + + //maxdot here. + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x ]; + __syncthreads (); + + numerator = 0.; + if (idx < rows) { + for (int j = 0; j < cols; j ++) + numerator += sh_vec[j] * features[ j * rows + idx ]; + + if (maxdot < numerator) maxdot = numerator; + } + __syncthreads (); + } + + + //denominator here. + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x ]; + __syncthreads (); + + numerator = 0.; + if (idx < rows) { + for (int j = 0; j < cols; j ++) + numerator += sh_vec[j] * features[ j * rows + idx ]; + denominator += exp( numerator - maxdot ); + } + __syncthreads (); + } + + //numerator here. + //dw_i (j) here. + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + numerator = 0; + if ( idx < rows ){ + for (int j = 0; j < cols; j ++) + numerator += sh_vec[j] * features[ j * rows + idx ]; + numerator = exp( numerator - maxdot ); + //p_i = numerator / (1 + denominator); + p_i = numerator / (exp(1. 
* (-maxdot)) + denominator);
+
+ if (i == indicator) multiplier = 1.0;
+ else multiplier = 0.;
+ }
+ __syncthreads ();
+
+ for (int j = 0; j < cols; j ++){
+ blk_sum = 0.;
+ if (idx < rows)
+ blk_sum = (p_i - multiplier) * features[ j * rows + idx ];
+
+ __syncthreads ();
+
+ // block level reduction here.
+ blk_sum = warpSum( blk_sum);
+ if (threadIdx.x % WARP_SIZE == 0) sh_vec[lane] = blk_sum;
+ __syncthreads ();
+
+ if (blockDim.x/WARP_SIZE == 0)
+ blk_sum = (threadIdx.x < 1) ? sh_vec[threadIdx.x] : 0;
+ else
+ blk_sum = (threadIdx.x < (blockDim.x / WARP_SIZE) ) ? sh_vec[ threadIdx.x ] : 0;
+ __syncthreads ();
+
+ if (lane == 0) blk_sum = warpSum( blk_sum );
+ if (threadIdx.x == 0) wspace[ (blockIdx.x * num_classes * cols) + ( i * cols + j ) ] = blk_sum;
+ __syncthreads ();
+ }
+ }
+}
+
+GLOBAL void ker_dx_softmax_mt (real *features, real *target, int rows, int cols, int num_classes,
+ real *weights, real lambda, real *XW, real *expSum, real *wspace, int threads_per_col)
+{
+ extern __shared__ real shmem[];
+
+ int idx = blockDim.x * blockIdx.x + threadIdx.x;
+ int myIdx = idx / threads_per_col;
+
+ real indicator = 0;
+ real class_prob;
+
+ for (int clr = 0; clr < num_classes; clr ++){
+
+ for (int col = myIdx; col < cols; col += threads_per_col){
+
+ shmem[ myIdx ] = 0;
+
+ for (int r = 0; r < rows; r += gridDim.x * blockDim.x ) {
+ class_prob = XW[ clr * rows + r ] / expSum[ r ];
+ indicator = (clr == (int)(target[ r ] - 1.)) ? 1. : 0.;
+ shmem[ myIdx ] += (class_prob - indicator) * features[ col * rows + r ];
+ }
+
+ wspace[ blockIdx.x * num_classes * cols + clr * cols + col ] = shmem[ myIdx ];
+ }
+ }
+}
+
+GLOBAL void ker_dx_softmax_ind( real *hxw, real *target, int rows, int num_classes, real *result, int threads_per_row)
+{
+ int idx = blockDim.x * blockIdx.x + threadIdx.x;
+ int myClrId = idx % threads_per_row;
+ int myRowId = idx / threads_per_row;
+
+ int r = 0;
+
+ //for (int r = idx; r < rows; r += gridDim.x * blockDim.x){
+
+ if (idx < rows ) {
+ r = idx;
+ for (int clr = 0; clr < num_classes; clr ++ ){
+ result[ clr * rows + r ] = hxw[ clr * rows + r ];
+ if (clr == (int)(target[ r ] - 1.)) result[ clr * rows + r ] -= 1.;
+
+ //result[ clr * rows + r ] = 0;
+ //if (clr == (int)(target[ r ] - 1.)) result[ clr * rows + r ] = 1;
+ }
+ }
+}
+
+//Hessian functions here.
+GLOBAL void ker_hx_Xv ( real *features, real *vector, int rows, int cols, int num_classes, real *A ) {
+
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ extern __shared__ real sh_vec[];
+
+ real dot = 0;
+
+ for (int i = 0; i < num_classes; i ++){
+ if (threadIdx.x < cols) sh_vec[threadIdx.x] = vector [i * cols + threadIdx.x ];
+ __syncthreads ();
+
+ if (idx < rows) {
+ dot = 0;
+ for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ];
+ A[ idx + i * rows ] = dot; // column major format here.
+ }
+ __syncthreads ();
+ }
+}
+
+GLOBAL void ker_hx_ProbabilityTerms ( real *features, real *weights, int rows, int cols, int num_classes, real *B ) {
+
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ extern __shared__ real sh_vec[];
+
+ real dot = 0;
+ real sumexp = 0;
+
+ //probability terms here. 
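+ // B holds per-row class probabilities with an implicit reference class:
+ // B[i][k] = exp(x_i' * w_k) / (1 + sum_j exp(x_i' * w_j)). Unlike the
+ // optimized path, this legacy kernel skips the max-subtraction trick, so
+ // it assumes the dot products stay small enough that exp() cannot overflow.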
+ sumexp = 0; + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + if ( idx < rows ){ + dot = 0; + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ]; + sumexp += exp( dot ); + } + __syncthreads (); + } + + for (int i = 0; i < num_classes; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + if ( idx < rows ){ + dot = 0; + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * features[ j * rows + idx ]; + B [ idx + i * rows ] = exp(dot) / (1 + sumexp); + } + __syncthreads (); + } +} + +GLOBAL void ker_hx_C_scale (real *A, real *B, real *C, int rows, int cols, int num_classes, real *scale ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + real sum = 0; + if (idx < rows){ + for (int i = 0; i < num_classes; i ++) + sum += A[ idx + i * rows ] * B[ idx + i * rows ]; + + for (int i = 0; i < num_classes; i ++) + C[ i * rows + idx ] = + (1. / scale[ idx ]) * ( A[ idx + i * rows ] * B[ idx + i * rows ] - + B[ idx + i * rows ] * sum ); + } +} + +GLOBAL void ker_hx_C (real *A, real *B, real *C, int rows, int cols, int num_classes ) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + real sum = 0; + if (idx < rows){ + for (int i = 0; i < num_classes; i ++) + sum += A[ idx + i * rows ] * B[ idx + i * rows ]; + + for (int i = 0; i < num_classes; i ++) + C[ i * rows + idx ] = + A[ idx + i * rows ] * B[ idx + i * rows ] - + B[ idx + i * rows ] * sum ; + } +} + + +real softmax_multiclass_fx (SparseDataset *spfeatures, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *devPtr, real *hostPtr, real *pageLckPtr){ + +/* + ker_softmax <<< BLOCKS, BLOCK_SIZE, sizeof(real) * cols >>> + (features, target, rows, cols, num_classes, weights, lambda, devPtr); + cudaThreadSynchronize (); + cudaCheckError (); + + reduce <<< 1, BLOCKS_POW_2, BLOCKS_POW_2 * sizeof(real) >>> + ( devPtr, pageLckPtr, BLOCKS ); + cudaThreadSynchronize (); + cudaCheckError (); + + cublasCheckError( cublasDnrm2( cublasHandle, num_classes * cols, weights, 1, &pageLckPtr[1])) ; + return (pageLckPtr[0]) + (lambda/2.0) * pow(pageLckPtr[1], 2.); +*/ + + + //matvec operation here. + int power = 1; + real alpha; + real beta; + real *indicatorVal = devPtr + rows * num_classes; + real *maxdots = indicatorVal + rows + BLOCKS_POW_2; + real *alphax = maxdots + rows + BLOCKS_POW_2; + int NUM_THREADS = 1; + + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + rows, num_classes, cols, + &alpha, features, rows, + weights, cols, &beta, devPtr, rows) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, weights, cols, &beta, devPtr, rows ) + ); + } + //fprintf( stderr, "NUM CLASSES --- >%d \n", num_classes ); + //fprintf( stderr, "Matvec: \n"); + //printVector( devPtr, 20, NULL); + + ker_compute_fx <<< BLOCKS * NUM_THREADS, BLOCK_SIZE, WARP_SIZE * sizeof(real) >>> + ( devPtr, rows, cols, num_classes, target, indicatorVal, NUM_THREADS, maxdots); + cudaThreadSynchronize (); + cudaCheckError (); + //fprintf( stderr, "Exp matvec: ... \n"); + //printVector( devPtr, 20, NULL); + //printVector( maxdots, 20, NULL); + + + //reduce the maxdots here. 
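+ // Stability bookkeeping: the per-row log term log(1 + sum_c exp(x_i . w_c))
+ // is evaluated in shifted form as  m_i + log( exp(-m_i) + sum_c exp(x_i . w_c - m_i) ),
+ // where maxdots holds the per-row maxima m_i = max_c x_i . w_c, mirroring what
+ // hostFunction does on the CPU. The reductions below accumulate sum_i m_i into
+ // pageLckPtr[3], the shifted log terms into pageLckPtr[1], and the true-class
+ // dots sum_i x_i . w_{y_i} into pageLckPtr[0]; the return statement reassembles
+ // the objective from these pieces.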
+ reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE * sizeof (real) >>>
+ (maxdots, maxdots + rows, rows );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //printVector (maxdots + rows, 20, NULL );
+
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof( real ) >>>
+ //reduce <<< 1, WARP_SIZE, WARP_SIZE * sizeof( real ) >>>
+ (maxdots + rows, &pageLckPtr[3], BLOCKS );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, "Maxdot sum: ... %e \n", pageLckPtr[3]);
+
+
+ // final value of the indicator
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>>
+ //reduce <<< 1, WARP_SIZE, WARP_SIZE * sizeof(real) >>>
+ ( indicatorVal, &pageLckPtr[0], BLOCKS );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //fprintf( stderr, "Indicator value: %e \n", pageLckPtr[0] );
+
+ /*
+ power = 1;
+ while (power < num_classes) power *= 2;
+
+ //compute the log part there.
+ reduce_vector_mt <<< THREADS_PER_ROW, WARP_SIZE, WARP_SIZE * sizeof(real) >>>
+ (devPtr, devPtr, rows, 1., num_classes);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ */
+
+ //compute the log part here.
+ int warp_blocks = ((rows * WARP_SIZE) / BLOCK_SIZE) +
+ (((rows * WARP_SIZE) % BLOCK_SIZE == 0) ? 0 : 1);
+
+ //reduce_vector_warp_mt <<< warp_blocks, BLOCK_SIZE >>>
+ reduce_vector_warp <<< BLOCKS, BLOCK_SIZE >>>
+ (devPtr, maxdots, alphax, rows, num_classes );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, " Reduce Warp: ....\n");
+ //printVector( alphax, 20, NULL);
+
+
+ //final log part here.
+ reduce_log <<< BLOCKS, BLOCK_SIZE, WARP_SIZE* sizeof(real) >>>
+ //reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE* sizeof(real) >>>
+ //( devPtr, devPtr, rows );
+ ( alphax, alphax + rows, rows );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //printVector( alphax + rows, 20, NULL);
+
+ reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>>
+ //reduce <<< 1, WARP_SIZE, WARP_SIZE * sizeof(real) >>>
+ //( devPtr, &pageLckPtr[1], BLOCKS);
+ ( alphax + rows, &pageLckPtr[1], BLOCKS);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ //fprintf( stderr, "Log part: %e \n", pageLckPtr[1] );
+
+ //return pageLckPtr[1];
+
+ cublasCheckError( cublasDnrm2( cublasHandle, num_classes * cols, weights, 1, &pageLckPtr[2])) ;
+ return (pageLckPtr[3] + pageLckPtr[1]) - pageLckPtr[0] + (lambda/2.0) * pow(pageLckPtr[2], 2.);
+
+}
+
+//the result is a vector here.
+void softmax_multiclass_gx (real *features, real *target, int rows, int cols,
+ int num_classes, real *weights, real lambda, real *gradient,
+ real *devPtr, real *hostPtr, real *pageLckPtr)
+{
+ //the launch configuration was garbled in the source; restored here assuming
+ //the same shared-memory sizing as the ker_softmax call above (cols reals per block).
+ ker_dx_softmax <<< BLOCKS, BLOCK_SIZE, cols * sizeof(real) >>>
+ (features, target, rows, cols, num_classes, weights, lambda, devPtr);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ /*
+ reduce_vector <<<1, BLOCKS_POW_2, (BLOCKS_POW_2/WARP_SIZE) * sizeof (real) >>>
+ (devPtr, gradient, num_classes, cols, 1., BLOCKS );
+ */
+
+ //int maxcomps = num_classes * cols + (num_classes * cols) % THREADS_PER_ROW ;
+ int maxcomps = num_classes * cols ;
+ reduce_vector_mt <<< THREADS_PER_ROW, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>>
+ (devPtr, gradient, maxcomps, 1., BLOCKS );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ if (lambda != 0) {
+ pageLckPtr[0] = lambda ;
+ cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &pageLckPtr[0], weights, 1, gradient, 1) );
+ }
+}
+
+// build the hessian here.
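+//
+// In matrix form, the Hessian-vector product assembled below is
+//     A = X V                                                  (ker_hx_Xv)
+//     B(i,c) = exp(x_i . w_c) / (1 + sum_k exp(x_i . w_k))     (ker_hx_ProbabilityTerms)
+//     C(i,c) = B(i,c) * ( A(i,c) - sum_k B(i,k) A(i,k) )       (ker_hx_C)
+//     Hv = X^T C + lambda * V                                  (cublasDgemm + ker_add_regularizer)
+// hostHessianVector further down in this file is the CPU analogue of the same recipe.
+//
+// Since B depends only on the current weights, not on V, a CG solve at a fixed
+// Newton iterate can compute B once (computeB = 1) and reuse it for all later
+// matvecs (computeB = 0). A sketch of that calling pattern, with illustrative
+// variable names (p, max_cg_iters) that are not from this codebase:
+//
+//   for (int cg_iter = 0; cg_iter < max_cg_iters; cg_iter ++) {
+//       softmax_multiclass_hx( features, rows, cols, num_classes,
+//                              weights, p /* current CG direction */,
+//                              lambda, devPtr, hostPtr, pageLckPtr,
+//                              Hv, B, (cg_iter == 0) ? 1 : 0 );
+//       // ... standard CG updates of p and the residual using Hv ...
+//   }
+//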
+void softmax_multiclass_hx (real *features, int rows, int cols, int num_classes,
+ real *weights, real *vector, real lambda,
+ real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B, int computeB)
+{
+ /*
+ real *A = devPtr;
+ real *B = A + rows * num_classes;
+ real *C = B + rows * num_classes;
+ */
+ real *A = devPtr;
+ real *C = A + rows * num_classes;
+
+ real *alpha = pageLckPtr;
+ real *beta = alpha + 1;
+
+ //compute A here.
+ ker_hx_Xv <<< BLOCKS, BLOCK_SIZE, cols * sizeof(real) >>>
+ (features, vector, rows, cols, num_classes, A);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //Compute B Here.
+ if (computeB >= 1) {
+ //the launch configuration was garbled in the source; restored here to match
+ //the ker_hx_Xv call above (both kernels stage cols weights in shared memory).
+ ker_hx_ProbabilityTerms <<< BLOCKS, BLOCK_SIZE, cols * sizeof(real) >>>
+ (features, weights, rows, cols, num_classes, B);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ }
+
+
+ //Compute C Here.
+ ker_hx_C <<< BLOCKS, BLOCK_SIZE >>>
+ (A, B, C, rows, cols, num_classes);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ //Compute the final Matvec Here.
+ *alpha = 1.0;
+ *beta = 0;
+ cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
+ cols, num_classes, rows,
+ alpha, features, rows,
+ C, rows, beta, Hv, cols ) );
+
+/*
+
+ *alpha = 1./(real)(num_classes * rows);
+ cublasCheckError (cublasDscal( cublasHandle, num_classes * cols, alpha, Hv, 1) );
+*/
+
+
+ if (lambda != 0) {
+ int rblocks = ((num_classes * cols) / BLOCK_SIZE) +
+ (((num_classes * cols) % BLOCK_SIZE == 0) ? 0 : 1 );
+
+ //ker_add_regularizer <<< BLOCKS, BLOCK_SIZE >>>
+ //(Hv, vector, lambda, num_classes * cols, 1./ (real)rows );
+ ker_add_regularizer <<< rblocks, BLOCK_SIZE >>>
+ (Hv, vector, lambda, num_classes * cols, 1. );
+ //(Hv, vector, lambda, num_classes * cols, 1./ ((real)rows * num_classes) );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ }
+}
+
+
+///////////////////////
+//OPTIMIZED CODE HERE
+///////////////////////
+int generateNonUniformSample( real *probs, real *scaleTerms, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr)
+{
+ int count = 0;
+ real *devIndices = devPtr + rows;
+
+ getRandomVector( rows, NULL, devPtr);
+
+ ker_compute_probs <<< BLOCKS, BLOCK_SIZE >>>
+ ( probs, rows, sampleSize, devPtr, devIndices );
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+
+ copy_host_device( hostPtr, devIndices, sizeof(real) * rows,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST);
+
+ for (int i = 0; i < rows; i ++){
+ if (hostPtr[i] != 0)
+ selIndices[ count ++] = i;
+ }
+
+ //prepare scaleTerms here.
+ cuda_memset( scaleTerms, 0, sizeof(real) * rows, 0x99 );
+ cuda_memset( devIndices, 0, sizeof(real) * rows, 0x99 );
+ copy_host_device( selIndices, devIndices, sizeof(int) * count,
+ cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+
+ int blocks = count / BLOCK_SIZE +
+ ((count % BLOCK_SIZE) == 0 ?
0 : 1 ); + ker_init_scaleTerms <<< blocks, BLOCK_SIZE >>> + ( scaleTerms, count, probs, (int *)devIndices ); + cudaThreadSynchronize (); + cudaCheckError (); + + return count; +} + +void computeRowProbabilities( SparseDataset *spfeatures, real *features, int rows, int cols, int numclasses, + real *dHXW, real *rowNrms, real *probs, real *devPtr ) +{ + ker_compute_dHXW_nrm <<< BLOCKS, BLOCK_SIZE >>> + ( dHXW, rowNrms, rows, numclasses); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduce to compute the sum + reduce <<< BLOCKS, BLOCK_SIZE, WARP_SIZE * sizeof (real) >>> + (dHXW, devPtr, rows ); + cudaThreadSynchronize (); + cudaCheckError (); + + reduce <<< 1, BLOCKS_POW_2, WARP_SIZE * sizeof (real) >>> + (devPtr, devPtr + BLOCK_SIZE, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + ker_normalize <<< BLOCKS, BLOCK_SIZE >>> + (dHXW, rows, devPtr + BLOCK_SIZE, probs ); + cudaThreadSynchronize (); + cudaCheckError (); +} + +void computeRowNorms( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr ) +{ + if (features != NULL) { + ker_row_norms <<< BLOCKS, BLOCK_SIZE >>> + ( features, rows, cols, rowNrms ); + cudaThreadSynchronize (); + cudaCheckError (); + } else { + cudaMemcpy( spfeatures->valPtr, spfeatures->sortedVals, + sizeof(real) * spfeatures->nnz, cudaMemcpyDeviceToDevice ); + + int blocks = spfeatures->nnz / (BLOCK_SIZE) + + ((spfeatures->nnz % (BLOCK_SIZE)) == 0 ? 0 : 1 ); + ker_sqr_elements <<< blocks, BLOCK_SIZE >>> + (spfeatures->valPtr, spfeatures->nnz, 1, devPtr); + cudaThreadSynchronize (); + cudaCheckError (); + + //matvec here. for row sums + real alpha = 1.0; + real beta = 0; + + //init the vector here. + blocks = cols / BLOCK_SIZE + (( cols % BLOCK_SIZE == 0) ? 0 : 1 ); + ker_init_ones <<< blocks, BLOCK_SIZE >>> + ( devPtr , cols ); + cudaThreadSynchronize (); + cudaCheckError (); + + cudaMemset( rowNrms, 0, sizeof(real) * rows ); + cusparseCheckError( + cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->valPtr, spfeatures->rowCsrPtr, + spfeatures->colPtr, devPtr, &beta, rowNrms) + ); + ker_sqrt_elements <<< BLOCKS, BLOCK_SIZE >>> + ( rowNrms, rows); + cudaThreadSynchronize (); + cudaCheckError (); + } +} + + +void computeHXW (SparseDataset *spfeatures, real *features, int rows, int cols, int num_classes, + real *weights, real *XW, int subSampling ) { + + + real alpha; + real beta; + + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + rows, num_classes, cols, + &alpha, features, rows, + weights, cols, &beta, XW, rows) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, weights, cols, &beta, XW, rows ) + ); + } + + + if (subSampling >= 1){ + int blocks = rows / BLOCK_SIZE + (((rows % BLOCK_SIZE) == 0) ? 
0 : 1 ); + ker_compute_HXW <<< blocks, BLOCK_SIZE >>> + ( XW, rows, cols, num_classes, 1); + } else { + ker_compute_HXW <<< BLOCKS, BLOCK_SIZE >>> + ( XW, rows, cols, num_classes, 1); + } + cudaThreadSynchronize (); + cudaCheckError (); + +/* + ker_hx_ProbabilityTerms <<>> + (features, weights, rows, cols, num_classes, XW); + cudaThreadSynchronize (); + cudaCheckError (); +*/ +} + + +void computeExpSum( real *XW, int rows, int cols, int num_classes, real *expSumVec ) +{ + ker_compute_expsum <<< BLOCKS, BLOCK_SIZE >>> + ( XW, rows, cols, num_classes, expSumVec, 1 ); + cudaThreadSynchronize (); + cudaCheckError (); +} + +void softmax_multiclass_gx_subsampled (SparseDataset *spfeatures, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *gradient, real *devPtr, real *hostPtr, real *pageLckPtr, + SparseDataset *spGradientSample, real *gradientDataset, SparseDataset *spSampledGradientTrain, + real *gradientLabels, int sampleSize, int samplingType) +{ + real *HXW = devPtr; + real *hxwInd = HXW + rows * num_classes; + + int blocks; + real alpha; + real beta; + + //computeHXW Here. + computeHXW( spSampledGradientTrain, gradientDataset, sampleSize, cols, num_classes, weights, HXW, 1 ); + + blocks = sampleSize / BLOCK_SIZE + ((( sampleSize % BLOCK_SIZE ) == 0) ? 0 : 1 ); + ker_dx_softmax_ind <<< blocks, BLOCK_SIZE >>> + //(HXW, target, sampleSize, num_classes, hxwInd, 1); + (HXW, gradientLabels, sampleSize, num_classes, hxwInd, 1); + cudaThreadSynchronize (); + cudaCheckError (); + + //compute the gradient here. + alpha = 1.0; + beta = 0; + + //perform the X^T * HXWIND + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, sampleSize, + &alpha, gradientDataset, sampleSize, + hxwInd, sampleSize, &beta, gradient, cols ) ); + } else { + cusparseCheckError( + cusparseDcsrmm ( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + sampleSize, num_classes, cols, + spSampledGradientTrain->nnz, &alpha, + spSampledGradientTrain->descr, spSampledGradientTrain->sortedVals, spSampledGradientTrain->rowCsrPtr, + spSampledGradientTrain->colPtr, hxwInd, sampleSize, &beta, gradient, cols ) ); + } + + + //non-uniform subsampling part here. + if (samplingType == 2) { + alpha = ((real)rows)/((real)sampleSize); + cublasCheckError( cublasDscal( cublasHandle, num_classes * cols, &alpha, gradient, 1) ); + } else if (samplingType == 1){ + alpha = ((real) rows) / ((real) sampleSize ); + cublasCheckError( cublasDscal( cublasHandle, num_classes * cols, &alpha, gradient, 1 )); + } + + //regularizer here. + cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &lambda, weights, 1, gradient, 1) ); +} + + +void softmax_multiclass_gx_optimized (SparseDataset *spfeatures, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *HXW, real *gradient, + real *devPtr, real *hostPtr, real *pageLckPtr) +{ + /* + ker_dx_softmax_mt <<>> + (features, target, rows, cols, num_classes, weights, lambda, + HXW, expSumVec, devPtr); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduce across all blocks here. + int maxcomps = num_classes * cols ; + reduce_vector_mt <<< THREADS_PER_ROW, BLOCKS_POW_2, WARP_SIZE * sizeof(real) >>> + (devPtr, gradient, maxcomps, 1., BLOCKS ); + cudaThreadSynchronize (); + cudaCheckError (); + + //regularizer here. 
+ if (lambda != 0) { + pageLckPtr[0] = lambda ; + cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &pageLckPtr[0], weights, 1, gradient, 1) ); + } + */ + + cuda_memset( gradient, 0, sizeof(real) * num_classes * cols, ERROR_MEM_SET ); + + real alpha; + real beta; + real *hxwInd = devPtr; + + real dxnrm; + real gxnrm; + + ker_dx_softmax_ind <<< BLOCKS , BLOCK_SIZE >>> + (HXW, target, rows, num_classes, hxwInd, 1); + + cudaDeviceSynchronize (); + cudaThreadSynchronize (); + cudaCheckError (); + + //cublasCheckError( cublasDnrm2( cublasHandle, rows, hxwInd + rows, 1, &dxnrm)); + //fprintf( stderr, "Norm of the Hxwind matrix is : %f \n", dxnrm * dxnrm ); + //printVector( hxwInd, 100, NULL ); + //printVector( target, 1000, NULL ); + //printVector( target, rows, NULL ); + + //compute the gradient here. + alpha = 1.0; + beta = 0; + + //perform the X^T * HXWIND + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, rows, + &alpha, features, rows, + hxwInd, rows, &beta, gradient, cols ) ); + } else { + //fprintf( stderr, "Spfeatures nnz: %d \n", spfeatures->nnz ); + /* + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->valPtr, spfeatures->rowCsrPtr, + spfeatures->colPtr, HXW, rows, &beta, gradient, cols) ); + */ + + cusparseCheckError( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + rows, num_classes , cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, hxwInd, rows, &beta, gradient, cols ) ); + + //sparse matvec here. + /* + cusparseCheckError( + cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + cols, rows, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->cscValPtr, spfeatures->cscColPtr, + spfeatures->cscRowPtr, hxwInd, &beta, gradient ) + ); + cudaDeviceSynchronize (); + */ + + //writeVector( hxwInd, rows, "first_column.txt", 0 ); + + + //printVector( gradient, 20, NULL ); + //cublasCheckError( cublasDnrm2( cublasHandle, num_classes * cols, gradient, 1, &gxnrm)); + //fprintf ( stderr, "Gx norm: %f \n", gxnrm ); + + } + + //regularizer here. 
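+ // gradient = X^T (P - Y) + lambda * W, where P = HXW holds the class
+ // probabilities and ker_dx_softmax_ind above subtracted the one-hot labels Y
+ // row-wise; the daxpy below adds the lambda * W regularizer term.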
+ cublasCheckError( cublasDaxpy( cublasHandle, num_classes * cols, &lambda, weights, 1, gradient, 1) ); +} + +void softmax_multiclass_hx_subsampled(SparseDataset *spfeatures, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *HXW, + SparseDataset *sampledfeatures, real *sampledDataset, + SparseDataset *spSampledHessianTrainSet, int sampleSize, real *scaleTerms, int samplingType) +{ + real *A = devPtr; + real *B = A + sampleSize * num_classes; + real *C = B + sampleSize * num_classes; + + real alpha,beta; + + //compute A = XV + alpha = 1; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + sampleSize, num_classes, cols, + &alpha, sampledDataset, sampleSize, + vector, cols, &beta, A, sampleSize) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + sampleSize, num_classes, cols, spSampledHessianTrainSet->nnz, + &alpha, spSampledHessianTrainSet->descr, spSampledHessianTrainSet->sortedVals, + spSampledHessianTrainSet->rowCsrPtr, spSampledHessianTrainSet->colPtr, + //vector, cols, &beta, A, rows ) + vector, cols, &beta, A, sampleSize) + ); + + //FIXED-subsampling issue + } + + //compute B here. for sub sample part of the feautre matrix here. + computeHXW( spSampledHessianTrainSet, sampledDataset, sampleSize, cols, num_classes, weights, B, 1 ); + + //Compute C Here. + //ker_hx_C <<< BLOCKS, BLOCK_SIZE >>> + int blocks = sampleSize / BLOCK_SIZE + (((sampleSize % BLOCK_SIZE) == 0) ? 0 : 1); + if (samplingType == 2) { + ker_hx_C_scale <<< blocks, BLOCK_SIZE >>> + (A, B, C, sampleSize, cols, num_classes, scaleTerms); + } else { + ker_hx_C <<< blocks, BLOCK_SIZE >>> + (A, B, C, sampleSize, cols, num_classes); + } + + cudaThreadSynchronize (); + cudaCheckError (); + + //Compute the final Matvec Here. + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, sampleSize, + &alpha, sampledDataset, sampleSize, + C, sampleSize, &beta, Hv, cols ) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + sampleSize, num_classes, cols, spSampledHessianTrainSet->nnz, + &alpha, spSampledHessianTrainSet->descr, spSampledHessianTrainSet->sortedVals, + spSampledHessianTrainSet->rowCsrPtr, spSampledHessianTrainSet->colPtr, + //C, rows, &beta, Hv, cols) + C, sampleSize, &beta, Hv, cols) + ); + + //FIXED subsampling issue + } + + if (samplingType == 1) { + //scale everything here. + alpha = ( ((real) rows) / ((real) sampleSize)); + cublasCheckError( cublasDscal( cublasHandle, num_classes * cols, &alpha, Hv, 1) ); + } + + if (lambda != 0) { + int rblocks = ((num_classes * cols) / BLOCK_SIZE) + + (((num_classes * cols) % BLOCK_SIZE == 0) ? 0 : 1 ); + + ker_add_regularizer <<< rblocks, BLOCK_SIZE >>> + (Hv, vector, lambda, num_classes * cols, 1. 
); + cudaThreadSynchronize (); + cudaCheckError (); + } + +} + +void softmax_multiclass_hx_optimized (SparseDataset *spfeatures, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B ) +{ + real *A = devPtr; + real *C = A + rows * num_classes; + + real alpha,beta; + + //compute A = XV + alpha = 1; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + rows, num_classes, cols, + &alpha, features, rows, + vector, cols, &beta, A, rows) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, vector, cols, &beta, A, rows ) + ); + } + + //Compute C Here. + ker_hx_C <<< BLOCKS, BLOCK_SIZE >>> + (A, B, C, rows, cols, num_classes); + cudaThreadSynchronize (); + cudaCheckError (); + + //Compute the final Matvec Here. + alpha = 1.0; + beta = 0; + if (features) { + cublasCheckError(cublasDgemm( cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, + cols, num_classes, rows, + &alpha, features, rows, + C, rows, &beta, Hv, cols ) ); + } else { + cusparseCheckError ( + cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + rows, num_classes, cols, spfeatures->nnz, + &alpha, spfeatures->descr, spfeatures->sortedVals, spfeatures->rowCsrPtr, + spfeatures->colPtr, C, rows, &beta, Hv, cols) + ); + } + + if (lambda != 0) { + int rblocks = ((num_classes * cols) / BLOCK_SIZE) + + (((num_classes * cols) % BLOCK_SIZE == 0) ? 0 : 1 ); + + ker_add_regularizer <<< rblocks, BLOCK_SIZE >>> + (Hv, vector, lambda, num_classes * cols, 1. ); + cudaThreadSynchronize (); + cudaCheckError (); + } + +} + +//////////////////// +//DONE HERE +//////////////////// + + + +GLOBAL void ker_softmax_predict( real *test_set, real *weights, + int rows, int cols, int numclasses, real *workspace) +{ + extern __shared__ real sh_vec[]; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + real dot = 0; + real sumexp; + real sumprob; + + //probability terms here. + sumexp = 0; + for (int i = 0; i < numclasses; i ++){ + if (threadIdx.x < cols) sh_vec[threadIdx.x] = weights[i * cols + threadIdx.x]; + __syncthreads (); + + if ( idx < rows ){ + dot = 0; + for (int j = 0; j < cols; j ++) dot += sh_vec[j] * test_set[ j * rows + idx ]; + sumexp += exp( dot ); + } + __syncthreads (); + } + + for (int c = 0; c < numclasses; c ++) { + if (threadIdx.x < cols) sh_vec[ threadIdx.x ] = weights[ c * cols + threadIdx.x ]; + __syncthreads (); + + if (idx < rows){ + dot = 0.; + for (int i = 0; i < cols; i ++) dot += test_set[i * rows + idx] * sh_vec[i]; + workspace[ idx * numclasses + c ] = exp(dot) / (1 + sumexp); + } + __syncthreads (); + } +} + +real softmax_predict(SparseDataset *spTest, real *test_set, real *test_labels, real *weights, int rows, int cols, int numclasses, + real *hostWorkspace, real *devWorkspace, int computeDevice, real *h_test_set) +{ + int pblocks = (rows / BLOCK_SIZE) + + ((rows % BLOCK_SIZE) == 0 ? 
0 : 1 );
+ real pmax = 0;
+ real matches = 0;
+ real nomatches = 0;
+ int pclass = -1;
+ real sumprob;
+ real dot, sumexp, maxdot;
+
+ real *h_weights = hostWorkspace;
+ real *temp = h_weights + numclasses * cols;
+
+// fprintf( stderr, "ROWS -----> %d, COLS --------> %d, CLASSES ------> %d \n", rows, cols, numclasses );
+
+ if (computeDevice == 1) {
+ /*
+ ker_softmax_predict <<< pblocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(real) >>>
+ ( test_set, weights, rows, cols, numclasses, devWorkspace);
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+ */
+ computeHXW( spTest, test_set, rows, cols, numclasses, weights, devWorkspace, 0 );
+
+ copy_host_device( temp, devWorkspace, sizeof(real) * numclasses * rows,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST );
+ } else {
+
+ copy_host_device( h_weights, weights, sizeof(real) * numclasses * cols,
+ cudaMemcpyDeviceToHost, ERROR_MEMCPY_DEVICE_HOST );
+
+ for (int i = 0; i < rows; i ++) {
+ sumexp = 0;
+ for (int c = 0; c < numclasses; c ++) {
+ dot = 0;
+ for (int j = 0; j < cols; j ++) dot += h_test_set[ j * rows + i ] * h_weights[ c * cols + j ];
+ sumexp += exp ( dot );
+ }
+ sumexp += 1.;
+
+ for (int c = 0; c < numclasses; c ++) {
+ dot = 0;
+ for (int j = 0; j < cols; j ++) dot += h_test_set[ j * rows + i ] * h_weights[ c * cols + j ];
+ //store column major, matching the device path and the classify loop below.
+ temp[ c * rows + i ] = exp( dot ) / sumexp;
+ }
+ }
+ }
+
+#ifdef __debug__
+// fprintf(stderr, " ---------- Class Probabilities ---------\n");
+#endif
+
+
+ // classify here:
+ // whichever class probability is maximum wins.
+
+/*
+ int counters[numclasses+1], true_counters[numclasses + 1];
+ memset( counters, 0, sizeof(int) * (numclasses + 1) );
+ memset( true_counters, 0, sizeof(int) * (numclasses + 1) );
+*/
+
+
+ for (int i = 0; i < rows; i ++){
+
+ pmax = 0;
+ pclass = -1;
+ sumprob = 0;
+ for (int c = 0; c < numclasses; c ++){
+
+ sumprob += temp[ c * rows + i ];
+ if (pmax < temp[ c * rows + i ]){
+ pmax = temp[c * rows + i];
+ pclass = c + 1;
+ }
+ }
+
+ /*
+ if (pclass < 0) {
+ fprintf( stderr, " Error in predicting classes ..... \n");
+ exit(-1);
+ }
+ */
+
+/*
+ true_counters[ (int)(test_labels[i]-1) ] ++;
+ if (pmax <= (1.- sumprob))
+ counters[numclasses] ++;
+ else
+ counters[ pclass - 1 ] ++;
+*/
+
+
+ /*
+ if ( ((pmax <= (1. - sumprob)) && (test_labels[i] == (numclasses + 1))) ||
+ (pclass == (int)(test_labels[i])) ){
+ matches ++;
+ }
+ */
+ if ((pmax <= (1. - sumprob)) && (test_labels[i] == (numclasses + 1))){
+ matches ++;
+ } else if ((pmax > (1.
- sumprob)) && (pclass == (int)(test_labels[i])) ) { + matches ++; + } else + nomatches ++; + + //for (int c = 0; c < numclasses; c ++) fprintf( stderr, " %e ", temp[ c * rows + i] ); + //fprintf( stderr, "\n"); + } + + +/* + + for (int i = 0; i < numclasses + 1; i ++) + fprintf( stderr, " Class: %d ---> Predicted: %d, TrueCount: %d \n", i + 1, counters[i], true_counters[i] ); + + fprintf( stderr, "Total matches -----> %f, %d, %f \n", matches, rows, nomatches ); +*/ + + //return ((real)matches/(real)rows) * 100.; + return (matches/(matches + nomatches)) * 100.; +} + +void computeErrors ( real *features, real *target, int rows, int cols, int numclasses, + real *devPtr, real *hostPtr, real *pageLckPtr, int numpoints) +{ + int offset = numclasses * cols % 4; + int count; + + real *constPoint = hostPtr; + real *hostPoint = constPoint + numclasses * cols + offset; + real *dx = hostPoint + numclasses * cols + offset; + real *ferror = dx + numclasses * cols + offset; + real *herror = ferror + numpoints; + real *dxs = herror + numpoints; + real *nextHostPtr = dxs + numpoints; + + real *devPoint = devPtr; + real *devDx = devPoint + numclasses * cols + offset; + real *gradient = devDx + numclasses * cols + offset; + real *Hv= gradient + numclasses * cols + offset; + real *devConstPoint = Hv + numclasses * cols + offset; + real *B = devConstPoint + numclasses * cols+ offset; + + //real *nextDevPtr = devConstPoint + numclasses * cols + offset; + real *nextDevPtr = B+ numclasses * rows+ offset; + + real *vv = pageLckPtr; + real *vhv = vv + 1; + real *dxnrm = vhv + 1; + real *nextPagePtr = dxnrm + 1; + + real f; + real f0; + real lambda = 0.; + + real alpha, beta; + + fprintf( stderr, "Number of random numbers to be generated: %d \n", numclasses * cols ); + + memset( constPoint, 0, sizeof(real) * numclasses * cols ); + for (int i = 0; i < numclasses * cols; i ++) constPoint[i] = 0.; + + copy_host_device( constPoint, devPoint, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + copy_host_device( constPoint, devConstPoint, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + getRandomVector( numclasses * cols, dx, nextDevPtr); + //for (int i = 0; i < numclasses * cols; i ++) dx[i] = 1.; + //count = readVector( dx, numclasses * cols, "dx_forest.txt"); + //fprintf( stderr, "Read the random vector from file as: %d \n", count ); + + //printHostVector( dx, numclasses * cols ); + + //f0 + f0 = softmax_multiclass_fx (NULL, features, target, rows, cols, numclasses, + devPoint, lambda, nextDevPtr, nextHostPtr, nextPagePtr); + + //g0 + softmax_multiclass_gx (features, target, rows, cols, + numclasses, devPoint, lambda, gradient, + nextDevPtr, nextHostPtr, nextPagePtr); + fprintf( stderr, "Gradient of the Softmax function is .... \n"); + //printVector( gradient, numclasses * cols, NULL ); + +/* + softmax_multiclass_hx (features, rows, cols, numclasses, + devConstPoint, devConstPoint, 0, nextDevPtr, nextHostPtr, nextPagePtr, Hv ); + printVector( Hv, numclasses * cols, NULL ); +*/ + + fprintf( stderr, "Starting the derivative test .. 
%f\n", f0); + + for (int i = 0; i < numpoints; i ++) { + + for (int j = 0; j < numclasses*cols; j ++) hostPoint[j] = constPoint[j] + dx[j]; + + copy_host_device( hostPoint, devPoint, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_DEVICE_HOST); + copy_host_device( dx, devDx, sizeof(real) * numclasses * cols, + cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE ); + + //function evaluation here. + f = softmax_multiclass_fx (NULL, features, target, rows, cols, numclasses, + devPoint, lambda, nextDevPtr, nextHostPtr, nextPagePtr); + + //first order error + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, gradient, 1, devDx, 1, vv) ); + ferror[i] = f - (f0 + *vv); + + //second order error + softmax_multiclass_hx (features, rows, cols, numclasses, + devConstPoint, devDx, 0, nextDevPtr, nextHostPtr, nextPagePtr, Hv, B, 1 ); + *vhv= 0; + cublasCheckError( cublasDdot( cublasHandle, numclasses * cols, devDx, 1, Hv, 1, vhv) ); + + //herror[i] = f - (f0 + *vv + (0.5 * (*vhv)) / (real)rows ); + herror[i] = f - (f0 + *vv + 0.5 * (*vhv) ); + + fprintf( stderr, "%d: f --> %e, vv --> %e, vhv--> %e, ferr: %e, herr: %e \n", + i, f, *vv, *vhv, ferror[i], herror[i] ); + +//exit(-1); + //dxs here. + *dxnrm = 0; + cublasCheckError( cublasDnrm2( cublasHandle, numclasses * cols, devDx, 1, dxnrm)); + dxs[i] = *dxnrm; + + for (int j = 0; j < numclasses*cols; j ++) dx[j] = dx[j] / 2.0; + //break; + } + + writeVector( ferror, numpoints, "./ferror.txt", 1 ); //host + writeVector( herror, numpoints, "./herror.txt", 1 ); //host + + //write dx.^2 here + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 2.); + writeVector( constPoint, numpoints, "./dxs_2.txt", 1 ); //host + + //write dx.^3 here + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 3.); + writeVector( constPoint, numpoints, "./dxs_3.txt", 1 ); //host +} + + +//////////////////////// +////HOST Computations Here. +//////////////////////// +real hostFunctionExact( real *features, real *target, real *weights, int numclasses, int rows, int cols) +{ + real logpart = 0; + real classpart = 0; + real dot, sumexp; + + real maxdot = 0; + + for (int i = 0; i < rows; i ++) { + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + + if (maxdot < dot) maxdot = dot; + + sumexp += exp( dot ); + } + logpart += log( 1 + sumexp ); + + + int myclass = (int)(target[ i ] - 1.); + + dot = 0; + if (myclass < numclasses) + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ myclass * cols + j ]; + + classpart += dot; + } + + return (logpart - classpart) / ((real) rows); +} + + + +real hostFunction( real *features, real *target, real *weights, int numclasses, int rows, int cols) +{ + real logpart = 0; + real classpart = 0; + real dot, alphax, maxdot, sumexp; + + for (int i = 0; i < rows; i ++) { + + maxdot = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + + if (dot > maxdot ) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + + sumexp += exp( dot - maxdot ); + } + alphax = exp( -1. 
* (maxdot) ) + sumexp; + logpart += (maxdot + log( alphax )); + + + int myclass = (int)(target[ i ] - 1.); + + dot = 0; + if (myclass < numclasses) + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ myclass * cols + j ]; + + classpart += dot; + } + + //return (logpart - classpart) / ((real) rows); + return (logpart - classpart); +} + +void hostGradientExact( real *features, real *target, int numclasses, int rows, int cols, real *weights, real *gradient) +{ + int myclass = 0; + real dot = 0, sumexp = 0; + real pi; + + memset( gradient, 0, sizeof(real) * numclasses * cols ); + + for (int i = 0; i < rows; i ++) { + myclass = (int)(target[ i ] - 1.); + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot ); + } + + + for (int c = 0; c < numclasses; c ++){ + pi = 0; + for (int j = 0; j < cols; j ++) + pi += features[ j * rows + i ] * weights[ c * cols + j ]; + + pi = exp(pi) / (1 + sumexp); + + for (int j = 0; j < cols; j ++){ + gradient[ c * cols + j ] += (pi - ((myclass == c) ? 1. : 0.)) * features[ j * rows + i ]; + } + } + } + for (int i = 0; i < numclasses * cols; i ++) gradient[i] = gradient[i] / ((real) rows); +} + +void hostGradient( real *features, real *target, int numclasses, int rows, int cols, real *weights, real *gradient) +{ + int myclass = 0; + real dot = 0, maxdot = 0, sumexp = 0, alphax = 0; + real pi; + + memset( gradient, 0, sizeof(real) * numclasses * cols ); + + for (int i = 0; i < rows; i ++) { + sumexp = maxdot = alphax = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + if (dot > maxdot) maxdot = dot; + } + + myclass = (int)(target[ i ] - 1.); + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot - maxdot); + } + alphax = exp( -1. * (maxdot ) ) + sumexp; + + + for (int c = 0; c < numclasses; c ++){ + pi = 0; + for (int j = 0; j < cols; j ++) + pi += features[ j * rows + i ] * weights[ c * cols + j ]; + + pi = exp(pi - maxdot) / alphax; + + for (int j = 0; j < cols; j ++){ + gradient[ c * cols + j ] += (pi - ((myclass == c) ? 1. : 0.)) * features[ j * rows + i ]; + } + } + } + + //for (int i = 0; i < numclasses * cols; i ++) gradient[i] = gradient[i] / ((real) rows); +} + +void computeScale( real *features, real *target, real *weights, int numclasses, int rows, int cols, real *scale, int a, int b) +{ + real sumexp, pa, pb, dot; + real maxdot; + + if (a == b) { + for (int i = 0; i < rows; i ++) { + maxdot = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + if (maxdot < dot) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot - maxdot ); + } + sumexp += exp( -1. * maxdot ); + + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ a * cols + j ]; + + scale [ i ] = (exp(dot - maxdot) / sumexp) * (1. 
- (exp(dot - maxdot)/sumexp)); + } + } + else { + for (int i = 0; i < rows; i ++) { + + maxdot = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + if (maxdot < dot) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++) { + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ c * cols + j ]; + sumexp += exp( dot - maxdot); + } + sumexp += exp(-1 * maxdot ); + + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ a * cols + j ]; + + pa = exp( dot - maxdot) / sumexp; + + dot = 0; + for (int j = 0; j < cols; j ++) dot += features[ j * rows + i ] * weights[ b * cols + j ]; + pb = exp( dot - maxdot) / sumexp; + + scale[ i ] = -1. * pa * pb; + } + } +} + +void computescalex (real *features, real *target, int numclasses, int rows, int cols, real *scale, real *temp ) +{ + for (int i = 0; i < rows; i ++) + for (int j = 0; j < cols; j ++) + temp[ j * rows + i ] = scale[ i ] * features[ j * rows + i ]; + +} +void computextscale (real *features, real *target, int numclasses, int rows, int cols, real *temp, real *block ) +{ + memset( block, 0, sizeof(real) * cols * cols ); + + for (int i = 0; i < cols; i ++){ + for (int j = 0; j < cols; j ++) { + for (int k = 0; k < rows; k ++) { + block[ i * cols + j ] += + features[i * rows + k] * temp[j * rows + k]; + } + //block[ i * cols + j ] /= (real) rows; + } + } + + + //column major * times * column major format here. +/* + for (int i = 0; i < cols; i++) { + for (int j = 0; j < cols; j ++){ + for (int k = 0; k < rows; k ++){ + block[ i * cols + j ] += + features[ i * rows + k ] * temp[ j * rows + k ]; + } + } + } + + for (int i = 0; i < cols; i++) { + for (int j = 0; j < cols; j ++){ + block[ i * cols + j ] = block[ i * cols + j ] / (real) rows; + } + } +*/ +} + + +void hostHessian( real *features, real *target, int numclasses, int rows, int cols, real *weights, real *hessian, real *s ) +{ + real *scale = s; + real *temp = scale + rows; + real *block = temp + rows * cols; + real *offset; + + memset( hessian, 0, sizeof(real) * numclasses * numclasses * cols * cols ); + + for (int i = 0; i < numclasses; i ++){ + for (int j = 0; j < numclasses; j ++){ + computeScale ( features, target, weights, numclasses, rows, cols, scale, i, j ); + //for ( int k = 0; k < rows; k ++) scale[k] = 1.; + + computescalex( features, target, numclasses, rows, cols, scale, temp ); + computextscale( features, target, numclasses, rows, cols, temp, block ); + + offset = hessian + i * (numclasses * cols) * cols + j * cols; + for (int k = 0; k < cols; k ++) + memcpy( offset + k * numclasses * cols, block + k * cols, sizeof(real) * cols ); + } + } +} + +void hostHessianVector( real *features, real *target, real *weights, int numclasses, int rows, int cols, + real *vector, real *result, real *temp) { + real *A = temp; + real *B = A + rows * numclasses; + real *C = B + rows * numclasses; + + real dot, sumexp, maxdot; + real pw, sum; + + memset( result, 0, sizeof(real) * numclasses * cols ); + + //compute A. - stored in column major order + for (int i = 0; i < rows; i ++){ + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++){ + dot += features[ j * rows + i ] * vector[ c * cols + j]; + } + //A[ i * numclasses + c ] = dot; + A[ c * rows + i ] = dot; + } + } + + //compute B here. 
- stored in column major order + for (int i = 0; i < rows; i ++){ + maxdot = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[j * rows + i ] * weights[ c * cols + j ]; + + if (maxdot < dot) maxdot = dot; + } + + sumexp = 0; + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[j * rows + i ] * weights[ c * cols + j ]; + + sumexp += exp( dot - maxdot ); + } + sumexp += exp( -1 * maxdot ); + + for (int c = 0; c < numclasses; c ++){ + dot = 0; + for (int j = 0; j < cols; j ++) + dot += features[j * rows + i ] * weights[ c * cols + j ]; + + pw = exp( dot - maxdot ) / sumexp; + //B[ i * numclasses + c ] = pw; + B[ c * rows + i ] = pw; + } + } + + //compute C here. - stored in column major order + for (int i = 0; i < rows; i ++){ + sum = 0; + for (int k = 0; k < numclasses; k ++) + //sum += A[ i * numclasses + k ] * B [ i * numclasses + k ]; + sum += A[ k * rows + i ] * B [ k * rows + i ]; + + for (int j = 0; j < numclasses; j ++){ + /* + C[ i * numclasses + j ] = + A[ i * numclasses + j ] * B [ i * numclasses + j ] - + B[ i * numclasses + j ] * sum; + */ + C[ j * rows + i ] = + A[ j * rows + i ] * B [ j * rows + i ] - + B[ j * rows + i ] * sum; + + } + } + + //compute Hessian * vector here. +/* + for (int i = 0; i < cols; i ++){ + for (int j = 0; j < numclasses; j ++){ + for (int k = 0; k < rows; k ++) { + result[ j * cols + i ] += + features[ i * rows + k ] * C[k * numclasses + j ]; + } + } + } +*/ + + + + //Compute XT * C = stored in column major format + for (int i = 0; i < cols; i ++){ + for (int j = 0; j < numclasses; j ++){ + for (int k = 0; k < rows; k ++){ + result[ j * cols + i ] += features[ i * rows + k ] * C[ j * rows + k ]; + } + } + } +} + +void hostDerivativeTest ( real *features, real *target, int rows, int cols, int numclasses, + real *hostPtr, real *devPtr, int numpoints) +{ + int offset = (numclasses * cols) % 4; + + real *constPoint = hostPtr; + real *hostPoint = constPoint + numclasses * cols + offset; + real *dx = hostPoint + numclasses * cols + offset; + real *ferror = dx + numclasses * cols + offset; + real *dxs = ferror + numpoints; + real *gradient = dxs + numpoints; + real *hessian = gradient + numclasses * cols; + real *herror = hessian + numclasses * numclasses * cols * cols ; + real *Hv = herror + numpoints; + real *hexplicit = Hv + numclasses * cols; + real *nextHostPtr = hexplicit + numpoints; + + real f; + real f0; + real vv = 0; + real vhve, vhv, sum; + real dxnrm = 0; + +/* + for (int i = 0; i < cols; i ++) + for (int j = 0; j < rows; j ++) + features[i * rows + j ] = i+1; + + printHostVector( features, 10 ); + printHostVector( features + rows, 10 ); + printHostVector( features + 2*rows, 10 ); +*/ + + + fprintf( stderr, "Number of random numbers to be generated: %d, %d, %d \n", (numclasses) * cols, rows, cols ); + + memset( constPoint, 0, sizeof(real) * (numclasses) * cols ); + for (int i = 0; i < (numclasses) * cols; i ++ ) constPoint[i] = 1.0; + //getRandomVector((numclasses) * cols, dx, devPtr); + //for (int i = 0; i < (numclasses) * cols; i ++ ) dx[i] = 1.0; + int count = readVector( dx, numclasses * cols, "dx_forest.txt", 0); + fprintf( stderr, "Total Points read from file: %d \n", count ); + + f0 = hostFunction(features, target, constPoint, numclasses, rows, cols ); + hostGradient(features, target, numclasses, rows, cols, constPoint, gradient); + //hostHessian( features, target, numclasses, rows, cols, constPoint, hessian, nextHostPtr); + +/* + 
fprintf( stderr, "Hessian Matrix.... \n"); + for (int i = 0; i < numclasses * cols; i ++){ + for (int j = 0; j < numclasses * cols; j ++) + fprintf (stderr, " %e ", hessian[ i * numclasses * cols + j ] ); + fprintf (stderr, "\n"); + } + + fprintf( stderr, "Explicit Hessian vecotr product \n"); + for (int j = 0; j < numclasses * cols; j ++) { + sum = 0; + for (int k = 0; k < numclasses * cols; k ++) + sum += hessian[ j * numclasses * cols + k ] * dx[k]; + fprintf( stderr, " %e ", sum ); + } + fprintf( stderr, "\n"); + + + + hostHessianVector( features, target, constPoint, numclasses, rows, cols, dx, Hv, nextHostPtr ); + fprintf( stderr, "Hessian vecotr product \n"); + printHostVector( Hv, numclasses * cols ); + + exit (-1); +*/ + + fprintf( stderr, " Function at 0: %f \n", f0); + //printHostVector( gradient, numclasses * cols ); + + + for (int i = 0; i < numpoints; i ++) { + for (int j = 0; j < (numclasses)*cols; j ++) hostPoint[j] = constPoint[j] + dx[j]; + + f = hostFunction(features, target, hostPoint, numclasses, rows, cols ); + + /*first order error*/ + vv = 0; + for (int j = 0; j < (numclasses) * cols; j ++) vv += gradient[j] * dx[j]; + ferror[i] = f - (f0 + vv); + + /* second order error */ + vhv = vhve = 0; + + /* + for (int j = 0; j < numclasses * cols; j ++) { + sum = 0; + for (int k = 0; k < numclasses * cols; k ++) + sum += hessian[ j * numclasses * cols + k ] * dx[k]; + + //fprintf( stderr, " %e ", sum ); + vhve += dx[j] * sum; + } + //fprintf( stderr, "\n"); + */ + + + hostHessianVector( features, target, constPoint, numclasses, rows, cols, dx, Hv, nextHostPtr ); + //printHostVector( Hv, numclasses * cols ); + + //for (int j = 0; j < numclasses * cols ; j ++) vhv += Hv[ j ] * dx[ j ] / (real) rows; + for (int j = 0; j < numclasses * cols ; j ++) vhv += Hv[ j ] * dx[ j ]; + + //hexplicit[i] = f - (f0 + vv + 0.5 * vhve); + herror[i] = f - (f0 + vv + 0.5 * vhv); + + /*dxs here. */ + dxnrm = 0; + for (int j = 0; j < (numclasses) * cols; j ++) dxnrm += dx[j] * dx[j]; + dxs[i] = sqrt( dxnrm ); + + for (int j = 0; j < (numclasses)*cols; j ++) dx[j] = dx[j] / 2.0; + + fprintf( stderr, "%d: f : %e, vv : %e, ferr: %e, dx_2: %e, vhv: %e, herr: %e, dx_3: %e\n", + i, f, vv, ferror[i], pow(dxs[i], 2.0), vhv, herror[i], pow(dxs[i], 3.) ); + //fprintf( stderr, "%d: f : %e, vv : %e, ferr: %e, dx_2: %e, vhve: %e, herr: %e, dx_3: %e\n", + // i, f, vv, ferror[i], pow(dxs[i], 2.0), vhve, hexplicit[i], pow(dxs[i], 3.) 
); + + } + + writeVector( ferror, numpoints, "./ferror.txt", 1 ); /* host */ + writeVector( herror, numpoints, "./herror.txt", 1 ); /* host */ + //writeVector( hexplicit, numpoints, "./hexplicit.txt", 1 ); /* host */ + + /* write dx.^2 here */ + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 2.); + writeVector( hostPtr, numpoints, "./dxs_2.txt", 1 ); /* host */ + + for (int j = 0; j < numpoints; j ++) hostPtr[j] = pow(dxs[j], 3.); + writeVector( hostPtr, numpoints, "./dxs_3.txt", 1 ); /* host */ +} diff --git a/code/cuda/RC-FINAL-5/softmax_multiclass.h b/code/cuda/RC-FINAL-5/softmax_multiclass.h new file mode 100644 index 0000000..a48995e --- /dev/null +++ b/code/cuda/RC-FINAL-5/softmax_multiclass.h @@ -0,0 +1,58 @@ +#ifndef __SOFTMAX_MULTICLASS_H__ +#define __SOFTMAX_MULTICLASS_H__ + +#include "cuda_types.h" +#include "dataset.h" + + +int generateNonUniformSample( real *probs, real *scaleTerm, int rows, int sampleSize, int *selIndices, real *devPtr, real *hostPtr); +void computeRowProbabilities( SparseDataset *spfeatures, real *features, int rows, int cols, int numclasses, + real *dHXW, real *rowNrms, real *probs, real *devPtr ); +void computeRowNorms( SparseDataset *spfeatures, real *features, int rows, int cols, real *rowNrms, real *devPtr ); +void computeDiagHXW( real *XW, int rows, int num_classes, real *dXW ); + + +real softmax_multiclass_fx (SparseDataset *, real *, real *, int , int , int, real *, + real , real *, real *, real *); +void softmax_multiclass_gx (real *, real *, int , int , + int , real *, real , real *, + real *, real *, real *); +void softmax_multiclass_hx (real *, int , int , int , + real *, real *, real , + real *, real *, real *, real *, real *, int); + +void computeHXW (SparseDataset *, real *features, int rows, int cols, int num_classes, real *weights, real *XW, int subSampling ); + +void computeExpSum( real *XW, int rows, int cols, int num_classes, real *expSumVec ); + +void softmax_multiclass_gx_optimized (SparseDataset *, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *XW, real *gradient, + real *devPtr, real *hostPtr, real *pageLckPtr); + +void softmax_multiclass_gx_subsampled(SparseDataset *, real *features, real *target, int rows, int cols, int num_classes, + real *weights, real lambda, real *gradient, + real *devPtr, real *hostPtr, real *pageLckPtr, + SparseDataset *, real *, SparseDataset *, real *, int, int ); + +void softmax_multiclass_hx_subsampled(SparseDataset *, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B, + SparseDataset *, real *, SparseDataset *, int, real *, int ); + +void softmax_multiclass_hx_optimized (SparseDataset *, real *features, int rows, int cols, int num_classes, + real *weights, real *vector, real lambda, + real *devPtr, real *hostPtr, real *pageLckPtr, real *Hv, real *B ); + +real softmax_predict(SparseDataset *, real *, real *, real *, int , int , int , + real *, real *, int, real *); + +void expTest( real *results, int count, real *host); + +void computeErrors ( real *, real *, int , int , int , + real *, real *, real *, int ); + + +void hostDerivativeTest ( real *, real *, int , int , int , + real *, real *, int); + +#endif diff --git a/code/cuda/RC-FINAL-5/sparse_dataset.cu b/code/cuda/RC-FINAL-5/sparse_dataset.cu new file mode 100644 index 0000000..22d01b7 --- /dev/null +++ b/code/cuda/RC-FINAL-5/sparse_dataset.cu @@ -0,0 +1,178 @@ + 
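+// sparse_dataset.cu: cuSPARSE matrix descriptor setup and COO -> CSR
+// conversion for the train/test sets and the subsampled sampling matrices.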
+#include "cuda_types.h" +#include "cuda_utils.h" +#include "sparse_dataset.h" + +void initMatDescriptors( DeviceDataset *d ) +{ + //Train + cusparseCheckError ( cusparseCreateMatDescr(&(d->spTrain.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spTrain.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spTrain.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); + + //Test + cusparseCheckError ( cusparseCreateMatDescr(&(d->spTest.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spTest.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spTest.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); +} + +void initMatDescriptorsForSampling( DeviceDataset *d ) { + + //SubSampling - Hessian + cusparseCheckError ( cusparseCreateMatDescr(&(d->spHessianSample.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spHessianSample.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spHessianSample.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); + + //gradient + cusparseCheckError ( cusparseCreateMatDescr(&(d->spGradientSample.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spGradientSample.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spGradientSample.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); +} + +void initMatDescriptorsForSparseSampling( DeviceDataset *d ) { + + //SubSampling - Hessian + cusparseCheckError ( cusparseCreateMatDescr(&(d->spSampledHessianTrain.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spSampledHessianTrain.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spSampledHessianTrain.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); + + //gradient + cusparseCheckError ( cusparseCreateMatDescr(&(d->spSampledGradientTrain.descr)) ); + cusparseCheckError ( cusparseSetMatIndexBase(d->spSampledGradientTrain.descr, CUSPARSE_INDEX_BASE_ZERO) ); + cusparseCheckError ( cusparseSetMatType(d->spSampledGradientTrain.descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); +} + +void convertGradientSampleToCSR (SparseDataset *spGradientSample, int sampleSize, int cols, real *devPtr) { + + //make sure that the data is sorted here. + size_t pBufferSizeInBytes = 0; + void* pBuffer = (void *)devPtr; + + //Sampled Dataset Here. + cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, sampleSize, cols, spGradientSample->nnz, + spGradientSample->rowPtr, spGradientSample->colPtr, &pBufferSizeInBytes ) ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, spGradientSample->nnz, spGradientSample->P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, sampleSize, cols, spGradientSample->nnz, + spGradientSample->rowPtr, spGradientSample->colPtr, spGradientSample->P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, spGradientSample->nnz, spGradientSample->valPtr, + spGradientSample->sortedVals, spGradientSample->P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, spGradientSample->rowPtr, spGradientSample->nnz, sampleSize, + spGradientSample->rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + //fprintf( stderr, "Converting gradient to CSR .... \n"); +} + + +void convertHessianSampleToCSR (SparseDataset *spHessianSample, int sampleSize, int cols, real *devPtr) { + + //make sure that the data is sorted here. + size_t pBufferSizeInBytes = 0; + void* pBuffer = (void *)devPtr; + + //Sampled Dataset Here. 
+ cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, sampleSize, cols, spHessianSample->nnz, + spHessianSample->rowPtr, spHessianSample->colPtr, &pBufferSizeInBytes ) ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, spHessianSample->nnz, spHessianSample->P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, sampleSize, cols, spHessianSample->nnz, + spHessianSample->rowPtr, spHessianSample->colPtr, spHessianSample->P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, spHessianSample->nnz, spHessianSample->valPtr, + spHessianSample->sortedVals, spHessianSample->P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, spHessianSample->rowPtr, spHessianSample->nnz, sampleSize, + spHessianSample->rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + //fprintf( stderr, "Converting hessian to CSR .... \n"); +} + +void convertToCSR( DeviceDataset *d, real *devPtr ) +{ + //make sure that the data is sorted here. + size_t pBufferSizeInBytes = 0; + void* pBuffer = (void *)devPtr; + + //Train Dataset Here. + cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, d->rows, d->cols, d->spTrain.nnz, + d->spTrain.rowPtr, d->spTrain.colPtr, &pBufferSizeInBytes ) ); + fprintf( stderr, "Memory needed to sort coo data --> %d \n", pBufferSizeInBytes ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, d->spTrain.nnz, d->spTrain.P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, d->rows, d->cols, d->spTrain.nnz, + d->spTrain.rowPtr, d->spTrain.colPtr, d->spTrain.P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, d->spTrain.nnz, d->spTrain.valPtr, + d->spTrain.sortedVals, d->spTrain.P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, d->spTrain.rowPtr, d->spTrain.nnz, d->rows, + d->spTrain.rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + + //Test Dataset here. + cusparseCheckError( + cusparseXcoosort_bufferSizeExt( + cusparseHandle, d->rows, d->cols, d->spTest.nnz, + d->spTest.rowPtr, d->spTest.colPtr, &pBufferSizeInBytes ) ); + fprintf( stderr, "Memory needed to sort coo data --> %d \n", pBufferSizeInBytes ); + + cusparseCheckError( + cusparseCreateIdentityPermutation( cusparseHandle, d->spTest.nnz, d->spTest.P) ); + + cusparseCheckError( + cusparseXcoosortByRow( cusparseHandle, d->rows, d->cols, d->spTest.nnz, + d->spTest.rowPtr, d->spTest.colPtr, d->spTest.P, pBuffer ) ); + + cusparseCheckError( + cusparseDgthr( cusparseHandle, d->spTest.nnz, d->spTest.valPtr, + d->spTest.sortedVals, d->spTest.P, CUSPARSE_INDEX_BASE_ZERO ) ); + + //convert to csr format. + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, d->spTest.rowPtr, d->spTest.nnz, d->rows, + d->spTest.rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + +/* + cusparseCheckError( + cusparseXcoo2csr( cusparseHandle, d->spTest.rowPtr, d->spTest.nnz, d->testSize, + d->spTest.rowCsrPtr, CUSPARSE_INDEX_BASE_ZERO ) + ); + + //convert the csr matrix to csc matrix here. 
+ cusparseCheckError(
+ cusparseDcsr2csc( cusparseHandle, d->rows, d->cols, d->spTrain.nnz,
+ d->spTrain.valPtr, d->spTrain.rowCsrPtr, d->spTrain.colPtr,
+ d->spTrain.cscValPtr, d->spTrain.cscRowPtr, d->spTrain.cscColPtr,
+ CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO ) );
+*/
+}
diff --git a/code/cuda/RC-FINAL-5/sparse_dataset.h b/code/cuda/RC-FINAL-5/sparse_dataset.h new file mode 100644 index 0000000..49b9fa6 --- /dev/null +++ b/code/cuda/RC-FINAL-5/sparse_dataset.h @@ -0,0 +1,14 @@
+#ifndef __H_SPARSE_DATASET__
+#define __H_SPARSE_DATASET__
+
+#include "dataset.h"
+
+void convertToCSR( DeviceDataset *, real * );
+void convertHessianSampleToCSR (SparseDataset *spSampleHessian, int sampleSize, int cols, real *devPtr);
+void convertGradientSampleToCSR (SparseDataset *spSampleHessian, int sampleSize, int cols, real *devPtr);
+
+void initMatDescriptors( DeviceDataset *d );
+void initMatDescriptorsForSampling( DeviceDataset *d );
+void initMatDescriptorsForSparseSampling( DeviceDataset *d );
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/subsampling_helpers.cu b/code/cuda/RC-FINAL-5/subsampling_helpers.cu new file mode 100644 index 0000000..538e982 --- /dev/null +++ b/code/cuda/RC-FINAL-5/subsampling_helpers.cu @@ -0,0 +1,122 @@
+
+#include <stdlib.h> /* the original include target was stripped from the source; stdlib.h assumed, for rand() */
+
+#include "cuda_utils.h"
+#include "print_utils.h"
+#include "gen_random.h"
+
+GLOBAL void kerInitSampleMatrix( int *row, int *col, real *val, real *labels, real *srcLabels, int count, int offset, int maxRows )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (idx < count) {
+ row[ idx ] = idx;
+ val[ idx ] = 1.;
+
+ //reshuffle the labels here.
+ labels[ idx ] = srcLabels[ col[ idx ] ] ;
+ }
+}
+
+GLOBAL void kerInitSampleMatrixNoLabels( int *row, int *col, real *val, int count, int offset, int maxRows )
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (idx < count) {
+ row[ idx ] = idx;
+ val[ idx ] = 1.;
+ }
+}
+
+void initSubSampledHessian( int offset, int rows, SparseDataset *sampledSet, real *sampledLabels, real *srcLabels, int sampledSize ){
+
+ int blocks = (sampledSize / BLOCK_SIZE) +
+ (((sampledSize % BLOCK_SIZE) == 0) ? 0 : 1) ;
+
+ if (sampledLabels == NULL && srcLabels == NULL){
+ kerInitSampleMatrixNoLabels <<< blocks, BLOCK_SIZE >>>
+ (sampledSet->rowPtr, sampledSet->colPtr, sampledSet->valPtr, sampledSize, offset, rows );
+ } else {
+ kerInitSampleMatrix <<< blocks, BLOCK_SIZE >>>
+ (sampledSet->rowPtr, sampledSet->colPtr, sampledSet->valPtr, sampledLabels, srcLabels,
+ sampledSize, offset, rows );
+ }
+ cudaThreadSynchronize ();
+ cudaCheckError ();
+}
+
+void prepareForNonUniformSampling (SparseDataset *samplingMat, int sampleSize, int *indices) {
+
+ copy_host_device( indices, samplingMat->colPtr, sizeof(int) * sampleSize,
+ cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+ initSubSampledHessian( -1, -1, samplingMat, NULL, NULL, sampleSize);
+}
+
+void prepareForSampling (SparseDataset *sampledGradient, real *sampledLabels, real *srcLabels, int rows, int sampleSize, int *hostPtr) {
+
+ int startRow = -1;
+
+ //generate random rows here for sampling.
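+ // Uniform row subsampling via a sparse selection matrix S of size
+ // sampleSize x rows: colPtr receives sampleSize random source-row indices,
+ // kerInitSampleMatrix sets row[i] = i and val[i] = 1, so S * X
+ // (sampleDataset / sampleSparseDataset below) extracts the sampled rows
+ // while the labels are gathered to match.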
+	//genRandomVector( hostPtr, sampleSize, rows );
+	genRandomVector( hostPtr, sampleSize, rows - 1 );
+
+	copy_host_device( hostPtr, sampledGradient->colPtr, sizeof(int) * sampleSize,
+		cudaMemcpyHostToDevice, ERROR_MEMCPY_HOST_DEVICE );
+
+	startRow = rand () % rows;
+	initSubSampledHessian( startRow, rows, sampledGradient, sampledLabels, srcLabels, sampleSize);
+}
+
+void sampleDataset ( SparseDataset *spSampledGradient, real *dataset,
+			int rows, int cols, int num_classes,
+			real *subSampledGradient, int sampleSize )
+{
+	real alpha = 1.0;
+	real beta = 0;
+
+	cusparseCheckError (
+		cusparseDcsrmm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+			sampleSize, cols, rows, spSampledGradient->nnz,
+			&alpha, spSampledGradient->descr, spSampledGradient->sortedVals, spSampledGradient->rowCsrPtr,
+			spSampledGradient->colPtr, dataset, rows, &beta, subSampledGradient, sampleSize)
+		);
+}
+
+void sampleSparseDataset ( SparseDataset *spSampler, SparseDataset *spDataset,
+				int rows, int cols, int num_classes,
+				SparseDataset *spGradientSample, int sampleSize )
+{
+	int *nnzHostPtr = &spGradientSample->nnz;
+	int baseC = 0;
+
+	cusparseCheckError(
+		cusparseSetPointerMode( cusparseHandle, CUSPARSE_POINTER_MODE_HOST) );
+
+	cusparseCheckError (
+		cusparseXcsrgemmNnz( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
+			//sampleSize, cols, sampleSize,
+			sampleSize, cols, rows,
+			spSampler->descr, spSampler->nnz, spSampler->rowCsrPtr, spSampler->colPtr,
+			spDataset->descr, spDataset->nnz, spDataset->rowCsrPtr, spDataset->colPtr,
+			spGradientSample->descr, spGradientSample->rowCsrPtr, nnzHostPtr
+			) );
+
+	// nnzHostPtr aliases spGradientSample->nnz and is never NULL here, so the
+	// first branch always runs; the else branch is the usual cuSPARSE fallback
+	// of reading the total from the last entry of the CSR row pointer.
+	if (nnzHostPtr != NULL){
+		spGradientSample->nnz = *nnzHostPtr;
+	} else {
+		cudaMemcpy( &spGradientSample->nnz, spGradientSample->rowCsrPtr + sampleSize, sizeof(int),
+			cudaMemcpyDeviceToHost );
+		cudaMemcpy( &baseC, spGradientSample->rowCsrPtr, sizeof(int), cudaMemcpyDeviceToHost );
+
+		spGradientSample->nnz -= baseC;
+	}
+
+	cusparseCheckError (
+		cusparseDcsrgemm( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
+			//sampleSize, cols, sampleSize,
+			sampleSize, cols, rows,
+			spSampler->descr, spSampler->nnz, spSampler->sortedVals, spSampler->rowCsrPtr, spSampler->colPtr,
+			spDataset->descr, spDataset->nnz, spDataset->sortedVals, spDataset->rowCsrPtr, spDataset->colPtr,
+			spGradientSample->descr, spGradientSample->sortedVals,
+			spGradientSample->rowCsrPtr, spGradientSample->colPtr ) );
+}
diff --git a/code/cuda/RC-FINAL-5/subsampling_helpers.h b/code/cuda/RC-FINAL-5/subsampling_helpers.h
new file mode 100644
index 0000000..8f67208
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/subsampling_helpers.h
@@ -0,0 +1,20 @@
+#ifndef __SUB_SAMPLING_HELPERS_H__
+#define __SUB_SAMPLING_HELPERS_H__
+
+#include "dataset.h"
+#include "cuda_types.h"
+
+void initSubSampledHessian( int offset, int rows, SparseDataset *spSampledHessian, real *sampledLabels, real *srcLabels, int sampledSize );
+void prepareForNonUniformSampling (SparseDataset *samplingMat, int sampleSize, int *indices) ;
+
+
+void prepareForSampling (SparseDataset *sampledHessian, real *sampledLabels, real *srcLabels, int rows, int sampleSize, int *hostPtr);
+void sampleDataset( SparseDataset *spSampledHessian, real *dataset,
+			int rows, int cols, int num_classes,
+			real *subSampledHessian, int sampleSize );
+
+void sampleSparseDataset ( SparseDataset *spSampler, SparseDataset *spDataset,
+				int rows, int cols, int num_classes,
+				SparseDataset *spGradientSample, int sampleSize );
+
+#endif
diff --git a/code/cuda/RC-FINAL-5/utils.c
b/code/cuda/RC-FINAL-5/utils.c
new file mode 100644
index 0000000..0d64302
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/utils.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include "utils.h"
+
+void allocate_memory( void **ptr, size_t s )
+{
+	*ptr = malloc( s );
+	if (*ptr == NULL){
+		fprintf( stderr, "Memory Allocation failed for size: %zu\n", s );
+	}
+}
+
+void release_memory( void **ptr ){
+	free ( *ptr );
+}
+
+real Get_Time( )
+{
+	struct timeval tim;
+
+	gettimeofday(&tim, NULL );
+	return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
+}
+
+
+real Get_Timing_Info( real t_start )
+{
+	struct timeval tim;
+	real t_end;
+
+	gettimeofday(&tim, NULL );
+	t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
+	return (t_end - t_start);
+}
diff --git a/code/cuda/RC-FINAL-5/utils.h b/code/cuda/RC-FINAL-5/utils.h
new file mode 100644
index 0000000..425a783
--- /dev/null
+++ b/code/cuda/RC-FINAL-5/utils.h
@@ -0,0 +1,13 @@
+#ifndef _H_UTILS__
+#define _H_UTILS__
+
+#include <stdlib.h>
+#include "cuda_types.h"	/* assumed source of the `real` typedef */
+
+void allocate_memory( void **, size_t);
+void release_memory( void ** );
+
+
+real Get_Time ();
+real Get_Timing_Info( real t_start );
+#endif
diff --git a/code/tensorflow/cifar/tf_softmax.py b/code/tensorflow/cifar/tf_softmax.py
new file mode 100644
index 0000000..22d86cd
--- /dev/null
+++ b/code/tensorflow/cifar/tf_softmax.py
@@ -0,0 +1,306 @@
+from __future__ import print_function
+
+import sys
+import math
+import tensorflow as tf
+import numpy as np
+
+import cPickle as pickle
+#import pickle
+import time
+import StringIO
+
+import scipy.sparse as sparse
+
+trainmat = 'train_mat.txt'
+trainvec = 'train_vec.txt'
+testmat = 'test_mat.txt'
+testvec = 'test_vec.txt'
+
+curpath = sys.argv[2]
+
+#load the data here.
+X_train = np.loadtxt(curpath + trainmat ,delimiter=',')
+X_train = X_train.astype(np.float64)
+Y_train = np.loadtxt(curpath + trainvec ,delimiter=',')
+Y_train = Y_train.astype(np.float64)
+
+# Test
+X_test = np.loadtxt(curpath + testmat, delimiter=',')
+X_test = X_test.astype(np.float64)
+Y_test = np.loadtxt(curpath + testvec, delimiter=',')
+Y_test = Y_test.astype(np.float64)
+
+# Convert to the usable format here.
+print ("Done loading data..... 
") +print (X_train.shape ) +print (Y_train.shape ) +print (X_test.shape ) +print (Y_test.shape ) + +print (len(X_train[0])) +print (len(X_train)) + + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) + +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. 
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes])
+
+#W= tf.Variable(tf.random_normal([n_input, n_classes-1]))
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64)
+
+## (y==c)*e^
+pre_temp= tf.multiply(T,tf.exp(Matrix_concat))
+
+## 1+sigma(e^)
+pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) )
+
+## probability of the last (reference) class ##
+pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1)
+
+## get the first n_classes-1 class probabilities
+pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1])
+
+## concat them with the last class probability
+pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1)
+pred_labels_tf = tf.argmax(pred,1);
+
+
+## Our cost function
+cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) )
+
+## Regularization Term Here.
+#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+
+prefix = ''
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1)
+    prefix += 'GPU'
+else:
+    config = tf.ConfigProto(device_count={'GPU': 0} )
+    prefix += 'CPU'
+
+
+# Parameters
+training_epochs = 100
+display_step = 1
+index = 1
+
+# Parameters
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-12
+    # this dataset is normalized from the source
+    ll = [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    #rterm = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4 ]
+    #rterm = [1e6]
+    rterm = [1]
+    prefix += '_raw_'
+
+else:
+    #lipschitz constant is 1e-3
+    ll = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4 ]
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    #rterm = [1e-6]
+    #rterm = [1e-4]
+    rterm = [1e-3]
+    prefix += '_norm_'
+
+for lmethod in [ sys.argv[1] ]:
+
+    for r in rterm:
+
+        final_cost = cost + r * regularization
+
+        for learning_rate in ll:
+
+            outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + + if True: + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + #", cost=","{:.9f}".format(avg_cost), \ + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + #pred_labels = sess.run( pred_labels_tf, feed_dict={x: X_test, y: Y_test} ) + + #c = [0, 0, 0, 0, 0, 0, 0] + #for i in range(0, len(pred_labels)): + # c[ pred_labels[i] ] += 1 + + #for i in range(0, len(c)): + # print ("Class: %d --- > %d" % (i, c[ i ]) ) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/covertype/tf_softmax.py b/code/tensorflow/covertype/tf_softmax.py new file mode 100644 index 0000000..b2d4aa6 --- /dev/null +++ b/code/tensorflow/covertype/tf_softmax.py @@ -0,0 +1,265 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_forest_multi_features.txt' +trainvec = 'train_forest_multi_labels.txt' +testmat = 'test_forest_multi_features.txt' +testvec = 'test_forest_multi_labels.txt' + +curpath = sys.argv[2] + +#load the data here. +X_train = np.loadtxt(curpath + trainmat ,delimiter=',') +X_train = X_train.astype(np.float64) +Y_train = np.loadtxt(curpath + trainvec ,delimiter=',') +Y_train = Y_train.astype(np.float64) + +# Test +X_test = np.loadtxt(curpath + testmat, delimiter=',') +X_test = X_test.astype(np.float64) +Y_test = np.loadtxt(curpath + testvec, delimiter=',') +Y_teset = Y_test.astype(np.float64) + +print ("Done loading data..... ") + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1])) + +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. 
+x = tf.placeholder("float64", [None, n_input]) +y = tf.placeholder("float64", [None, n_classes]) + +#W= tf.Variable(tf.random_normal([n_input, n_classes-1])) +W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64) + +Matrix_Mul= tf.matmul(x,W) +Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64) +Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1) +Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1) +Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) ) + +T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64) + +## (y==c)*e^ +pre_temp= tf.multiply(T,tf.exp(Matrix_concat)) + +## 1+sigma(e^) +pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) ) + +## last-th class prob## +pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1) + +## get the first 6 classes prob +pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1]) + +## concat 6 classes prob with last class prob +pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1) +pred_labels_tf = tf.argmax(pred,1); + + +## Our cost function +cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) ) + +## Regularization Term Here. +#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.) +#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.) +regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) + +prefix = '' + +if sys.argv[4] == 'GPU': + config = tf.ConfigProto( intra_op_parallelism_threads=1 ); + prefix = 'GPU' + #config = tf.ConfigProto( ); +else: + config = tf.ConfigProto( device_count={'GPU': 0}) + prefix = 'CPU' + + +# Parameters +training_epochs = 100 +display_step = 1 +index = 1 + +# Parameters +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + + +if sys.argv[2].find("raw-data") != -1: + #lipschitz constant is 1e-13 + # this dataset is normalized from the source + ll = [1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2] + #rterm = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7] + #rterm = [1e-6] + rterm = [1] + prefix += '_raw_' + +else: + #lipschitz constant is 1.923 + ll = [1e-8, 1e-7,1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8 ] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + #rterm = [1e-8] + rterm = [1e-3] + + prefix += '_norm_' + + +for lmethod in [ sys.argv[1] ]: + for r in rterm: + final_cost = cost + r * regularization + for learning_rate in ll: + ''' + if sys.argv[2].find("raw-data") != -1: + outfile = open("raw_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + else: + outfile = open("norm_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + ''' + outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + outfile.write("Begin simulation 
...... \n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + sess.run(init) + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + outfile.close () diff --git a/code/tensorflow/diagnostics/tf_softmax.py b/code/tensorflow/diagnostics/tf_softmax.py new file mode 100644 index 0000000..fc5e837 --- /dev/null +++ b/code/tensorflow/diagnostics/tf_softmax.py @@ -0,0 +1,303 @@ +from __future__ import print_function + +import sys +import tensorflow as tf +import numpy as np +import math + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + +#load the data here. 
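+# (expected on disk: comma-separated dense matrices with one sample per row;
+#  the label files are assumed to hold 1-based class ids, 1..11 for this dataset)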
+X_train = np.loadtxt(curpath + trainmat ,delimiter=',')
+X_train = X_train.astype(np.float64)
+Y_train = np.loadtxt(curpath + trainvec ,delimiter=',')
+Y_train = Y_train.astype(np.float64)
+
+# Test
+X_test = np.loadtxt(curpath + testmat, delimiter=',')
+X_test = X_test.astype(np.float64)
+Y_test = np.loadtxt(curpath + testvec, delimiter=',')
+Y_test = Y_test.astype(np.float64)
+
+print ("Done loading data..... ")
+
+
+# fix random seed for reproducibility
+seed = 7
+np.random.seed(seed)
+
+# one-hot encode the labels (classes are 1..11, so class l maps to row l-1 of the identity)
+eye = np.eye(11, dtype=np.float64)
+Y_train = eye[Y_train.astype(int) - 1]
+Y_test = eye[Y_test.astype(int) - 1]
+
+
+
+## get BATCH_SIZE data points ##
+def get_batch(X,Y):
+    idx = np.random.randint(len(X), size=batch_size)
+    batch_X= X[idx,:]
+    batch_Y = Y[idx]
+    return (batch_X,batch_Y)
+
+# Network Parameters
+n_input = X_train.shape[1]
+n_classes = len( Y_train[0] )
+
+## select specific element by index for each row
+def sel_ele_2d(a,b):
+    b= tf.cast(b, tf.int32)
+    b_2 = tf.expand_dims(b, 1)
+    the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1)
+    ind = tf.concat([the_range, b_2],1)
+    res = tf.gather_nd(a, ind)
+    return res
+
+
+# Create the network, tf variables and cost function here. 
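+# sel_ele_2d (defined above) builds [row, label] index pairs and uses
+# tf.gather_nd to pull one entry per row; the cost below uses it to select
+# each sample's true-class logit from Matrix_concat.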
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes])
+
+#W= tf.Variable(tf.random_normal([n_input, n_classes-1]))
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64)
+
+## (y==c)*e^
+pre_temp= tf.multiply(T,tf.exp(Matrix_concat))
+
+## 1+sigma(e^)
+pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) )
+
+## probability of the last (reference) class ##
+pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1)
+
+## get the first n_classes-1 class probabilities
+pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1])
+
+## concat them with the last class probability
+pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1)
+pred_labels_tf = tf.argmax(pred,1);
+
+
+## Our cost function
+cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) )
+
+## Regularization Term Here.
+#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+
+prefix = ''
+
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1 );
+    prefix = 'GPU'
+    #config = tf.ConfigProto( );
+else:
+    config = tf.ConfigProto( device_count={'GPU': 0})
+    prefix = 'CPU'
+
+
+# Parameters
+training_epochs = 100
+display_step = 1
+index = 0
+
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-7
+    ll = [1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
+    #rlist = [1e-4]
+    rlist = [1]
+    prefix += '_raw_'
+else:
+    #lipschitz constant is 1e-1
+    ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
+    #rlist = [1e-7]
+    #rlist = [1e-6]
+    rlist = [1e-3]
+    prefix += '_norm_'
+
+
+for lmethod in [ sys.argv[1] ]:
+    for r in rlist:
+
+        final_cost = cost + r * regularization
+
+        for learning_rate in ll:
+
+            outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+            outfile.write("Normalization: " + sys.argv[2] + "\n")
+            outfile.write("Batch Size: "+ str( batch_size ) + "\n")
+
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + + if True: + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + #", cost=","{:.9f}".format(avg_cost), \ + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + #pred_labels = sess.run( pred_labels_tf, feed_dict={x: X_test, y: Y_test} ) + + #c = [0, 0, 0, 0, 0, 0, 0] + #for i in range(0, len(pred_labels)): + # c[ pred_labels[i] ] += 1 + + #for i in range(0, len(c)): + # print ("Class: %d --- > %d" % (i, c[ i ]) ) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/gisette/tf_logistic.py b/code/tensorflow/gisette/tf_logistic.py new file mode 100644 index 0000000..21b79b8 --- /dev/null +++ b/code/tensorflow/gisette/tf_logistic.py @@ -0,0 +1,251 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +trainmat = 'gisette_train.data' +trainvec = 'gisette_train.labels01' +testmat = 'gisette_valid.data' +testvec = 'gisette_valid.labels01' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + + +#load the data here. + +# Test +if sys.argv[2].find("raw-data") != -1: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=' ', dtype=None) + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=' ', dtype=None) + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.genfromtxt( curpath + testmat, delimiter=' ', dtype=None) + X_test = X_test.astype(np.float64) + Y_test = np.genfromtxt( curpath + testvec, delimiter=' ', dtype=None) + Y_test = Y_test.astype(np.float64) + Y_test = Y_test.reshape( len( Y_test ), 1 ); +else: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=',') + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',') + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.loadtxt( curpath + testmat, delimiter=',') + X_test = X_test.astype(np.float64) + Y_test = np.loadtxt( curpath + testvec, delimiter=',') + Y_test = Y_test.astype(np.float64) + Y_test = Y_test.reshape( len( Y_test ), 1 ); + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +print () +print () +print(X_train.shape) +print(Y_train.shape) +print(X_test.shape) +print(Y_test.shape) +print () +print () + + + +#for label in y: + +# if (label==0): +# Y_one_hot.append(np.array([1,0])) +# elif (label==1): +# Y_one_hot.append(np.array([0,1])) +#Y_one_hot = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = 2 + +# Create the network, tf variables and cost function here. +x = tf.placeholder("float64", [None, n_input]) +y = tf.placeholder("float64", [None, n_classes - 1]) + +W= tf.Variable(tf.zeros([n_input, n_classes-1],dtype=tf.float64), dtype=tf.float64) + +Matrix_Mul= tf.matmul(x,W) +pred = tf.sigmoid(tf.matmul(x, W)) # predictions +scores = tf.matmul( x, W ); + +## to prevent overfitting ### +Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64) +Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1) +Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1) +Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) ) + +# Minimize error using cross entropy + + +## changed to this to prevent overflow +cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(y, scores))) + +## Regularization Term Here. 
+regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) +prefix = '' +if sys.argv[4] == 'GPU': + config = tf.ConfigProto( intra_op_parallelism_threads=1) + prefix += 'GPU' +else: + config = tf.ConfigProto(device_count={'GPU': 0} ) + prefix += 'CPU' + + +# Parameters +training_epochs = 100 +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + +display_step = 1 +index = 1 + + + +if sys.argv[2].find("raw-data") != -1: + #lipschitz constant is 1e-12 + ll = [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6] # based on lipschitz constant + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6] + #rterm = [1e3] + rterm = [1] + prefix += '_raw_' +else: + #lipschitz constant is 1e-3 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + #rterm = [1e-2] + rterm = [1e-3] + prefix += '_norm_' + + +for lmethod in [ sys.argv[1] ]: + + for r in rterm: + + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + + outfile.write("Begin simulation ...... \n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. 
+ total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/ijcnn1/tf_logistic.py b/code/tensorflow/ijcnn1/tf_logistic.py new file mode 100644 index 0000000..b61c6ab --- /dev/null +++ b/code/tensorflow/ijcnn1/tf_logistic.py @@ -0,0 +1,244 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np +import csv + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + + +# Test +if sys.argv[2].find("raw-data") != -1: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=',',dtype=None ) + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',',dtype=None) + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.genfromtxt( curpath + testmat, delimiter=',') + X_test = X_test.astype(np.float64) + Y_test = np.genfromtxt( curpath + testvec, delimiter=',') + Y_test = Y_test.astype(np.float64) + Y_test = Y_test.reshape( len( Y_test ), 1 ); +else: + X_train = np.genfromtxt( curpath + trainmat ,delimiter=',') + X_train = X_train.astype(np.float64) + Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',') + Y_train = Y_train.astype(np.float64) + Y_train = Y_train.reshape( len(Y_train), 1); + X_test = np.loadtxt( curpath + testmat, delimiter=',') + Y_test = np.loadtxt( curpath + testvec, delimiter=',') + Y_test = Y_test.reshape( len( Y_test ), 1 ); + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +print () +print () +print(X_train.shape) +print(Y_train.shape) +print(X_test.shape) +print(Y_test.shape) +print () + + + +#for label in y: + +# if (label==0): +# Y_one_hot.append(np.array([1,0])) +# elif (label==1): +# Y_one_hot.append(np.array([0,1])) +#Y_one_hot = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return 
(batch_X,batch_Y)
+
+# Network Parameters
+n_input = X_train.shape[1]
+n_classes = 2
+
+# Create the network, tf variables and cost function here.
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes - 1])
+
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+pred = tf.sigmoid(tf.matmul(x, W)) # predictions
+scores = tf.matmul( x, W );
+
+## max-shift to prevent overflow ###
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+# Minimize error using cross entropy
+
+
+## changed to this to prevent overflow
+## (per sample: log(1 + exp(score)) - y*score, computed via the max-shift Mx)
+cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(y, scores)))
+
+## Regularization Term Here.
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+#config = tf.ConfigProto( device_count={'GPU': 0})
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1 );
+    #config = tf.ConfigProto( );
+else:
+    config = tf.ConfigProto( device_count={'GPU': 0})
+
+
+# Parameters
+training_epochs = 100
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+display_step = 1
+index = 1
+
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-4
+    ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2] # based on lipschitz constant
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6]
+    #rterm = [1e-2]
+    rterm = [1e-6]
+else:
+    #lipschitz constant is 10
+    ll = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6]
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    rterm = [1e-7]
+
+
+for lmethod in [ sys.argv[1] ]:
+
+    for r in rterm:
+
+        final_cost = cost + r * regularization
+
+        for learning_rate in ll:
+
+            if sys.argv[2].find("raw-data") != -1:
+                outfile = open("raw_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            else:
+                outfile = open("norm_" + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+            outfile.write("Path: " + sys.argv[2] + "\n")
+            outfile.write("BatchSize: " + str(batch_size) + "\n");
+
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(y,1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/mnist/tf_softmax.py b/code/tensorflow/mnist/tf_softmax.py new file mode 100644 index 0000000..559be11 --- /dev/null +++ b/code/tensorflow/mnist/tf_softmax.py @@ -0,0 +1,274 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +#load the data here. +X_train = np.loadtxt(curpath + trainmat ,delimiter=',') +X_train = X_train.astype(np.float64) +Y_train = np.loadtxt(curpath + trainvec ,delimiter=',') +Y_train = Y_train.astype(np.float64) + +Y_train += 1 + +# Test +X_test = np.loadtxt(curpath + testmat, delimiter=',') +X_test = X_test.astype(np.float64) +Y_test = np.loadtxt(curpath + testvec, delimiter=',') +Y_test = Y_test.astype(np.float64) + +Y_test += 1 + +print ("Done loading data..... ") + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1])) +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. 
+x = tf.placeholder("float64", [None, n_input])
+y = tf.placeholder("float64", [None, n_classes])
+
+#W= tf.Variable(tf.random_normal([n_input, n_classes-1]))
+W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+Matrix_Mul= tf.matmul(x,W)
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64)
+Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1)
+Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) )
+
+T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64)
+
+## (y==c)*e^
+pre_temp= tf.multiply(T,tf.exp(Matrix_concat))
+
+## 1+sigma(e^)
+pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) )
+
+## probability of the last (reference) class ##
+pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1)
+
+## get the first n_classes-1 class probabilities
+pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1])
+
+## concat them with the last class probability
+pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1)
+pred_labels_tf = tf.argmax(pred,1);
+
+
+## Our cost function
+cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(y,1) ),1) )
+
+## Regularization Term Here.
+#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.)
+regularization = tf.nn.l2_loss(W)
+
+
+# Tensorflow built in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+# config = tf.ConfigProto( intra_op_parallelism_threads=1, device_count={'GPU': 1})
+
+prefix = ''
+if sys.argv[4] == 'GPU':
+    config = tf.ConfigProto( intra_op_parallelism_threads=1)
+    prefix += 'GPU'
+else:
+    config = tf.ConfigProto(device_count={'GPU': 0} )
+    prefix += 'CPU'
+
+
+# Parameters
+training_epochs = 100
+display_step = 1
+index = 1
+
+# Parameters
+if sys.argv[3] == 'fixed' :
+    batch_size = 128
+else:
+    batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+
+if sys.argv[2].find("raw-data") != -1:
+    #lipschitz constant is 1e-11
+    # this dataset is normalized from the source
+    ll = [1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e1, 1e2, 1e3 ]
+    #rterm = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
+    #rterm = [1e1]
+    #rterm = [1e2]
+    rterm = [1]
+    prefix += '_raw_'
+
+else:
+    #lipschitz constant is 1e-2
+    ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6 ]
+    #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    #rterm = [1e-4]
+    rterm = [1e-3]
+    prefix += '_norm_'
+
+
+
+for lmethod in [ sys.argv[1] ]:
+    for r in rterm:
+        final_cost = cost + r * regularization
+        for learning_rate in ll:
+            outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0)
+            index += 1
+            outfile.write("------------------------------------------\n")
+            outfile.write("Method: " + lmethod + "\n")
+            outfile.write("Step Length: " + str(learning_rate) + "\n")
+            outfile.write("Regularization: " + str(r) + "\n")
+            outfile.write("Path: " + sys.argv[2] + "\n")
+            outfile.write("BatchSize: " + str(batch_size) + "\n");
+            outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + sess.run(init) + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(total_batch): + batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x,y: batch_y}) + # Compute average loss + avg_cost += c + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(y,1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + outfile.close () diff --git a/code/tensorflow/newsgroups/tf_softmax.py b/code/tensorflow/newsgroups/tf_softmax.py new file mode 100644 index 0000000..24cdf15 --- /dev/null +++ b/code/tensorflow/newsgroups/tf_softmax.py @@ -0,0 +1,433 @@ +from __future__ import print_function + +import sys +import tensorflow as tf +import numpy as np + +import cPickle as pickle +import time +import StringIO + +import scipy.sparse as sparse +import math + +def getShape( entries ): + + mrow = 0 + mcol = 0 + + for item in entries: + if mrow < item[0]: + mrow = item[0] + if mcol < item[1]: + mcol = item[1] + return mrow, mcol + +def getDenseMatrix(entries, rows, cols): + rowIdx = np.empty([len(entries)], dtype=int) + colIdx = np.empty([len(entries)], dtype=int) + val = np.empty([len(entries)], dtype=np.float64) + + for idx,item in enumerate(entries): + rowIdx[ idx ] = item[0] - 1 + colIdx[ idx ] = item[1] - 1 + val[ idx ] = item[2] + + return sparse.csr_matrix( (val, (rowIdx, colIdx)), shape=(rows, cols) ).toarray () + + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + + +#load the data here. +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + + +X_train = np.loadtxt(curpath + trainmat ,delimiter=',') +X_train = X_train.astype(np.float64) +Y_train = np.loadtxt(curpath + trainvec ,delimiter=',') +Y_train = Y_train.astype(np.float64) + +# Test +X_test = np.loadtxt(curpath + testmat, delimiter=',') +X_test = X_test.astype(np.float64) +Y_test = np.loadtxt(curpath + testvec, delimiter=',') +Y_test = Y_test.astype(np.float64) + +# Convert to the usable format here. + +x_row, x_col = getShape( X_train ) +y_row, y_col = getShape( X_test ) + +shapey = 0 + +if x_col < y_col: + shapey = y_col +else: + shapey = x_col + +print ("Done loading data..... 
") +print (X_train.shape) +print (Y_train.shape) +print (X_test.shape) +print (Y_test.shape) +print (Y_test) +print (Y_train) + + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +Y_one_hot=[] +for l in Y_train: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0])) + if (l == 11): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0])) + elif (l ==12): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0])) + elif (l ==13): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0])) + elif (l ==14): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0])) + elif (l ==15): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0])) + elif (l ==16): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0])) + elif (l ==17): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0])) + elif (l ==18): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0])) + elif (l ==19): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0])) + elif (l ==20): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1])) +Y_train = np.array(Y_one_hot) + +Y_one_hot=[] +for l in Y_test: + if (l == 1): + Y_one_hot.append(np.array([1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==2): + Y_one_hot.append(np.array([0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==3): + Y_one_hot.append(np.array([0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==4): + Y_one_hot.append(np.array([0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==5): + Y_one_hot.append(np.array([0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==6): + Y_one_hot.append(np.array([0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==7): + Y_one_hot.append(np.array([0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==8): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==9): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0])) + elif (l ==10): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0])) + if (l == 11): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0])) + elif (l ==12): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0])) + elif (l ==13): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0])) + elif (l ==14): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0])) + elif (l ==15): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0])) + elif (l ==16): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0])) + elif (l ==17): + 
Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0])) + elif (l ==18): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0])) + elif (l ==19): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0])) + elif (l ==20): + Y_one_hot.append(np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1])) +Y_test = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) +## shuffle data points ## +def shuffle(X,Y): + idx = np.random.randint(len(X), size=len(X)) + X = X[idx,:] + Y = Y[idx] + return (X,Y) + +X_train = getDenseMatrix( X_train, x_row, shapey ) +X_test = getDenseMatrix( X_test, y_row, shapey ) + +X_train,Y_train=shuffle(X_train,Y_train) + +#import pdb; pdb.set_trace(); +# Network Parameters +n_input = X_train.shape[1] +n_classes = len( Y_train[0] ) + +## select specific element by index for each row +def sel_ele_2d(a,b): + b= tf.cast(b, tf.int32) + b_2 = tf.expand_dims(b, 1) + the_range = tf.expand_dims(tf.range(tf.shape(b)[0]), 1) + ind = tf.concat([the_range, b_2],1) + res = tf.gather_nd(a, ind) + return res + + +# Create the network, tf variables and cost function here. +#x = tf.placeholder("float", [None, n_input]) +#y = tf.placeholder("float", [None, n_classes]) + +x=tf.sparse_placeholder(tf.float64) +y=tf.sparse_placeholder(tf.float64) +W= tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64) + +#Matrix_Mul= tf.matmul(x,W) +Matrix_Mul= tf.sparse_tensor_dense_matmul(x,W) +Zeros = tf.zeros([ tf.shape(x)[0], 1 ],tf.float64) +Matrix_concat= tf.concat([Matrix_Mul, Zeros], 1) +Mx =tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]) ,1) +Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx) ,1),1) ) + +T= tf.one_hot((n_classes-1)*tf.ones([tf.shape(x)[0]],tf.int64) ,depth=n_classes,on_value=np.float64(0.0),off_value=np.float64(1.0),dtype=tf.float64) + +## (y==c)*e^ +pre_temp= tf.multiply(T,tf.exp(Matrix_concat)) + +## 1+sigma(e^) +pre_temp1 = tf.add( np.float64(1.0),tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul),1 ),1) ) + +## last-th class prob## +pre_temp2 = np.float64(1.0) - tf.reduce_sum(tf.div(pre_temp,pre_temp1),1) + +## get the first 6 classes prob +pre_temp3 = tf.slice( tf.div(pre_temp,pre_temp1) ,[0,0],[tf.shape(x)[0],n_classes-1]) + +## concat 6 classes prob with last class prob +pred = tf.concat([pre_temp3,tf.expand_dims(pre_temp2,1)],1) +pred_labels_tf = tf.argmax(pred,1); + + +## Our cost function +cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(tf.sparse_tensor_to_dense(y),1) ),1) ) + +## Regularization Term Here. +#regularization = (float(sys.argv[2]) / 2.0) * tf.pow(tf.norm( W, ord='euclidean' ), 2.) +#regularization = tf.pow(tf.norm( W, ord='euclidean' ), 2.) 
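+# Note on the objective: with per-class scores s_c = x.w_c and the last
+# class pinned to s_K = 0 (the appended Zeros column), each summand of
+# `cost` is logsumexp_c(s_c) - s_{true class}, i.e. multinomial logistic
+# loss, with Mx as the max trick that keeps the exponentials from
+# overflowing. A sketch of an assumed-equivalent built-in form, shown
+# only for clarity and not used by the runs below:
+#   logits = tf.concat([Matrix_Mul, Zeros], 1)
+#   cost_equiv = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
+#       logits=logits, labels=tf.sparse_tensor_to_dense(y)))
+# tf.nn.l2_loss(W) returns sum(W**2)/2, so the swept objective
+# final_cost = cost + r * regularization is cost + (r/2)*||W||^2.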
+regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) + +prefix = '' +if sys.argv[4] == 'GPU': + config = tf.ConfigProto( intra_op_parallelism_threads=1) + prefix += 'GPU' +else: + config = tf.ConfigProto(device_count={'GPU': 0} ) + prefix += 'CPU' + + +# Parameters +training_epochs = 100 +display_step = 1 +index = 0 + +# Parameters +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + + +with tf.Session() as sess: + ### creating sparse_tensor for test x, test y ### + + x_t= tf.placeholder("float64", [None, n_input]) + #x_t = tf.constant(X_test) + idx_x = tf.where(tf.not_equal(x_t, 0)) + sparse_Test_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t, idx_x), tf.cast(tf.shape(x_t),tf.int64)) + + #sparse_X_test=sess.run([sparse_Test_x],feed_dict={x_t:X_test}) + #import pdb;pdb.set_trace(); + y_t= tf.placeholder("float64", [None, n_classes]) + + #y_t = tf.constant(Y_test) + idx_y = tf.where(tf.not_equal(y_t, 0)) + sparse_Test_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t, idx_y),tf.cast(tf.shape(y_t),tf.int64)) + ### creating batch list ### + x_t2 = tf.placeholder("float64", [None, n_input]) #tf.constant(X_train) + idx_x = tf.where(tf.not_equal(x_t2, 0)) + sparse_Train_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t2, idx_x), tf.cast(tf.shape(x_t2),tf.int64) ) + sparse_Train_x_list=tf.sparse_split(sp_input=sparse_Train_x,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + y_t2 = tf.placeholder("float64", [None, n_classes]) #tf.constant(Y_train) + idx_y = tf.where(tf.not_equal(y_t2, 0)) + sparse_Train_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t2, idx_y), tf.cast(tf.shape(y_t2),tf.int64)) + sparse_Train_y_list=tf.sparse_split(sp_input=sparse_Train_y,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + + X_test,Y_test,X_train,Y_train,batch_x_list,batch_y_list = sess.run([sparse_Test_x,sparse_Test_y,sparse_Train_x,sparse_Train_y,sparse_Train_x_list,sparse_Train_y_list ],feed_dict={x_t: X_test, y_t: Y_test,x_t2: X_train,y_t2: Y_train}) + + +if sys.argv[2].find("raw-data") != -1: + #raw + ll = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2 ] + #rlist = [1e-3] + rlist = [1] + prefix += '_raw_' +else: + #normalized + ll = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2 ] + #rlist = [1e-1] + rlist = [1e-3] + prefix += '_norm_' + #print ("This is NOT Working at the moment..... ") + #exit () + +for lmethod in [ sys.argv[1] ]: + for r in rlist: + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(prefix + lmethod +"_" + str(index) + "_readings.txt", "w", 0) + index += 1 + + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Normalization: " + sys.argv[2] +"\n") + outfile.write("Batch Size: "+ str(batch_size) +"\n") + + outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + + if True: + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(tf.sparse_tensor_to_dense(y),1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. 
+ total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(len(batch_x_list)): + #batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x_list[i],y: batch_y_list[i]}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + correct_prediction = tf.equal( tf.argmax(pred,1), tf.argmax(tf.sparse_tensor_to_dense(y),1) ) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float64")) + #", cost=","{:.9f}".format(avg_cost), \ + outfile.write("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.2f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + #pred_labels = sess.run( pred_labels_tf, feed_dict={x: X_test, y: Y_test} ) + + #c = [0, 0, 0, 0, 0, 0, 0] + #for i in range(0, len(pred_labels)): + # c[ pred_labels[i] ] += 1 + + #for i in range(0, len(c)): + # print ("Class: %d --- > %d" % (i, c[ i ]) ) + + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/rcv1/tf_logistic.py b/code/tensorflow/rcv1/tf_logistic.py new file mode 100644 index 0000000..f710191 --- /dev/null +++ b/code/tensorflow/rcv1/tf_logistic.py @@ -0,0 +1,316 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import scipy.sparse as sparse + +import cPickle as pickle +import time +import StringIO + +def getShape( entries ): + mrow = 0 + mcol = 0 + for item in entries: + if mrow < item[0]: + mrow = item[0] + if mcol < item[1]: + mcol = item[1] + return mrow, mcol + +def getDenseMatrix(entries, rows, cols): + rowIdx = np.empty([len(entries)], dtype=int) + colIdx = np.empty([len(entries)], dtype=int) + val = np.empty([len(entries)], dtype=float) + + print( rows ); + print( cols ); + + for idx,item in enumerate(entries): + rowIdx[ idx ] = item[0] - 1 + colIdx[ idx ] = item[1] - 1 + val[ idx ] = item[2] + return sparse.csr_matrix( (val, (rowIdx, colIdx)), shape=(rows, cols) ).toarray () + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + +#load the data here. +X_train = np.genfromtxt( curpath + trainmat ,delimiter=',', dtype=None) +Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',', dtype=None) +Y_train = Y_train.astype(np.float64) +Y_train = Y_train.reshape( len(Y_train), 1); + +# Test +X_test = np.genfromtxt( curpath + testmat, delimiter=',', dtype=None) +Y_test = np.genfromtxt( curpath + testvec, delimiter=',', dtype=None) +Y_test = Y_test.astype(np.float64) +Y_test = Y_test.reshape( len( Y_test ), 1 ); + +# Convert to the usable format here. 
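+# The *_mat.txt files hold COO triplets "row,col,value" with 1-based
+# indices: getShape scans for the largest row/col, and getDenseMatrix
+# shifts to 0-based before building a CSR matrix and densifying it.
+# E.g. a line "3,7,0.5" lands at dense position [2,6].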
+
+x_row, x_col = getShape( X_train )
+y_row, y_col = getShape( X_test )
+
+shapey = 0
+if x_col < y_col:
+ shapey = y_col
+else:
+ shapey = x_col
+
+X_train = getDenseMatrix( X_train, x_row, shapey )
+X_test = getDenseMatrix( X_test, y_row, shapey )
+
+# fix random seed for reproducibility
+seed = 7
+np.random.seed(seed)
+
+print ()
+print ()
+print(X_train.shape)
+print(Y_train.shape)
+print(X_test.shape)
+print(Y_test.shape)
+print ()
+print ()
+
+## get BATCH_SIZE data points ##
+def get_batch(X,Y):
+ idx = np.random.randint(len(X), size=batch_size)
+ batch_X = X[idx,:]
+ batch_Y = Y[idx]
+ return (batch_X,batch_Y)
+
+## shuffle data points ##
+def shuffle(X,Y):
+ idx = np.random.randint(len(X), size=len(X))
+ X = X[idx,:]
+ Y = Y[idx]
+ return (X,Y)
+
+X_train,Y_train = shuffle(X_train,Y_train)
+
+# Network Parameters
+n_input = X_train.shape[1]
+n_classes = 2
+
+# Create the network, tf variables and cost function here.
+#x = tf.placeholder("float", [None, n_input])
+#y = tf.placeholder("float", [None, n_classes - 1])
+
+x = tf.sparse_placeholder(tf.float32)
+y = tf.sparse_placeholder(tf.float32)
+
+W = tf.Variable(tf.zeros([n_input, n_classes-1]))
+
+# single score column s = x.w; reuse it for the sigmoid predictions
+Matrix_Mul = tf.sparse_tensor_dense_matmul(x,W)
+scores = Matrix_Mul
+pred = tf.sigmoid(scores) # predictions
+
+#Matrix_Mul = tf.matmul(x,W)
+#pred = tf.sigmoid(tf.matmul(x, W)) # predictions
+#scores = tf.matmul( x, W )
+
+## log-sum-exp max trick to prevent overflow ###
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ], tf.float32)
+Matrix_concat = tf.concat([Matrix_Mul, Zeros], 1)
+Mx = tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]), 1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx), 1), 1) )
+
+# Minimize error using cross entropy,
+# written in this form to prevent overflow
+cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(tf.sparse_tensor_to_dense(y), scores)))
+
+#cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(tf.sparse_tensor_to_dense(y),1) ),1) )
+## Regularization Term Here.
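+# tf.nn.l2_loss(W) returns sum(W**2)/2, so the swept objective below,
+# final_cost = cost + r * regularization, is cost + (r/2)*||W||^2.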
+regularization = tf.nn.l2_loss(W) + + +# Tensorflow built in cost function +#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) +config = tf.ConfigProto( device_count={'GPU': 0}) + + +# Parameters +training_epochs = 100 +if sys.argv[3] == 'fixed' : + batch_size = 128 +else: + batch_size = int (math.floor( len(X_train) * 0.2 )) + +display_step = 1 +index = 1 + + +with tf.Session() as sess: + + x_t= tf.placeholder("float", [None, n_input]) + #x_t = tf.constant(X_test) + idx_x = tf.where(tf.not_equal(x_t, 0)) + sparse_Test_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t, idx_x), tf.cast(tf.shape(x_t),tf.int64)) + + #sparse_X_test=sess.run([sparse_Test_x],feed_dict={x_t:X_test}) + #import pdb;pdb.set_trace(); + y_t= tf.placeholder("float", [None, n_classes-1]) + + #y_t = tf.constant(Y_test) + idx_y = tf.where(tf.not_equal(y_t, 0)) + sparse_Test_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t, idx_y),tf.cast(tf.shape(y_t),tf.int64)) + ### creating batch list ### + x_t2 = tf.placeholder("float", [None, n_input]) #tf.constant(X_train) + idx_x = tf.where(tf.not_equal(x_t2, 0)) + sparse_Train_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t2, idx_x), tf.cast(tf.shape(x_t2),tf.int64) ) + sparse_Train_x_list=tf.sparse_split(sp_input=sparse_Train_x,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + y_t2 = tf.placeholder("float", [None, n_classes-1]) #tf.constant(Y_train) + idx_y = tf.where(tf.not_equal(y_t2, 0)) + sparse_Train_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t2, idx_y), tf.cast(tf.shape(y_t2),tf.int64)) + sparse_Train_y_list=tf.sparse_split(sp_input=sparse_Train_y,axis=0,num_split=int (np.floor( len(X_train)/batch_size ))) + + X_test,Y_test,X_train,Y_train,batch_x_list,batch_y_list = sess.run([sparse_Test_x,sparse_Test_y,sparse_Train_x,sparse_Train_y,sparse_Train_x_list,sparse_Train_y_list ],feed_dict={x_t: X_test, y_t: Y_test,x_t2: X_train,y_t2: Y_train}) + +#import pdb; pdb.set_trace(); + + + + +if sys.argv[2].find("raw-data") != -1: + #lipschitz constant is 1e-12 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + rterm = [1e-1] +else: + #lipschitz constant is 1e-3 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + rterm = [1e-2] + + +for lmethod in [ sys.argv[1] ]: + + for r in rterm: + + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + + outfile.write("Begin simulation ...... 
\n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. + #total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(len(batch_x_list )): + #batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x_list[i],y: batch_y_list[i]}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... 
\n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close () diff --git a/code/tensorflow/real-sim/tf_logistic.py b/code/tensorflow/real-sim/tf_logistic.py new file mode 100644 index 0000000..8720f61 --- /dev/null +++ b/code/tensorflow/real-sim/tf_logistic.py @@ -0,0 +1,332 @@ +from __future__ import print_function + +import sys +import math +import tensorflow as tf +import numpy as np + +import scipy.sparse as sparse + +import cPickle as pickle +import time +import StringIO + +def getShape( entries ): + mrow = 0 + mcol = 0 + for item in entries: + if mrow < item[0]: + mrow = item[0] + if mcol < item[1]: + mcol = item[1] + return mrow, mcol + +def getDenseMatrix(entries, rows, cols): + rowIdx = np.empty([len(entries)], dtype=int) + colIdx = np.empty([len(entries)], dtype=int) + val = np.empty([len(entries)], dtype=np.float64) + + print( rows ); + print( cols ); + + for idx,item in enumerate(entries): + rowIdx[ idx ] = item[0] - 1 + colIdx[ idx ] = item[1] - 1 + val[ idx ] = item[2] + return sparse.csr_matrix( (val, (rowIdx, colIdx)), shape=(rows, cols) ).toarray () + +trainmat = 'train_mat.txt' +trainvec = 'train_vec.txt' +testmat = 'test_mat.txt' +testvec = 'test_vec.txt' + +curpath = sys.argv[2] + +print () +print (curpath) +print () +print () + +#load the data here. +X_train = np.genfromtxt( curpath + trainmat ,delimiter=',', dtype=None) +Y_train = np.genfromtxt( curpath + trainvec ,delimiter=',', dtype=None) +Y_train = Y_train.astype(np.float64) +Y_train = Y_train.reshape( len(Y_train), 1); + +# Test +X_test = np.genfromtxt( curpath + testmat, delimiter=',', dtype=None) +Y_test = np.genfromtxt( curpath + testvec, delimiter=',', dtype=None) +Y_test = Y_test.astype(np.float64) +Y_test = Y_test.reshape( len( Y_test ), 1 ); + +# Convert to the usable format here. + +x_row, x_col = getShape( X_train ) +y_row, y_col = getShape( X_test ) + +shapey = 0 + +if x_col < y_col: + shapey = y_col +else: + shapey = x_col + + +X_train = getDenseMatrix( X_train, x_row, shapey ) +X_test = getDenseMatrix( X_test, y_row, shapey ) + +# fix random seed for reproducibility +seed = 7 +np.random.seed(seed) + +print () +print () +print(X_train.shape) +print(Y_train.shape) +print(X_test.shape) +print(Y_test.shape) +print () +print () + + + +#for label in y: + +# if (label==0): +# Y_one_hot.append(np.array([1,0])) +# elif (label==1): +# Y_one_hot.append(np.array([0,1])) +#Y_one_hot = np.array(Y_one_hot) + + + +## get BATCH_SIZE data points ## +def get_batch(X,Y): + idx = np.random.randint(len(X), size=batch_size) + batch_X= X[idx,:] + batch_Y = Y[idx] + return (batch_X,batch_Y) + +## shuffle data points ## +def shuffle(X,Y): + idx = np.random.randint(len(X), size=len(X)) + X = X[idx,:] + Y = Y[idx] + return (X,Y) + +X_train,Y_train=shuffle(X_train,Y_train) + + + +# Network Parameters +n_input = X_train.shape[1] +n_classes = 2 + +# Create the network, tf variables and cost function here. 
+#x = tf.placeholder("float", [None, n_input])
+#y = tf.placeholder("float", [None, n_classes - 1])
+
+x = tf.sparse_placeholder(tf.float64)
+y = tf.sparse_placeholder(tf.float64)
+
+W = tf.Variable(tf.zeros([n_input, n_classes-1], dtype=tf.float64), dtype=tf.float64)
+
+# single score column s = x.w; reuse it for the sigmoid predictions
+Matrix_Mul = tf.sparse_tensor_dense_matmul(x,W)
+scores = Matrix_Mul
+pred = tf.sigmoid(scores) # predictions
+
+#Matrix_Mul = tf.matmul(x,W)
+#pred = tf.sigmoid(tf.matmul(x, W)) # predictions
+#scores = tf.matmul( x, W )
+
+## log-sum-exp max trick to prevent overflow ###
+Zeros = tf.zeros([ tf.shape(x)[0], 1 ], tf.float64)
+Matrix_concat = tf.concat([Matrix_Mul, Zeros], 1)
+Mx = tf.expand_dims( tf.reduce_max( Matrix_concat, reduction_indices=[1]), 1)
+Ax = tf.add( tf.exp(-Mx), tf.expand_dims( tf.reduce_sum( tf.exp(Matrix_Mul-Mx), 1), 1) )
+
+# Minimize error using cross entropy,
+# written in this form to prevent overflow
+cost = tf.reduce_sum(tf.subtract( Mx+tf.log(Ax), tf.multiply(tf.sparse_tensor_to_dense(y), scores)))
+
+#cost = tf.reduce_sum ( (Mx+tf.log(Ax)) - tf.expand_dims( sel_ele_2d( Matrix_concat , tf.argmax(tf.sparse_tensor_to_dense(y),1) ),1) )
+## Regularization Term Here.
+regularization = tf.nn.l2_loss(W)
+
+# Tensorflow built-in cost function
+#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
+prefix = ''
+if sys.argv[4] == 'GPU':
+ config = tf.ConfigProto( intra_op_parallelism_threads=1)
+ config.gpu_options.allow_growth = True
+ prefix += 'GPU'
+else:
+ config = tf.ConfigProto(device_count={'GPU': 0} )
+ prefix += 'CPU'
+
+# Parameters
+training_epochs = 100
+if sys.argv[3] == 'fixed' :
+ batch_size = 128
+else:
+ batch_size = int (math.floor( len(X_train) * 0.2 ))
+
+display_step = 1
+index = 1
+
+with tf.Session(config=config) as sess:
+ ### creating sparse tensors for the test set ###
+ x_t = tf.placeholder("float64", [None, n_input])
+ idx_x = tf.where(tf.not_equal(x_t, 0))
+ sparse_Test_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t, idx_x), tf.cast(tf.shape(x_t),tf.int64))
+
+ y_t = tf.placeholder("float64", [None, n_classes-1])
+ idx_y = tf.where(tf.not_equal(y_t, 0))
+ sparse_Test_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t, idx_y), tf.cast(tf.shape(y_t),tf.int64))
+
+ ### creating batch list ###
+ x_t2 = tf.placeholder("float64", [None, n_input])
+ idx_x = tf.where(tf.not_equal(x_t2, 0))
+ sparse_Train_x = tf.SparseTensor(idx_x, tf.gather_nd(x_t2, idx_x), tf.cast(tf.shape(x_t2),tf.int64))
+ sparse_Train_x_list = tf.sparse_split(sp_input=sparse_Train_x, axis=0, num_split=int (np.floor( len(X_train)/batch_size )))
+ y_t2 = tf.placeholder("float64", [None, n_classes-1])
+ idx_y = tf.where(tf.not_equal(y_t2, 0))
+ sparse_Train_y = tf.SparseTensor(idx_y, tf.gather_nd(y_t2, idx_y), tf.cast(tf.shape(y_t2),tf.int64))
+ sparse_Train_y_list = tf.sparse_split(sp_input=sparse_Train_y, axis=0, num_split=int (np.floor( len(X_train)/batch_size )))
+
+ X_test,Y_test,X_train,Y_train,batch_x_list,batch_y_list = sess.run([sparse_Test_x,sparse_Test_y,sparse_Train_x,sparse_Train_y,sparse_Train_x_list,sparse_Train_y_list], feed_dict={x_t: X_test, y_t: Y_test, x_t2: X_train, y_t2: Y_train})
+
+if sys.argv[2].find("raw-data") != -1:
+ #lipschitz constant is 1e-12
+ # this dataset is normalized at the source, so there is
+ # no need to run the raw-data set of runs.
+ #ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] +# print 'This is NOT DEFINED FOR THIS DATASET... ' + exit () +else: + #lipschitz constant is 1e-3 + ll = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3] + #rterm = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + #rterm = [1e-2] + rterm = [1e-3] + prefix += '_norm_' + + +for lmethod in [ sys.argv[1] ]: + + for r in rterm: + + final_cost = cost + r * regularization + + for learning_rate in ll: + + outfile = open(prefix + lmethod + "_" + str(index) + "_readings.txt", "w", 0) + index += 1 + outfile.write("------------------------------------------\n") + outfile.write("Method: " + lmethod + "\n") + outfile.write("Step Length: " + str(learning_rate) + "\n") + outfile.write("Regularization: " + str(r) + "\n") + outfile.write("Path: " + sys.argv[2] + "\n") + outfile.write("BatchSize: " + str(batch_size) + "\n"); + + outfile.write("Begin simulation ...... \n"); + + if(lmethod =="GD"): + optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(final_cost) + elif(lmethod =="Adadelta"): + optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, rho=0.95, epsilon=1e-08, use_locking=False, name='Adadelta').minimize(final_cost) + elif(lmethod =="Adagrad"): + optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.1, use_locking=False, name='Adagrad').minimize(final_cost) + elif(lmethod =="Adam"): + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam').minimize(final_cost) + elif(lmethod =="RMSProp"): + optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.0, epsilon=1e-10, use_locking=False, centered=False, name='RMSProp').minimize(final_cost) + elif(lmethod =="Momentum"): + optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=False, name='Momentum', use_nesterov=False).minimize(final_cost) + + # Initializing the variables + init = tf.global_variables_initializer() + + # Launch the graph + with tf.Session(config=config) as sess: + #with tf.Session() as sess: + sess.run(init) + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + + outfile.write ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((0), \ + (0), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + #import pdb;pdb.set_trace(); + # Training cycle + for epoch in range(training_epochs): + avg_cost = 0. 
+ #total_batch = int(len(X_train)/batch_size) + # Loop over all batches + + start_time = time.time() + for i in range(len(batch_x_list )): + #batch_x, batch_y = get_batch(X_train,Y_train) + # Run optimization op (backprop) and cost op (to get loss value) + _, c = sess.run([optimizer,final_cost], feed_dict={x: batch_x_list[i],y: batch_y_list[i]}) + # Compute average loss + avg_cost += c + + end_time = time.time () + + # Display logs per epoch step + if epoch % display_step == 0: + # Test model + # Calculate accuracy + + + ### thresholding , if >0.5 , TRUE, else FALSE + predicted_class = tf.greater(pred,0.5) + correct = tf.equal(predicted_class, tf.equal(tf.sparse_tensor_to_dense(y),1.0)) + accuracy = tf.reduce_mean( tf.cast(correct, 'float64')) + outfile.write("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + print ("%3d:%4.3f:%3.2f:%3.2f:%e:%e\n" % \ + ((epoch+1), \ + (end_time-start_time), \ + accuracy.eval({x:X_train, y:Y_train})*100., \ + accuracy.eval({x: X_test, y: Y_test})*100., \ + sess.run(final_cost, feed_dict={x: X_train, y: Y_train}), \ + sess.run(final_cost, feed_dict={x: X_test, y: Y_test}))) + + outfile.write("End of Simulation Here..... \n") + outfile.write("\n"); + outfile.write("\n"); + outfile.write("\n"); + + outfile.close ()
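+
+# Each reading line written above has the format
+# "epoch:time:train_acc:test_acc:train_cost:test_cost".
+# A minimal parsing sketch (hypothetical helper, not part of the runs):
+#   def read_readings(path):
+#       rows = []
+#       for line in open(path):
+#           parts = line.strip().split(':')
+#           if len(parts) == 6:
+#               rows.append([float(p) for p in parts])
+#       return rows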