From 1674687aee0877eb7720025c8d3eafd6a0c0ba99 Mon Sep 17 00:00:00 2001
From: Andrew Palumbo
Date: Sat, 6 May 2017 20:43:46 -0700
Subject: [PATCH] WIP: Dense

---
 cuda/pom.xml                                  |   8 +-
 .../org/apache/mahout/cuda/Context.scala      |  10 +-
 .../apache/mahout/cuda/DenseRowMatrix.scala   |  87 ++++++++++--
 .../org/apache/mahout/cuda/package.scala      | 128 +++++++++++-------
 4 files changed, 168 insertions(+), 65 deletions(-)

diff --git a/cuda/pom.xml b/cuda/pom.xml
index 4eee3f2a24..c5e1aef716 100644
--- a/cuda/pom.xml
+++ b/cuda/pom.xml
@@ -24,7 +24,7 @@
   <parent>
     <groupId>org.apache.mahout</groupId>
     <artifactId>mahout</artifactId>
-    <version>0.13.0-SNAPSHOT</version>
+    <version>0.13.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -175,12 +175,6 @@
       <artifactId>scalatest_${scala.compat.version}</artifactId>
     </dependency>
 
-    <dependency>
-      <groupId>org.bytedeco</groupId>
-      <artifactId>javacpp</artifactId>
-      <version>1.2.4</version>
-    </dependency>
-
     <dependency>
      <groupId>org.jcuda</groupId>
      <artifactId>jcuda</artifactId>
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/Context.scala b/cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
index f35f70e251..69ea3ab0d0 100644
--- a/cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
@@ -23,6 +23,10 @@ import jcuda.jcusparse.JCusparse._
 import jcuda.jcusparse._
 import jcuda.runtime.JCuda
 
+import jcuda._
+import jcuda.jcublas._
+import jcuda.jcublas.JCublas._
+
 final class Context {
   // Enable exceptions for all CUDA libraries
   JCuda.setExceptionsEnabled(true)
@@ -33,8 +37,10 @@ final class Context {
   cusparseCreate(sparseHandle)
 
-  // Initialize JCublas library and create a dense handle for it.
-  var denseHandle = jcuda.JCublas.cublasInit()
-  cusparseCreate(denseHandle)
+  // Initialize the JCublas library. Unlike cusparse, the legacy JCublas
+  // API is handle-free: cublasInit() only initializes the library and
+  // returns a status code, so there is no dense handle to create and
+  // nothing to hand to cusparseCreate().
+  JCublas.cublasInit()
 }
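
For reference, a minimal sketch (not part of the patch; the object name is
illustrative) of the init/teardown sequence this change implies: cusparse is
handle-based, while legacy JCublas is initialized once, globally.

    import jcuda.jcublas.JCublas
    import jcuda.jcusparse.JCusparse._
    import jcuda.jcusparse.cusparseHandle

    object ContextLifecycleSketch extends App {
      // cusparse is handle-based: one handle per context.
      val sparseHandle = new cusparseHandle
      cusparseCreate(sparseHandle)

      // The legacy JCublas API has no handle; cublasInit() initializes the
      // library globally and returns only a status code.
      JCublas.cublasInit()

      // ... device work goes here ...

      // Teardown mirrors initialization.
      cusparseDestroy(sparseHandle)
      JCublas.cublasShutdown()
    }
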
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala b/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala
index eb9745c560..49b9816952 100644
--- a/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala
@@ -20,21 +20,32 @@ package org.apache.mahout.cuda
 
 import jcuda._
 import jcuda.jcublas._
-
+import jcuda.jcublas.JCublas._
+import jcuda.jcusparse.JCusparse._
+import jcuda.jcusparse._
 import jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO
 import jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL
 import jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE
+import jcuda.runtime._
+import jcuda.runtime.JCuda._
+import jcuda.runtime.cudaMemcpyKind._
 
 final class DenseRowMatrix {
 
   var vals = new jcuda.Pointer()
-  var trans = CUBLAS_OP_N
-  var descr = new CUDA_ARRAY_DESCRIPTOR()
+  // Default: not transposed ('n' in the legacy JCublas character convention).
+  var trans = 'n'
+  // A cusparse descriptor; only needed if this dense matrix ever takes part
+  // in a mixed dense %*% sparse operation (see the Todo notes below).
+  var descr = new cusparseMatDescr()
   var nrows = 0
   var ncols = 0
+  var context: Context = _
 
   /**
    * Initalize empty Dense Matrix
    * @param ctx
@@ -46,24 +57,84 @@ final class DenseRowMatrix {
 
     nrows = nrow
     ncols = ncol
+    context = ctx
 
-    cublasAlloc(nrows * ncols * jcuda.Sizeof.DOUBLE, vals)
+    // Allocate empty space on the GPU. Note that cublasAlloc takes an
+    // element count and an element size, not a byte count.
+    cublasAlloc(nrows * ncols, jcuda.Sizeof.DOUBLE, vals)
+
+    // create and set up the matrix descriptor
+    // Todo: do we want these? for dense %*% sparse?
+    //cusparseCreateMatDescr(descr)
+    //cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
+    //cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
+  }
+
+  /**
+   * Initialize a new dense matrix with the data supplied.
+   * @param ctx
+   * @param nrow
+   * @param ncol
+   * @param data row-major Array[Array[Double]] of dense matrix elements
+   */
+  def this(ctx: Context, nrow: Int, ncol: Int, data: Array[Array[Double]]) {
+    this()
+
+    nrows = nrow
+    ncols = ncol
+    context = ctx
+
+    // allocate empty space on the GPU
+    cublasAlloc(nrows * ncols, jcuda.Sizeof.DOUBLE, vals)
 
     // create and setup matrix descriptor
     // Todo: do we want these? for dense %*% sparse?
     //cusblasCreateMatDescr(descr)
     //cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
-    //(descr, CUSPARSE_INDEX_BASE_ZERO)
-    allocate()
+    //cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
 
+    cudaMemcpy(vals, jcuda.Pointer.to(flatten2dArray(data)),
+      nrow * ncol * jcuda.Sizeof.DOUBLE,
+      cudaMemcpyHostToDevice)
   }
 
+  /** Constructor for values that already live on the device.
+    *
+    * @param ctx
+    * @param nrow
+    * @param ncol
+    * @param data device pointer to the row-major values
+    */
+  def this(ctx: Context, nrow: Int, ncol: Int, data: Pointer) {
+    this()
+
+    nrows = nrow
+    ncols = ncol
+    context = ctx
+
+    vals = data
 
-    cudaMemcpy(row_ptr, jcuda.Pointer.to(rowJumper), (nrow+1)*jcuda.Sizeof.INT, cudaMemcpyHostToDevice)
+    // create and setup matrix descriptor
+    // Todo: do we want these? for dense %*% sparse?
+    //cusparseCreateMatDescr(descr)
+    //cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
+    //cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
+  }
+
+  /** (Re)set the matrix values from a host-side row-major 2D array. */
+  def set(data: Array[Array[Double]]): Unit = {
+    // TODO: free any previously allocated device buffer here, or it leaks.
+    cublasAlloc(data.length * data(0).length, jcuda.Sizeof.DOUBLE, vals)
+    cudaMemcpy(vals, jcuda.Pointer.to(flatten2dArray(data)),
+      data.length * data(0).length * jcuda.Sizeof.DOUBLE,
+      cudaMemcpyHostToDevice)
+  }
 
-  def set ()
+  /** Flatten a row-major 2D array into the 1D layout the device expects. */
+  def flatten2dArray(arr2d: Array[Array[Double]]): Array[Double] = {
+    arr2d.flatten
+  }
 
   def close() {
     cublasFree(vals)
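
The constructors above rely on cublasAlloc taking an element count and an
element size rather than a byte count (an easy mix-up, since cudaMemcpy does
take bytes). A minimal round-trip sketch under that convention, assuming an
initialized JCublas; the object name is illustrative:

    import jcuda.{Pointer, Sizeof}
    import jcuda.jcublas.JCublas._
    import jcuda.runtime.JCuda._
    import jcuda.runtime.cudaMemcpyKind._

    object AllocRoundTripSketch extends App {
      cublasInit()

      // A 2x2 row-major matrix, flattened the way flatten2dArray flattens it.
      val host = Array(1.0, 2.0, 3.0, 4.0)

      // cublasAlloc(elementCount, elementSize, pointer)
      val dev = new Pointer()
      cublasAlloc(host.length, Sizeof.DOUBLE, dev)

      // cudaMemcpy, by contrast, takes a size in bytes.
      cudaMemcpy(dev, Pointer.to(host), host.length * Sizeof.DOUBLE,
        cudaMemcpyHostToDevice)

      val back = new Array[Double](host.length)
      cudaMemcpy(Pointer.to(back), dev, host.length * Sizeof.DOUBLE,
        cudaMemcpyDeviceToHost)
      println(back.mkString(", ")) // 1.0, 2.0, 3.0, 4.0

      cublasFree(dev)
      cublasShutdown()
    }
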
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/package.scala b/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
index 8d65fc5830..6fe44e4a52 100644
--- a/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
@@ -30,36 +30,57 @@ import scala.collection.JavaConversions._
 import jcuda.runtime.JCuda._
 import jcuda.runtime.cudaMemcpyKind._
 
 import jcuda._
 import jcuda.jcublas._
 
 package object cuda {
 
   private implicit val log = getLog(GPUMMul.getClass)
 
+  /** Copy device data back into a Mahout DenseMatrix.
+    *
+    * @param src a DenseRowMatrix backed by a flattened row-major device array
+    * @return a Mahout DenseMatrix
+    */
+  def fromVclDenseRM(src: DenseRowMatrix): Matrix = {
+
+    val nrowIntern = src.nrows
+    val ncolIntern = src.ncols
+
+    // copy the flattened device buffer back to the host
+    val dbuff = Array.ofDim[Double](nrowIntern * ncolIntern)
+    cudaMemcpy(jcuda.Pointer.to(dbuff), src.vals,
+      nrowIntern * ncolIntern * jcuda.Sizeof.DOUBLE,
+      cudaMemcpyDeviceToHost)
+
+    // unflatten into one Array[Double] per row
+    var srcOffset = 0
+    val rows = for (irow ← 0 until nrowIntern) yield {
+      val rowvec = new Array[Double](ncolIntern)
+      System.arraycopy(dbuff, srcOffset, rowvec, 0, ncolIntern)
+      srcOffset += ncolIntern
+      rowvec
+    }
+    new DenseMatrix(rows.toArray)
+  }
+
   /**
    * Convert from Mahout DenseMatrix to matrix
    * @param src
-   * @param cudaCtx
+   * @param ctx
    * @return
    */
-  def toCudaDenseRM(src: Matrix): DenseRowMatrix = {
-    src.denseHandle match {
-
-      case src.ctx.denseHandle ⇒
-        val cudaMx = new DenseRowMatrix(
-          data = repackRowMajor(src, src.nrow, src.ncol),
-          nrow = src.nrow,
-          ncol = src.ncol,
-          ctx = cudaCtx
-        )
-        cudaMx
-
-      case _ ⇒
-        val cudaMx = new DenseRowMatrix(src.nrow, src.ncol, cudaCtx)
-        fastCopy(src, vclMx)
-    }
+  def toCudaDenseRM(src: Matrix, ctx: Context): cuda.DenseRowMatrix = {
+    // Reach into the DenseMatrix's backing Array[Array[Double]] directly
+    // so the copy to the device needs no per-element conversion.
+    val valuesF = classOf[DenseMatrix].getDeclaredField("values")
+    valuesF.setAccessible(true)
+    val values = valuesF.get(src).asInstanceOf[Array[Array[Double]]]
+    val cudaMx = new cuda.DenseRowMatrix(ctx, src.nrow, src.ncol, values)
+    cudaMx
   }
@@ -67,31 +88,32 @@ package object cuda {
 
   // Most Mahout in-core matrices are row-major and we're using CSR so we may need to see
   // if JCuda is using an optimal csr/RowMajor DGEMM algortithm.
   // TODO: check with NS on this
-  private[cuda] def repackRowMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): DoublePointer = {
-
-    assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)
-
-    val dbuff = Array.ofDim[Double](nrowIntern, ncolIntern)
-
-    mx match {
-      case dm: DenseMatrix ⇒
-        val valuesF = classOf[DenseMatrix].getDeclaredField("values")
-        valuesF.setAccessible(true)
-        val values = valuesF.get(dm).asInstanceOf[Array[Array[Double]]]
-        var dstOffset = 0
-        for (irow ← 0 until mx.nrow) {
-          val rowarr = values(irow)
-          //dbuff.position(dstOffset).put(rowarr, 0, rowarr.size min ncolIntern)
-          System.arraycopy(rowarr, 0, dbuff, dstOffset, rowarr.size min ncolIntern)
-          dstOffset += ncolIntern
-        }
-      case _ ⇒
-        // Naive copying. Could be sped up for a DenseMatrix. TODO.
-        for (row ← mx) {
-          val dstOffset = row.index * ncolIntern
-          for (el ← row.nonZeroes) dbuff[dstOffset + el.index] = el
-        }
-    }
-  }
+// Retained for reference while the dense path settles; note the flat
+// (nrowIntern * ncolIntern) buffer, which is what the device copy needs.
+// private[cuda] def repackRowMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): Array[Double] = {
+//
+//   assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)
+//
+//   val dbuff = Array.ofDim[Double](nrowIntern * ncolIntern)
+//
+//   mx match {
+//     case dm: DenseMatrix ⇒
+//       val valuesF = classOf[DenseMatrix].getDeclaredField("values")
+//       valuesF.setAccessible(true)
+//       val values = valuesF.get(dm).asInstanceOf[Array[Array[Double]]]
+//       var dstOffset = 0
+//       for (irow ← 0 until mx.nrow) {
+//         val rowarr = values(irow)
+//         System.arraycopy(rowarr, 0, dbuff, dstOffset, rowarr.size min ncolIntern)
+//         dstOffset += ncolIntern
+//       }
+//     case _ ⇒
+//       // Naive copying. Could be sped up for a DenseMatrix. TODO.
+//       for (row ← mx) {
+//         val dstOffset = row.index * ncolIntern
+//         for (el ← row.nonZeroes) dbuff(dstOffset + el.index) = el.get
+//       }
+//   }
+//   dbuff
+// }
 
   /**
    *
@@ -212,21 +234,31 @@
 
     val n = b.ncols
     val k = b.nrows
 
-    val d_C: Pointer = new Pointer()
-
-    cudaMalloc(c.vals, M * N * jcuda.Sizeof.DOUBLE);
+    // The DenseRowMatrix constructor already allocates m * n doubles on the
+    // device for the result, so no separate cudaMalloc is needed here.
+    val c: DenseRowMatrix = new DenseRowMatrix(ctx, m, n)
 
     // cublasSgemm('n', 'n', N, N, N, alpha,
     //             d_A, N, d_B, N, beta, d_C, N);
-    cudaDgemm(ctx.denseHandle, a.trans, b.trans, m, n, k,
-      1.0d, // alpha
-      1.0d, 1, // Alpha, lda
-      1.0d, 1, // Beta , ldb
-      1.0d, // beta
-      d_C, // pointer to results
-      m * n) // size of a %*% b
+    // C = alpha * op(A) * op(B) + beta * C,
+    // where op(X) = X or op(X) = transpose(X).
+    // The legacy cublas API is column-major while our buffers are row-major,
+    // so (with both trans flags left at 'n') we compute C^T = B^T * A^T by
+    // swapping the operands; c.vals then holds a %*% b in row-major order.
+    JCublas.cublasDgemm(b.trans, a.trans, n, m, k,
+      1.0d, // alpha
+      b.vals, n, // B, ldb
+      a.vals, k, // A, lda
+      0.0d, // beta
+      c.vals, // pointer to the result
+      n) // ldc
+
+    c
   }
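
The operand swap in dgemm above is easy to get backwards, so here is a small
self-contained check of the trick (C^T = B^T * A^T) with known values; the
object name and sizes are illustrative only:

    import jcuda.{Pointer, Sizeof}
    import jcuda.jcublas.JCublas._
    import jcuda.runtime.JCuda._
    import jcuda.runtime.cudaMemcpyKind._

    object RowMajorDgemmSketch extends App {
      cublasInit()

      // Row-major A (2x3) and B (3x2); expect A %*% B = [[58, 64], [139, 154]].
      val m = 2; val k = 3; val n = 2
      val hA = Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
      val hB = Array(7.0, 8.0, 9.0, 10.0, 11.0, 12.0)

      val dA = new Pointer(); val dB = new Pointer(); val dC = new Pointer()
      cublasAlloc(m * k, Sizeof.DOUBLE, dA)
      cublasAlloc(k * n, Sizeof.DOUBLE, dB)
      cublasAlloc(m * n, Sizeof.DOUBLE, dC)
      cudaMemcpy(dA, Pointer.to(hA), m * k * Sizeof.DOUBLE, cudaMemcpyHostToDevice)
      cudaMemcpy(dB, Pointer.to(hB), k * n * Sizeof.DOUBLE, cudaMemcpyHostToDevice)

      // cublas is column-major, so for row-major buffers compute
      // C^T = B^T * A^T by swapping the operands and dimensions.
      cublasDgemm('n', 'n', n, m, k, 1.0, dB, n, dA, k, 0.0, dC, n)

      val hC = new Array[Double](m * n)
      cudaMemcpy(Pointer.to(hC), dC, m * n * Sizeof.DOUBLE, cudaMemcpyDeviceToHost)
      println(hC.mkString(", ")) // 58.0, 64.0, 139.0, 154.0

      cublasFree(dA); cublasFree(dB); cublasFree(dC)
      cublasShutdown()
    }
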