diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala b/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala
index 49b9816952..2203d22a80 100644
--- a/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala
@@ -64,9 +64,9 @@ final class DenseRowMatrix {
 
     // create and setup matrix descriptor
     // Todo: do we want these? for dense %*% sparse?
-    //JCuda.cublasCreateMatDescr(descr)
-    // cublasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
-    //cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
+    // JCuda.cublasCreateMatDescr(descr)
+    // cublasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
+    // cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
   }
 
@@ -89,14 +89,13 @@ final class DenseRowMatrix {
 
     // create and setup matrix descriptor
     // Todo: do we want these? for dense %*% sparse?
-    //cusblasCreateMatDescr(descr)
-    //cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
-    //cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
+    // cusblasCreateMatDescr(descr)
+    // cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
+    // cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
 
     cudaMemcpy(vals, jcuda.Pointer.to(data.toList.flatten.toArray),
       (nrow) * (ncol) * jcuda.Sizeof.DOUBLE,
       cudaMemcpyHostToDevice)
-
   }
 
   /** Constructor with values on the device already.
@@ -116,13 +115,17 @@ final class DenseRowMatrix {
     vals = data
 
     // create and setup matrix descriptor
-    // Todo: do we want these? for dense %*% sparse?
+    // Todo: do we need these? for dense %*% sparse?
     //cusblasCreateMatDescr(descr)
     //cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
     //cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
   }
 
+  /** Set values from a 2D array.
+   *
+   * @param data row-major values to copy to the device
+   */
   def set (data: Array[Array[Double]]): Unit = {
     // Allocate row-major
     cublasAlloc(data.length * data(0).length * jcuda.Sizeof.DOUBLE,
@@ -132,7 +135,15 @@
       cudaMemcpyHostToDevice)
   }
 
-  def flatten2dArray(arr2d: Array[Array[Double]]): Array[Double] = {
+  /** Set values from a device pointer that is already allocated.
+   *
+   * @param data pointer to values already on the device
+   */
+  def set (data: Pointer): Unit = {
+    vals = data
+  }
+
+  private[cuda] def flatten2dArray(arr2d: Array[Array[Double]]): Array[Double] = {
     arr2d.toList.flatten.toArray
   }
 
diff --git a/cuda/src/main/scala/org/apache/mahout/cuda/package.scala b/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
index 6fe44e4a52..9a6b2d53b6 100644
--- a/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
+++ b/cuda/src/main/scala/org/apache/mahout/cuda/package.scala
@@ -46,26 +46,32 @@ package object cuda {
    * @param src a (flattened) 2D cuda array
    * @return A Mahout DenseMatrix
    */
-  def fromVclDenseRM(src: DenseRowMatrix): Matrix = {
-
+  def fromCUDADenseRM(src: DenseRowMatrix): Matrix = {
     val nrowIntern = src.nrows
     val ncolIntern = src.ncols
 
-    val dbuff = new Array.ofDim[Double](nrowIntern * ncolIntern)
+    // This copies twice; consider copying directly from CUDA memory
+    // into each row.
+    val cudaData = new Array[Double](nrowIntern * ncolIntern)
+    cudaMemcpy(jcuda.Pointer.to(cudaData), src.vals,
+      (nrowIntern * ncolIntern) * jcuda.Sizeof.DOUBLE, cudaMemcpyDeviceToHost)
 
-    //Functions.fastCopy(src, dbuff)
+    // A device-side transpose would be faster if the matrix were stored
+    // column-major. TODO: revisit; keep the simple row-by-row copy for now.
     var srcOffset = 0
     val ncol = src.ncols
-    val rows = for (irow ← 0 until src.nrow) yield {
+    val rows = for (irow ← 0 until src.nrows) yield {
       val rowvec = new Array[Double](ncol)
-      dbuff.position(srcOffset).get(rowvec)
-
+      System.arraycopy(cudaData, srcOffset, rowvec, 0, ncol)
       srcOffset += ncolIntern
       rowvec
     }
 
+    // Use shallow = true to avoid yet another copy.
     new DenseMatrix(rows.toArray, true)
   }
 
   /**
@@ -84,7 +93,7 @@ package object cuda {
   }
 
-  // TODO replace this with repackColumnMajor and use a different dgemm algorithm?
+  // TODO replace this with repackColumnMajor or use a different dgemm algorithm?
   // Most Mahout in-core matrices are row-major and we're using CSR so we may need to see
   // if JCuda is using an optimal csr/RowMajor DGEMM algortithm.
   // TODO: check with NS on this
@@ -234,32 +243,26 @@ package object cuda {
     val n = b.ncols
     val k = b.nrows
 
-    val d_A = valuesF.get(a).asInstanceOf[Array[Array[Double]]]
+    // val d_A = valuesF.get(a).asInstanceOf[Array[Array[Double]]]
 
     val c: DenseRowMatrix = new DenseRowMatrix(ctx, m, n)
-    val d_C: Pointer = new Pointer()
     cudaMalloc(c.vals, m * n * jcuda.Sizeof.DOUBLE)
 
-    // cublasSgemm('n', 'n', N, N, N, alpha,
-    //   d_A, N, d_B, N, beta, d_C, N);
-
-//    JCublas.cublasSgemm('n', 'n', N, N, N, alpha,
-//      d_A, N, d_B, N, beta, d_C, N);
 
     //C = alpha * op(A) * op(B) + beta * C,
     //where op(X) = X or op(X) = transpose(X),
-    JCublas.cublasDgemm(a.trans, b.trans, m, n, k,
+    // cuBLAS expects column-major storage while Mahout matrices are row-major,
+    // so compute C^t = B^t %*% A^t by swapping the operands: the column-major
+    // result, read back row-major, is C and no explicit transpose is needed.
+    JCublas.cublasDgemm('n', 'n', n, m, k,
       1.0d, // alpha
-      a.vals, m, // A, lda
-      b.vals, k, // B , ldb
+      b.vals, n, // A-slot: row-major B reads as B^t (n x k), lda = n
+      a.vals, k, // B-slot: row-major A reads as A^t (k x m), ldb = k
       0.0d, // beta
-      d_C, // pointer to results
-      n) // todo: check on this
-
-    //
-
+      c.vals, // C-slot: C^t (n x m) column-major == C (m x n) row-major
+      n) // ldc = n
+
     c
   }
 
   /**
diff --git a/cuda/src/test/scala/org/apache/mahout/cuda/CUDATestSuite.scala b/cuda/src/test/scala/org/apache/mahout/cuda/CUDATestSuite.scala
index 5222cc1106..f0b22c0249 100644
--- a/cuda/src/test/scala/org/apache/mahout/cuda/CUDATestSuite.scala
+++ b/cuda/src/test/scala/org/apache/mahout/cuda/CUDATestSuite.scala
@@ -6,9 +6,7 @@ import scalabindings.RLikeOps._
 
 import scala.util.Random
 
-/**
- * Created by andy on 3/29/17.
- */
+
 
 class CUDATestSuite extends FunSuite with Matchers {
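
Note on the cublasDgemm hunk in package.scala: the call relies on the standard trick for driving column-major cuBLAS with row-major buffers, so a minimal self-contained sketch of that identity follows. The object name, shapes, and sample values below are illustrative assumptions and are not part of this patch; only the JCublas calls themselves are real API.

import jcuda.Pointer
import jcuda.jcublas.JCublas

object RowMajorDgemmSketch {
  def main(args: Array[String]): Unit = {
    val (m, k, n) = (2, 3, 2)
    // A (m x k) and B (k x n), both flattened row-major.
    val hA = Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
    val hB = Array(7.0, 8.0, 9.0, 10.0, 11.0, 12.0)
    val hC = new Array[Double](m * n)

    JCublas.cublasInit()
    val dA = new Pointer()
    val dB = new Pointer()
    val dC = new Pointer()
    JCublas.cublasAlloc(m * k, jcuda.Sizeof.DOUBLE, dA)
    JCublas.cublasAlloc(k * n, jcuda.Sizeof.DOUBLE, dB)
    JCublas.cublasAlloc(m * n, jcuda.Sizeof.DOUBLE, dC)
    JCublas.cublasSetVector(m * k, jcuda.Sizeof.DOUBLE, Pointer.to(hA), 1, dA, 1)
    JCublas.cublasSetVector(k * n, jcuda.Sizeof.DOUBLE, Pointer.to(hB), 1, dB, 1)

    // Column-major cuBLAS sees each row-major buffer as its own transpose,
    // so request C^t (n x m) = B^t (n x k) * A^t (k x m): operands swapped,
    // no 't' flags and no explicit transpose kernels.
    JCublas.cublasDgemm('n', 'n', n, m, k,
      1.0, dB, n,   // "A" argument: B^t, lda = n
      dA, k,        // "B" argument: A^t, ldb = k
      0.0, dC, n)   // C^t with ldc = n; read row-major, this buffer is C

    JCublas.cublasGetVector(m * n, jcuda.Sizeof.DOUBLE, dC, 1, Pointer.to(hC), 1)
    // hC == Array(58.0, 64.0, 139.0, 154.0), i.e. A %*% B in row-major order.

    Seq(dA, dB, dC).foreach(JCublas.cublasFree)
    JCublas.cublasShutdown()
  }
}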