[WIP][nocompile] out of time tonight. Need to consider whether JCUDA…
… needs to repack Mahout in-core matrices (row-major) as column-major for the wrapped call to the NVIDIA cublasDgemm operation. The fact that cusparse uses CSR rather than CSC is somewhat confusing in this case (e.g. for a Dense %*% Sparse JVM-level operation).
andrewpalumbo committed May 1, 2017
1 parent eaddbf0 commit 808660d
Showing 4 changed files with 126 additions and 34 deletions.
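
One way to address the row-major/column-major question in the commit message without repacking at all: cublasDgemm assumes column-major storage, so a row-major C = A %*% B can be computed as the column-major product C^T = B^T %*% A^T by passing the operands in swapped order with swapped dimensions. A minimal JCublas2 sketch of the trick follows; the method and parameter names here are illustrative, not part of this commit.

import jcuda.Pointer
import jcuda.jcublas.JCublas2.cublasDgemm
import jcuda.jcublas.cublasHandle
import jcuda.jcublas.cublasOperation.CUBLAS_OP_N

// dA, dB, dC point to device buffers holding row-major A (m x k), B (k x n), C (m x n).
// Read column-major they hold A^T, B^T and C^T, so asking cuBLAS for
// C^T = B^T * A^T leaves dC holding row-major C = A * B.
def gemmRowMajor(handle: cublasHandle, m: Int, n: Int, k: Int,
                 dA: Pointer, dB: Pointer, dC: Pointer): Unit = {
  val one = Pointer.to(Array(1.0))
  val zero = Pointer.to(Array(0.0))
  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
    n, m, k,        // dimensions of the transposed problem
    one, dB, n,     // B^T is n x k, leading dimension n
    dA, k,          // A^T is k x m, leading dimension k
    zero, dC, n)    // C^T is n x m, leading dimension n
}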
6 changes: 6 additions & 0 deletions cuda/pom.xml
@@ -187,6 +187,12 @@
<version>${jcuda.jcudaVersion}</version>
</dependency>

<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcublas</artifactId>
<version>${jcuda.jcudaVersion}</version>
</dependency>

<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcusparse</artifactId>
13 changes: 9 additions & 4 deletions cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
@@ -24,13 +24,18 @@ import jcuda.jcusparse._
import jcuda.runtime.JCuda

final class Context {

// Enable exceptions for all CUDA libraries
JCuda.setExceptionsEnabled(true)
JCusparse.setExceptionsEnabled(true)

// Initialize JCusparse library
var handle: jcuda.jcusparse.cusparseHandle = new cusparseHandle()
cusparseCreate(handle)
// Initialize the JCusparse library and create a sparse handle for it.
var sparseHandle: jcuda.jcusparse.cusparseHandle = new cusparseHandle()
cusparseCreate(sparseHandle)

// Initialize the JCublas2 library, enable exceptions to match the libraries
// above, and create a dense (cuBLAS) handle for it.
jcuda.jcublas.JCublas2.setExceptionsEnabled(true)
var denseHandle: jcuda.jcublas.cublasHandle = new jcuda.jcublas.cublasHandle()
jcuda.jcublas.JCublas2.cublasCreate(denseHandle)


}
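
The two handles created above will eventually need to be released. A possible companion method (an assumption, not part of this commit), given that denseHandle is a jcuda.jcublas.cublasHandle as above:

def close(): Unit = {
  // Release the cuSPARSE and cuBLAS handles created in the constructor body.
  jcuda.jcusparse.JCusparse.cusparseDestroy(sparseHandle)
  jcuda.jcublas.JCublas2.cublasDestroy(denseHandle)
}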

@@ -21,42 +21,52 @@ package org.apache.mahout.cuda
import jcuda._
import jcuda.jcublas._

final class DenseMatrix {
import jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO
import jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL
import jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE

final class DenseRowMatrix {

var vals = new jcuda.Pointer()

var trans = CUSPARSE_OPERATION_NON_TRANSPOSE // use dense
var descr = new cusparseMatDescr()
var trans = cublasOperation.CUBLAS_OP_N
var descr = new jcuda.driver.CUDA_ARRAY_DESCRIPTOR()

var nrows = 0
var ncols = 0


/**
 * Initialize an empty dense row-major matrix on the device.
 * @param ctx the CUDA context holding the cuBLAS/cuSPARSE handles
 * @param nrow number of rows
 * @param ncol number of columns
 */
def this(ctx: Context, nrow: Int, ncol: Int) {
this()

nrows = nrow
ncols = ncol

nonz = nonzeros
if (nonzeros > 0) {
cudaMalloc(vals, nonzeros*jcuda.Sizeof.DOUBLE)
}
// Allocate a device buffer of nrows * ncols doubles (cublasAlloc takes a count and an element size).
JCublas.cublasAlloc(nrows * ncols, jcuda.Sizeof.DOUBLE, vals)

// create and setup matrix descriptor
cusparseCreateMatDescr(descr)
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
// Todo: do we want these? for dense %*% sparse?
//cusblasCreateMatDescr(descr)
//cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
//(descr, CUSPARSE_INDEX_BASE_ZERO)
allocate()


}

// TODO: implement set(...): copy a flat row-major host buffer into `vals`
// with cudaMemcpy (one possible form is sketched just below this diff).

def close() {
cudaFree(row_ptr)
if (nonz > 0) {
cudaFree(col_ind)
cudaFree(vals)
}
cublasFree(vals)
}
}
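
A sketch of what the stubbed set(...) above could look like, assuming the data arrives as a flat row-major Array[Double] of length nrows * ncols (the signature is an assumption, not from this commit):

// Copy a flat row-major host buffer into the device allocation backing `vals`.
def set(data: Array[Double]): Unit = {
  require(data.length == nrows * ncols, "data must hold exactly nrows * ncols doubles")
  jcuda.runtime.JCuda.cudaMemcpy(vals, jcuda.Pointer.to(data),
    nrows * ncols * jcuda.Sizeof.DOUBLE,
    jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice)
}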

97 changes: 84 additions & 13 deletions cuda/src/main/scala/org/apache/mahout/cuda/package.scala
@@ -31,12 +31,68 @@ import scala.collection.JavaConversions._
import jcuda.runtime.JCuda._
import jcuda.runtime.cudaMemcpyKind._
import jcuda._
import jcuda.jcublas._


package object cuda {

private implicit val log = getLog(GPUMMul.getClass)

/**
 * Convert a Mahout in-core Matrix to a CUDA dense row-major matrix.
 * @param src the Mahout matrix to copy
 * @param cudaCtx the CUDA context holding the cuBLAS/cuSPARSE handles
 * @return the matrix copied onto the device
 */
def toCudaDenseRM(src: Matrix, cudaCtx: Context): DenseRowMatrix = {
  src match {

    case dm: DenseMatrix ⇒
      // Fast path: repack the dense row-major backing array in one pass.
      val cudaMx = new DenseRowMatrix(
        data = repackRowMajor(src, src.nrow, src.ncol),
        nrow = src.nrow,
        ncol = src.ncol,
        ctx = cudaCtx
      )
      cudaMx
    case _ ⇒
      // Generic path: allocate an empty device matrix and copy into it.
      val cudaMx = new DenseRowMatrix(src.nrow, src.ncol, cudaCtx)
      fastCopy(src, cudaMx)
      cudaMx
  }
}


// TODO replace this with repackColumnMajor and use a different dgemm algorithm?
// Most Mahout in-core matrices are row-major and we're using CSR so we may need to see
// if JCuda is using an optimal csr/RowMajor DGEMM algorithm.
// TODO: check with NS on this
private[cuda] def repackRowMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): Pointer = {

  assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)

  // Flat row-major host buffer; element (row, col) lives at row * ncolIntern + col.
  val dbuff = new Array[Double](nrowIntern * ncolIntern)

  mx match {
    case dm: DenseMatrix ⇒
      val valuesF = classOf[DenseMatrix].getDeclaredField("values")
      valuesF.setAccessible(true)
      val values = valuesF.get(dm).asInstanceOf[Array[Array[Double]]]
      var dstOffset = 0
      for (irow ← 0 until mx.nrow) {
        val rowarr = values(irow)
        System.arraycopy(rowarr, 0, dbuff, dstOffset, rowarr.size min ncolIntern)
        dstOffset += ncolIntern
      }
    case _ ⇒
      // Naive copying. Could be sped up for a DenseMatrix. TODO.
      for (row ← mx) {
        val dstOffset = row.index * ncolIntern
        for (el ← row.nonZeroes) dbuff(dstOffset + el.index) = el.get
      }
  }

  // Host-side pointer to the packed buffer; the caller copies it to the device.
  Pointer.to(dbuff)
}
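
For comparison, the repackColumnMajor alternative mentioned in the TODO above only changes the destination index arithmetic; a sketch under the same assumptions as repackRowMajor (not part of this commit):

// Column-major repack: element (row, col) lands at col * nrowIntern + row,
// which is the layout cublasDgemm expects without any operand swapping.
private[cuda] def repackColumnMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): Pointer = {
  assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)
  val dbuff = new Array[Double](nrowIntern * ncolIntern)
  for (row ← mx; el ← row.nonZeroes)
    dbuff(el.index * nrowIntern + row.index) = el.get
  Pointer.to(dbuff)
}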

/**
*
* @param mxSrc
@@ -144,28 +200,43 @@ package object cuda {
(jumpers, colIdcs, els)
}

def prod(a: DenseMatrix, b: DenseMatrix, ctx: Context): CompressedMatrix = {
/**
 * Dense %*% Dense
 * @param a left operand, m x k
 * @param b right operand, k x n
 * @param ctx CUDA context holding the cuBLAS handle
 * @return a %*% b as an m x n DenseRowMatrix on the device
 */
def prod(a: DenseRowMatrix, b: DenseRowMatrix, ctx: Context): DenseRowMatrix = {
  val m = a.nrows
  val n = b.ncols
  val k = b.nrows

  // Result buffer is allocated by the DenseRowMatrix constructor.
  val c: DenseRowMatrix = new DenseRowMatrix(ctx, m, n)

  // alpha = 1.0, beta = 0.0: plain product, no accumulation into c.
  val alpha = Pointer.to(Array(1.0))
  val beta = Pointer.to(Array(0.0))

  // NOTE (see commit message): cublasDgemm assumes column-major storage, so the
  // row-major Mahout data may still need repacking (or the operands swapping)
  // before this call is actually correct.
  JCublas2.cublasDgemm(ctx.denseHandle, a.trans, b.trans, m, n, k,
    alpha,
    a.vals, m, // lda
    b.vals, k, // ldb
    beta,
    c.vals, m) // ldc

  c
}
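
A usage sketch tying the dense pieces together (illustrative only; it assumes the corrected toCudaDenseRM signature above and leaves the device-to-host copy of the result as a TODO):

val ctx = new Context()
val cudaA = toCudaDenseRM(a, ctx)    // a, b: in-core org.apache.mahout.math.Matrix
val cudaB = toCudaDenseRM(b, ctx)
val cudaC = prod(cudaA, cudaB, ctx)  // m x n result, still resident on the device
// TODO: copy cudaC.vals back to the host (e.g. cudaMemcpy with cudaMemcpyDeviceToHost)
// before handing the result back to a JVM-level matrix.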

/**
 * Sparse %*% Sparse
 * @param a left operand in CSR form
 * @param b right operand in CSR form
 * @param ctx CUDA context holding the cuSPARSE handle
 * @return a %*% b as a CompressedMatrix on the device
 */
def prod(a: CompressedMatrix, b: CompressedMatrix, ctx: Context): CompressedMatrix = {
var m = a.nrows
var n = b.ncols
@@ -176,7 +247,7 @@
// step 1: compute nnz count
var nnzC = new Array[Int](1)
nnzC(0) = 0
cusparseXcsrgemmNnz(ctx.handle, a.trans, b.trans, m, n, k,
cusparseXcsrgemmNnz(ctx.sparseHandle, a.trans, b.trans, m, n, k,
a.descr, a.nonz, a.row_ptr, a.col_ind,
b.descr, b.nonz, b.row_ptr, b.col_ind,
c.descr, c.row_ptr, jcuda.Pointer.to(nnzC))
@@ -191,7 +262,7 @@
// step 2: allocate and compute matrix product
cudaMalloc(c.col_ind, jcuda.Sizeof.INT * c.nonz);
cudaMalloc(c.vals, jcuda.Sizeof.DOUBLE * c.nonz);
cusparseDcsrgemm(ctx.handle, a.trans, b.trans, m, n, k,
cusparseDcsrgemm(ctx.sparseHandle, a.trans, b.trans, m, n, k,
a.descr, a.nonz,
a.vals, a.row_ptr, a.col_ind,
b.descr, b.nonz,
