[WIP][nocompile] out of time tonight. Need to consider whether JCUDA…
… needs to repack Mahout in-core matrices (row-major) as column-major for the wrapped call to the NVIDIA cublasDgemm operation. The fact that cusparse uses CSR rather than CSC is somewhat confusing in this case (e.g. for a Dense %*% Sparse JVM-level operation).
andrewpalumbo committed May 1, 2017
1 parent eaddbf0 commit 808660d
Showing 4 changed files with 126 additions and 34 deletions.
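
One way to address the row-major/column-major question in the commit message without repacking at all: cublasDgemm assumes column-major storage, so a row-major C = A %*% B can be computed as the column-major product C^T = B^T %*% A^T by passing the operands in swapped order with swapped dimensions. A minimal JCublas2 sketch of the trick follows; the method and parameter names here are illustrative, not part of this commit.

import jcuda.Pointer
import jcuda.jcublas.JCublas2.cublasDgemm
import jcuda.jcublas.cublasHandle
import jcuda.jcublas.cublasOperation.CUBLAS_OP_N

// dA, dB, dC point to device buffers holding row-major A (m x k), B (k x n), C (m x n).
// Read column-major they hold A^T, B^T and C^T, so asking cuBLAS for
// C^T = B^T * A^T leaves dC holding row-major C = A * B.
def gemmRowMajor(handle: cublasHandle, m: Int, n: Int, k: Int,
                 dA: Pointer, dB: Pointer, dC: Pointer): Unit = {
  val one = Pointer.to(Array(1.0))
  val zero = Pointer.to(Array(0.0))
  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
    n, m, k,        // dimensions of the transposed problem
    one, dB, n,     // B^T is n x k, leading dimension n
    dA, k,          // A^T is k x m, leading dimension k
    zero, dC, n)    // C^T is n x m, leading dimension n
}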
6 changes: 6 additions & 0 deletions cuda/pom.xml
@@ -187,6 +187,12 @@
<version>${jcuda.jcudaVersion}</version>
</dependency>

<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcublas</artifactId>
<version>${jcuda.jcudaVersion}</version>
</dependency>

<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcusparse</artifactId>
13 changes: 9 additions & 4 deletions cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
@@ -24,13 +24,18 @@ import jcuda.jcusparse._
import jcuda.runtime.JCuda

final class Context {

// Enable exceptions for all CUDA libraries
JCuda.setExceptionsEnabled(true)
JCusparse.setExceptionsEnabled(true)

// Initialize JCusparse library
var handle: jcuda.jcusparse.cusparseHandle = new cusparseHandle()
cusparseCreate(handle)
// Initialize the JCusparse library and create a sparse handle for it.
var sparseHandle: jcuda.jcusparse.cusparseHandle = new cusparseHandle()
cusparseCreate(sparseHandle)

// Initialize the JCublas2 library, enable exceptions to match the libraries
// above, and create a dense (cuBLAS) handle for it.
jcuda.jcublas.JCublas2.setExceptionsEnabled(true)
var denseHandle: jcuda.jcublas.cublasHandle = new jcuda.jcublas.cublasHandle()
jcuda.jcublas.JCublas2.cublasCreate(denseHandle)


}
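
The two handles created above will eventually need to be released. A possible companion method (an assumption, not part of this commit), given that denseHandle is a jcuda.jcublas.cublasHandle as above:

def close(): Unit = {
  // Release the cuSPARSE and cuBLAS handles created in the constructor body.
  jcuda.jcusparse.JCusparse.cusparseDestroy(sparseHandle)
  jcuda.jcublas.JCublas2.cublasDestroy(denseHandle)
}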

@@ -21,42 +21,52 @@ package org.apache.mahout.cuda
import jcuda._
import jcuda.jcublas._

final class DenseMatrix {
import jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO
import jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL
import jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE

final class DenseRowMatrix {

var vals = new jcuda.Pointer()

var trans = CUSPARSE_OPERATION_NON_TRANSPOSE // use dense
var descr = new cusparseMatDescr()
var trans = cublasOperation.CUBLAS_OP_N
var descr = new jcuda.driver.CUDA_ARRAY_DESCRIPTOR()

var nrows = 0
var ncols = 0


/**
 * Initialize an empty dense row-major matrix on the device.
 * @param ctx the CUDA context holding the cuBLAS/cuSPARSE handles
 * @param nrow number of rows
 * @param ncol number of columns
 */
def this(ctx: Context, nrow: Int, ncol: Int) {
this()

nrows = nrow
ncols = ncol

nonz = nonzeros
if (nonzeros > 0) {
cudaMalloc(vals, nonzeros*jcuda.Sizeof.DOUBLE)
}
// Allocate a device buffer of nrows * ncols doubles (cublasAlloc takes a count and an element size).
JCublas.cublasAlloc(nrows * ncols, jcuda.Sizeof.DOUBLE, vals)

// create and setup matrix descriptor
cusparseCreateMatDescr(descr)
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)
// Todo: do we want these? for dense %*% sparse?
//cusblasCreateMatDescr(descr)
//cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
//(descr, CUSPARSE_INDEX_BASE_ZERO)
allocate()


}

// TODO: implement set(...): copy a flat row-major host buffer into `vals`
// with cudaMemcpy (one possible form is sketched just below this diff).

def close() {
cudaFree(row_ptr)
if (nonz > 0) {
cudaFree(col_ind)
cudaFree(vals)
}
cublasFree(vals)
}
}
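
A sketch of what the stubbed set(...) above could look like, assuming the data arrives as a flat row-major Array[Double] of length nrows * ncols (the signature is an assumption, not from this commit):

// Copy a flat row-major host buffer into the device allocation backing `vals`.
def set(data: Array[Double]): Unit = {
  require(data.length == nrows * ncols, "data must hold exactly nrows * ncols doubles")
  jcuda.runtime.JCuda.cudaMemcpy(vals, jcuda.Pointer.to(data),
    nrows * ncols * jcuda.Sizeof.DOUBLE,
    jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice)
}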

97 changes: 84 additions & 13 deletions cuda/src/main/scala/org/apache/mahout/cuda/package.scala
@@ -31,12 +31,68 @@ import scala.collection.JavaConversions._
import jcuda.runtime.JCuda._
import jcuda.runtime.cudaMemcpyKind._
import jcuda._
import jcuda.jcublas._


package object cuda {

private implicit val log = getLog(GPUMMul.getClass)

/**
 * Convert a Mahout in-core Matrix to a CUDA dense row-major matrix.
 * @param src the Mahout matrix to copy
 * @param cudaCtx the CUDA context holding the cuBLAS/cuSPARSE handles
 * @return the matrix copied onto the device
 */
def toCudaDenseRM(src: Matrix, cudaCtx: Context): DenseRowMatrix = {
  src match {

    case dm: DenseMatrix ⇒
      // Fast path: repack the dense row-major backing array in one pass.
      val cudaMx = new DenseRowMatrix(
        data = repackRowMajor(src, src.nrow, src.ncol),
        nrow = src.nrow,
        ncol = src.ncol,
        ctx = cudaCtx
      )
      cudaMx
    case _ ⇒
      // Generic path: allocate an empty device matrix and copy into it.
      val cudaMx = new DenseRowMatrix(src.nrow, src.ncol, cudaCtx)
      fastCopy(src, cudaMx)
      cudaMx
  }
}


// TODO replace this with repackColumnMajor and use a different dgemm algorithm?
// Most Mahout in-core matrices are row-major and we're using CSR so we may need to see
// if JCuda is using an optimal csr/RowMajor DGEMM algorithm.
// TODO: check with NS on this
private[cuda] def repackRowMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): Pointer = {

  assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)

  // Flat row-major host buffer; element (row, col) lives at row * ncolIntern + col.
  val dbuff = new Array[Double](nrowIntern * ncolIntern)

  mx match {
    case dm: DenseMatrix ⇒
      val valuesF = classOf[DenseMatrix].getDeclaredField("values")
      valuesF.setAccessible(true)
      val values = valuesF.get(dm).asInstanceOf[Array[Array[Double]]]
      var dstOffset = 0
      for (irow ← 0 until mx.nrow) {
        val rowarr = values(irow)
        System.arraycopy(rowarr, 0, dbuff, dstOffset, rowarr.size min ncolIntern)
        dstOffset += ncolIntern
      }
    case _ ⇒
      // Naive copying. Could be sped up for a DenseMatrix. TODO.
      for (row ← mx) {
        val dstOffset = row.index * ncolIntern
        for (el ← row.nonZeroes) dbuff(dstOffset + el.index) = el.get
      }
  }

  // Host-side pointer to the packed buffer; the caller copies it to the device.
  Pointer.to(dbuff)
}
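
For comparison, the repackColumnMajor alternative mentioned in the TODO above only changes the destination index arithmetic; a sketch under the same assumptions as repackRowMajor (not part of this commit):

// Column-major repack: element (row, col) lands at col * nrowIntern + row,
// which is the layout cublasDgemm expects without any operand swapping.
private[cuda] def repackColumnMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): Pointer = {
  assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)
  val dbuff = new Array[Double](nrowIntern * ncolIntern)
  for (row ← mx; el ← row.nonZeroes)
    dbuff(el.index * nrowIntern + row.index) = el.get
  Pointer.to(dbuff)
}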

/**
*
* @param mxSrc
@@ -144,28 +200,43 @@ package object cuda {
(jumpers, colIdcs, els)
}

def prod(a: DenseMatrix, b: DenseMatrix, ctx: Context): CompressedMatrix = {
/**
 * Dense %*% Dense
 * @param a left operand, m x k
 * @param b right operand, k x n
 * @param ctx CUDA context holding the cuBLAS handle
 * @return a %*% b as an m x n DenseRowMatrix on the device
 */
def prod(a: DenseRowMatrix, b: DenseRowMatrix, ctx: Context): DenseRowMatrix = {
  val m = a.nrows
  val n = b.ncols
  val k = b.nrows

  // Result buffer is allocated by the DenseRowMatrix constructor.
  val c: DenseRowMatrix = new DenseRowMatrix(ctx, m, n)

  // alpha = 1.0, beta = 0.0: plain product, no accumulation into c.
  val alpha = Pointer.to(Array(1.0))
  val beta = Pointer.to(Array(0.0))

  // NOTE (see commit message): cublasDgemm assumes column-major storage, so the
  // row-major Mahout data may still need repacking (or the operands swapping)
  // before this call is actually correct.
  JCublas2.cublasDgemm(ctx.denseHandle, a.trans, b.trans, m, n, k,
    alpha,
    a.vals, m, // lda
    b.vals, k, // ldb
    beta,
    c.vals, m) // ldc

  c
}
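
A usage sketch tying the dense pieces together (illustrative only; it assumes the corrected toCudaDenseRM signature above and leaves the device-to-host copy of the result as a TODO):

val ctx = new Context()
val cudaA = toCudaDenseRM(a, ctx)    // a, b: in-core org.apache.mahout.math.Matrix
val cudaB = toCudaDenseRM(b, ctx)
val cudaC = prod(cudaA, cudaB, ctx)  // m x n result, still resident on the device
// TODO: copy cudaC.vals back to the host (e.g. cudaMemcpy with cudaMemcpyDeviceToHost)
// before handing the result back to a JVM-level matrix.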

/**
 * Sparse %*% Sparse
 * @param a left operand in CSR form
 * @param b right operand in CSR form
 * @param ctx CUDA context holding the cuSPARSE handle
 * @return a %*% b as a CompressedMatrix on the device
 */
def prod(a: CompressedMatrix, b: CompressedMatrix, ctx: Context): CompressedMatrix = {
var m = a.nrows
var n = b.ncols
@@ -176,7 +247,7 @@
// step 1: compute nnz count
var nnzC = new Array[Int](1)
nnzC(0) = 0
cusparseXcsrgemmNnz(ctx.handle, a.trans, b.trans, m, n, k,
cusparseXcsrgemmNnz(ctx.sparseHandle, a.trans, b.trans, m, n, k,
a.descr, a.nonz, a.row_ptr, a.col_ind,
b.descr, b.nonz, b.row_ptr, b.col_ind,
c.descr, c.row_ptr, jcuda.Pointer.to(nnzC))
@@ -191,7 +262,7 @@
// step 2: allocate and compute matrix product
cudaMalloc(c.col_ind, jcuda.Sizeof.INT * c.nonz);
cudaMalloc(c.vals, jcuda.Sizeof.DOUBLE * c.nonz);
cusparseDcsrgemm(ctx.handle, a.trans, b.trans, m, n, k,
cusparseDcsrgemm(ctx.sparseHandle, a.trans, b.trans, m, n, k,
a.descr, a.nonz,
a.vals, a.row_ptr, a.col_ind,
b.descr, b.nonz,
