WIP: Dense
andrewpalumbo committed May 7, 2017
1 parent 808660d commit 1674687
Showing 4 changed files with 168 additions and 65 deletions.
8 changes: 1 addition & 7 deletions cuda/pom.xml
@@ -24,7 +24,7 @@
<parent>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout</artifactId>
<version>0.13.0-SNAPSHOT</version>
<version>0.13.1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -175,12 +175,6 @@
<artifactId>scalatest_${scala.compat.version}</artifactId>
</dependency>

<dependency>
<groupId>org.bytedeco</groupId>
<artifactId>javacpp</artifactId>
<version>1.2.4</version>
</dependency>

<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcuda</artifactId>
10 changes: 8 additions & 2 deletions cuda/src/main/scala/org/apache/mahout/cuda/Context.scala
@@ -23,6 +23,10 @@ import jcuda.jcusparse.JCusparse._
import jcuda.jcusparse._
import jcuda.runtime.JCuda

import jcuda._
import jcublas._
import JCublas._

final class Context {
// Enable exceptions for all CUDA libraries
JCuda.setExceptionsEnabled(true)
@@ -33,8 +37,10 @@ final class Context {
cusparseCreate(sparseHandle)

// Initialize JCublas library and create a dense handle for it.
var denseHandle = jcuda.JCublas.cublasInit()
cusparseCreate(denseHandle)
// seems that there is no `dense handle` for JCublas
var denseHandle = JCublas.cublasInit()
// TODO: is this needed somehow, e.g. via the cusparse library?
// cusparseCreate(denseHandle)


}
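If a handle is wanted for the dense path, the cuBLAS v2 bindings (JCublas2) expose one in the same style as the cusparseHandle above. A minimal sketch, assuming JCublas2 is available on the classpath (this commit calls the legacy JCublas.cublasInit() instead):

import jcuda.jcublas.JCublas2.{cublasCreate, cublasDestroy}
import jcuda.jcublas.cublasHandle

// hypothetical cuBLAS v2 handle, analogous to the sparse handle above
val denseHandle2 = new cublasHandle()
cublasCreate(denseHandle2)
// ... pass denseHandle2 to JCublas2 routines such as cublasDgemm(handle, ...) ...
cublasDestroy(denseHandle2)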
87 changes: 79 additions & 8 deletions cuda/src/main/scala/org/apache/mahout/cuda/DenseRowMatrix.scala
@@ -20,21 +20,32 @@ package org.apache.mahout.cuda

import jcuda._
import jcuda.jcublas._

import jcuda.jcublas.JCublas._
import jcuda.jcusparse.JCusparse._
import jcuda.jcusparse._
import jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO
import jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL
import jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE

import jcuda.runtime._
import jcuda.runtime.JCuda
import jcuda.runtime.JCuda._
import jcuda.runtime.cudaMemcpyKind._


final class DenseRowMatrix {

var vals = new jcuda.Pointer()

var trans = CUBLAS_OP_N
var descr = new CUDA_ARRAY_DESCRIPTOR()
// default = not transposed.
var trans = 'n'
var descr = new jcuda.driver.CUDA_ARRAY_DESCRIPTOR()

var nrows = 0
var ncols = 0

var context = new Context

/**
* Initialize an empty dense matrix
* @param ctx
@@ -46,24 +57,84 @@ final class DenseRowMatrix {

nrows = nrow
ncols = ncol
context = ctx

cublasAlloc(nrows * ncols * jcuda.Sizeof.DOUBLE, vals)
// allocate empty space on the GPU (cublasAlloc takes an element count and an element size)
cublasAlloc(nrows * ncols, jcuda.Sizeof.DOUBLE, vals)

// create and setup matrix descriptor
// Todo: do we want these? for dense %*% sparse?
//JCuda.cublasCreateMatDescr(descr)
// cublasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
//cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)

}

/**
* Initialize a new dense matrix with the data supplied
* @param ctx
* @param nrow
* @param ncol
* @param data double[][] of Dense array elements
*/
def this(ctx: Context, nrow: Int, ncol: Int, data: Array[Array[Double]]) {
this()

nrows = nrow
ncols = ncol
context = ctx

// allocate empty space on the GPU
cublasAlloc(nrows * ncols, jcuda.Sizeof.DOUBLE, vals)

// create and setup matrix descriptor
// Todo: do we want these? for dense %*% sparse?
//cusblasCreateMatDescr(descr)
//cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
//(descr, CUSPARSE_INDEX_BASE_ZERO)
allocate()
//cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)

cudaMemcpy(vals, jcuda.Pointer.to(data.toList.flatten.toArray),
(nrow) * (ncol) * jcuda.Sizeof.DOUBLE,
cudaMemcpyHostToDevice)

}
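The host-to-device copy above lays the 2-D array out in row-major order, so element (i, j) ends up at flat offset i * ncol + j. A tiny illustration of that convention (hypothetical snippet, not part of the class):

// row-major layout used by the copy above
val a2d = Array(Array(1.0, 2.0, 3.0), Array(4.0, 5.0, 6.0))   // 2 x 3
val flat = a2d.flatten                                        // Array(1, 2, 3, 4, 5, 6)
assert(flat(1 * 3 + 2) == 6.0)                                // element (1, 2)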

/** Constructor with values on the device already.
*
* @param ctx
* @param nrow
* @param ncol
* @param data
*/
def this(ctx: Context, nrow: Int, ncol: Int, data: Pointer) {
this()

nrows = nrow
ncols = ncol
context = ctx

vals = data

// leftover from the sparse (CSR) copy; row_ptr / rowJumper are not defined for a dense matrix
// cudaMemcpy(row_ptr, jcuda.Pointer.to(rowJumper), (nrow+1)*jcuda.Sizeof.INT, cudaMemcpyHostToDevice)
// create and setup matrix descriptor
// Todo: do we want these? for dense %*% sparse?
//cusblasCreateMatDescr(descr)
//cusblasSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)
//cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)

}

def set (data: Array[Array[Double]]): Unit = {
// Allocate row-major
cublasAlloc(data.length * data(0).length, jcuda.Sizeof.DOUBLE, vals)
cudaMemcpy(vals, jcuda.Pointer.to(data.toList.flatten.toArray),
data.length * data(0).length * jcuda.Sizeof.DOUBLE,
cudaMemcpyHostToDevice)
}

// def set() // TODO: leftover stub
def flatten2dArray(arr2d: Array[Array[Double]]): Array[Double] = {
arr2d.toList.flatten.toArray
}

def close() {
cublasFree(vals)
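Before the package-level helpers in the next file, a minimal usage sketch of the DenseRowMatrix API added above (hypothetical host code; assumes a CUDA-capable device and the Context class from this commit):

// hypothetical usage, not part of the commit
val ctx = new Context
val host = Array(Array(1.0, 2.0), Array(3.0, 4.0))
val gpuMx = new DenseRowMatrix(ctx, 2, 2, host)   // allocates device memory and copies the host data
// ... gpuMx.vals is the device pointer handed to cuBLAS routines ...
gpuMx.close()                                     // frees the device allocation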
128 changes: 80 additions & 48 deletions cuda/src/main/scala/org/apache/mahout/cuda/package.scala
@@ -30,68 +30,90 @@ import scala.collection.JavaConversions._

import jcuda.runtime.JCuda._
import jcuda.runtime.cudaMemcpyKind._

import jcuda._
import jcuda.jcublas._




package object cuda {

private implicit val log = getLog(GPUMMul.getClass)

/** Copy cuda data back into a Mahout DenseMatrix
*
* @param src a (flattened) 2D cuda array
* @return A Mahout DenseMatrix
*/
def fromVclDenseRM(src: DenseRowMatrix): Matrix = {


val nrowIntern = src.nrows
val ncolIntern = src.ncols


val dbuff = Array.ofDim[Double](nrowIntern * ncolIntern)

//Functions.fastCopy(src, dbuff)
var srcOffset = 0
val ncol = src.ncols
val rows = for (irow ← 0 until src.nrows) yield {

val rowvec = new Array[Double](ncol)
// TODO: dbuff still needs to be copied back from src.vals on the device
System.arraycopy(dbuff, srcOffset, rowvec, 0, ncol)

srcOffset += ncolIntern
rowvec
}
}
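The copy-back above is still incomplete: nothing is transferred from the device and no Matrix is returned yet. One way it could be finished, as a sketch only (assumes src.vals holds nrows * ncols doubles in row-major order on the device):

// hypothetical completion, not part of the commit
def fromCudaDenseRMSketch(src: DenseRowMatrix): Matrix = {
  val n = src.nrows * src.ncols
  val host = new Array[Double](n)
  // copy the flat row-major buffer back to the host
  cudaMemcpy(jcuda.Pointer.to(host), src.vals, n * jcuda.Sizeof.DOUBLE, cudaMemcpyDeviceToHost)
  // re-split into rows and wrap in a Mahout DenseMatrix
  val rows = Array.tabulate(src.nrows, src.ncols)((r, c) => host(r * src.ncols + c))
  new DenseMatrix(rows)
}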

/**
* Convert a Mahout DenseMatrix to a CUDA DenseRowMatrix
* @param src
* @param ctx
* @return
*/
def toCudaDenseRM(src: Matrix): DenseRowMatrix = {
src.denseHandle match {

case src.ctx.denseHandle ⇒
val cudaMx = new DenseRowMatrix(
data = repackRowMajor(src, src.nrow, src.ncol),
nrow = src.nrow,
ncol = src.ncol,
ctx = cudaCtx
)
cudaMx
case _ ⇒
val cudaMx = new DenseRowMatrix(src.nrow, src.ncol, cudaCtx)
fastCopy(src, vclMx)
def toCudaDenseRM(src: Matrix, ctx: Context): cuda.DenseRowMatrix = {

val valuesF = classOf[DenseMatrix].getDeclaredField("values")
valuesF.setAccessible(true)
val values = valuesF.get(src).asInstanceOf[Array[Array[Double]]]
val cudaMx = new cuda.DenseRowMatrix(ctx, src.nrow, src.ncol, values)

cudaMx
}
}
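A similarly hedged sketch of converting an in-core Mahout matrix with the helper above (assumes the scalabindings DSL, e.g. dense(...), is in scope):

// hypothetical round trip to the device, not part of the commit
val ctx = new Context
val m = dense((1.0, 2.0), (3.0, 4.0))      // Mahout in-core DenseMatrix
val gpuM = toCudaDenseRM(m, ctx)           // reflects the `values` field and copies it to the device
// ... run cuBLAS kernels against gpuM.vals ...
gpuM.close()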


// TODO replace this with repackColumnMajor and use a different dgemm algorithm?
// Most Mahout in-core matrices are row-major and we're using CSR so we may need to see
// if JCuda is using an optimal csr/RowMajor DGEMM algorithm.
// TODO: check with NS on this
private[cuda] def repackRowMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): DoublePointer = {

assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)

val dbuff = Array.ofDim[Double](nrowIntern, ncolIntern)

mx match {
case dm: DenseMatrix ⇒
val valuesF = classOf[DenseMatrix].getDeclaredField("values")
valuesF.setAccessible(true)
val values = valuesF.get(dm).asInstanceOf[Array[Array[Double]]]
var dstOffset = 0
for (irow ← 0 until mx.nrow) {
val rowarr = values(irow)
//dbuff.position(dstOffset).put(rowarr, 0, rowarr.size min ncolIntern)
System.arraycopy(rowarr, 0, dbuff, dstOffset, rowarr.size min ncolIntern)
dstOffset += ncolIntern
}
case _ ⇒
// Naive copying. Could be sped up for a DenseMatrix. TODO.
for (row ← mx) {
val dstOffset = row.index * ncolIntern
for (el ← row.nonZeroes) dbuff(dstOffset + el.index) = el
}
}
// private[cuda] def repackRowMajor(mx: Matrix, nrowIntern: Int, ncolIntern: Int): Array[Double] = {
//
// assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)
//
// val dbuff = Array.ofDim[Double](nrowIntern * ncolIntern)
//
// mx match {
// case dm: DenseMatrix ⇒
// val valuesF = classOf[DenseMatrix].getDeclaredField("values")
// valuesF.setAccessible(true)
// val values = valuesF.get(dm).asInstanceOf[Array[Array[Double]]]
// var dstOffset = 0
// for (irow ← 0 until mx.nrow) {
// val rowarr = values(irow)
// //dbuff.position(dstOffset).put(rowarr, 0, rowarr.size min ncolIntern)
// System.arraycopy(rowarr, 0, dbuff, dstOffset, rowarr.size min ncolIntern)
// dstOffset += ncolIntern
// }
// case _ ⇒
// // Naive copying. Could be sped up for a DenseMatrix. TODO.
// for (row ← mx) {
// val dstOffset = row.index * ncolIntern
// for (el ← row.nonZeroes) dbuff.update(dstOffset + el.index) = el
// }
// }
// }
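If the commented-out repack above is brought back, an uncommented version could look like this sketch (flat Array[Double] output; dense fast path via the reflected values field, generic path via nonZeroes; hypothetical, not enabled by this commit):

// hypothetical working version of the commented-out repack
private[cuda] def repackRowMajorSketch(mx: Matrix, nrowIntern: Int, ncolIntern: Int): Array[Double] = {
  assert(mx.nrow <= nrowIntern && mx.ncol <= ncolIntern)
  val dbuff = Array.ofDim[Double](nrowIntern * ncolIntern)
  mx match {
    case dm: DenseMatrix ⇒
      val valuesF = classOf[DenseMatrix].getDeclaredField("values")
      valuesF.setAccessible(true)
      val values = valuesF.get(dm).asInstanceOf[Array[Array[Double]]]
      var dstOffset = 0
      for (irow ← 0 until mx.nrow) {
        System.arraycopy(values(irow), 0, dbuff, dstOffset, values(irow).length min ncolIntern)
        dstOffset += ncolIntern
      }
    case _ ⇒
      // naive copy for non-dense matrices
      for (row ← mx; el ← row.nonZeroes)
        dbuff(row.index * ncolIntern + el.index) = el.get
  }
  dbuff
}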

/**
*
@@ -212,21 +234,31 @@ package object cuda {
val n = b.ncols
val k = b.nrows

val d_A = valuesF.get(a).asInstanceOf[Array[Array[Double]]]


val c: DenseRowMatrix = new DenseRowMatrix(ctx, m, n)
val d_C: Pointer = new Pointer()

cudaMalloc(c.vals, M * N * jcuda.Sizeof.DOUBLE);
cudaMalloc(c.vals, m * n * jcuda.Sizeof.DOUBLE)

// cublasSgemm('n', 'n', N, N, N, alpha,
// d_A, N, d_B, N, beta, d_C, N);

cudaDgemm(ctx.denseHandle, a.trans, b.trans, m, n, k,
// JCublas.cublasSgemm('n', 'n', N, N, N, alpha,
// d_A, N, d_B, N, beta, d_C, N);

//C = alpha * op(A) * op(B) + beta * C,
//where op(X) = X or op(X) = transpose(X),
JCublas.cublasDgemm(a.trans, b.trans, m, n, k,
1.0d, // alpha
1.0d, 1, // Alpha, lda
1.0d, 1, // Beta , ldb
1.0d, // beta
a.vals, m, // A, lda
b.vals, k, // B , ldb
0.0d, // beta
d_C, // pointer to results
m * n) // size of a %*% b
n) // todo: check on this

//


}

