Skip to content

Commit

Permalink
Some minor improvements identified during benchmark (#11829)
Browse files Browse the repository at this point in the history
* Some minor improvements identified during benchmark

Signed-off-by: liurenjie1024 <[email protected]>

* Fix late initialization

---------

Signed-off-by: liurenjie1024 <[email protected]>
  • Loading branch information
liurenjie1024 authored Dec 7, 2024
1 parent 8ae6a68 commit 0fe162d
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,18 @@ class SerializedBatchIterator(dIn: DataInputStream, deserTime: GpuMetric)
class GpuColumnarBatchSerializer(metrics: Map[String, GpuMetric], dataTypes: Array[DataType],
useKudo: Boolean)
extends Serializer with Serializable {

private lazy val kudo = {
if (useKudo && dataTypes.nonEmpty) {
Some(new KudoSerializer(GpuColumnVector.from(dataTypes)))
} else {
None
}
}

override def newInstance(): SerializerInstance = {
if (useKudo) {
new KudoSerializerInstance(metrics, dataTypes)
new KudoSerializerInstance(metrics, dataTypes, kudo)
} else {
new GpuColumnarBatchSerializerInstance(metrics)
}
Expand All @@ -158,7 +167,7 @@ private class GpuColumnarBatchSerializerInstance(metrics: Map[String, GpuMetric]
val batch = value.asInstanceOf[ColumnarBatch]
val numColumns = batch.numCols()
val columns: Array[HostColumnVector] = new Array(numColumns)
val toClose = new ArrayBuffer[AutoCloseable]()
val toClose = new ArrayBuffer[AutoCloseable](numColumns)
try {
var startRow = 0
val numRows = batch.numRows()
Expand Down Expand Up @@ -338,16 +347,16 @@ object SerializedTableColumn {
*/
private class KudoSerializerInstance(
val metrics: Map[String, GpuMetric],
val dataTypes: Array[DataType]) extends SerializerInstance {
val dataTypes: Array[DataType],
val kudo: Option[KudoSerializer]
) extends SerializerInstance {
private val dataSize = metrics(METRIC_DATA_SIZE)
private val serTime = metrics(METRIC_SHUFFLE_SER_STREAM_TIME)
private val serCalcHeaderTime = metrics(METRIC_SHUFFLE_SER_CALC_HEADER_TIME)
private val serCopyHeaderTime = metrics(METRIC_SHUFFLE_SER_COPY_HEADER_TIME)
private val serCopyBufferTime = metrics(METRIC_SHUFFLE_SER_COPY_BUFFER_TIME)
private val deserTime = metrics(METRIC_SHUFFLE_DESER_STREAM_TIME)

private lazy val kudo = new KudoSerializer(GpuColumnVector.from(dataTypes))

override def serializeStream(out: OutputStream): SerializationStream = new SerializationStream {
private[this] val dOut: DataOutputStream =
new DataOutputStream(new BufferedOutputStream(out))
Expand All @@ -356,7 +365,7 @@ private class KudoSerializerInstance(
val batch = value.asInstanceOf[ColumnarBatch]
val numColumns = batch.numCols()
val columns: Array[HostColumnVector] = new Array(numColumns)
withResource(new ArrayBuffer[AutoCloseable]()) { toClose =>
withResource(new ArrayBuffer[AutoCloseable](numColumns)) { toClose =>
var startRow = 0
val numRows = batch.numRows()
if (batch.numCols() > 0) {
Expand Down Expand Up @@ -384,7 +393,9 @@ private class KudoSerializerInstance(
}

withResource(new NvtxRange("Serialize Batch", NvtxColor.YELLOW)) { _ =>
val writeMetric = kudo.writeToStreamWithMetrics(columns, dOut, startRow, numRows)
val writeMetric = kudo
.getOrElse(throw new IllegalStateException("Kudo serializer not initialized."))
.writeToStreamWithMetrics(columns, dOut, startRow, numRows)

dataSize += writeMetric.getWrittenBytes
serCalcHeaderTime += writeMetric.getCalcHeaderTime
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ class KudoTableOperator(
kudoMergeHeaderTime: GpuMetric,
kudoMergeBufferTime: GpuMetric) extends SerializedTableOperator[KudoSerializedTableColumn] {
require(kudo != null, "kudo serializer should not be null")
private val kudoTables = new util.ArrayList[KudoTable]()

override def getDataLen(column: KudoSerializedTableColumn): Long = column.kudoTable.getHeader
.getTotalDataLen
Expand All @@ -251,7 +252,8 @@ class KudoTableOperator(
val totalRowsNum = columns.map(getNumRows).sum
RowCountOnlyMergeResult(totalRowsNum)
} else {
val kudoTables = new util.ArrayList[KudoTable](columns.length)
kudoTables.clear()
kudoTables.ensureCapacity(columns.length)
columns.foreach { column =>
kudoTables.add(column.kudoTable)
}
Expand Down

0 comments on commit 0fe162d

Please sign in to comment.