From f557976b4e0513dbfbf89f9070b6e34485de743c Mon Sep 17 00:00:00 2001
From: Sayed Bilal Bari <sbari@nvidia.com>
Date: Tue, 2 Jul 2024 16:30:47 -0500
Subject: [PATCH 1/5] Adding GC Metrics

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>
---
 .../rapids/tool/benchmarks/Benchmark.scala    | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
index f4359c4a6..33f4ca74f 100644
--- a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
+++ b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
@@ -17,7 +17,9 @@
 package org.apache.spark.rapids.tool.benchmarks
 
 import java.io.{OutputStream, PrintStream}
+import java.lang.management.ManagementFactory
 
+import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable`
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.duration.NANOSECONDS
@@ -101,20 +103,30 @@ class Benchmark(
     // The results are going to be processor specific so it is useful to include that.
     out.println(RuntimeUtil.getJVMOSInfo.mkString("\n"))
     val nameLen = Math.max(40, Math.max(name.length, benchmarks.map(_.name.length).max))
-    out.printf(s"%-${nameLen}s %14s %14s %11s %10s\n",
-      name + ":", "Best Time(ms)", "Avg Time(ms)", "Stdev(ms)", "Relative")
+    out.printf(s"%-${nameLen}s %14s %14s %11s %10s %10s %10s\n",
+      name + ":", "Best Time(ms)", "Avg Time(ms)", "Stdev(ms)","Max GC Time(ms)",
+      "Max GC Count", "Relative")
     out.println("-" * (nameLen + 80))
     results.zip(benchmarks).foreach { case (result, benchmark) =>
-      out.printf(s"%-${nameLen}s %14s %14s %11s %10s\n",
+      out.printf(s"%-${nameLen}s %14s %14s %11s %10s %10s %10s\n",
         benchmark.name,
         "%5.0f" format result.bestMs,
         "%4.0f" format result.avgMs,
         "%5.0f" format result.stdevMs,
+        "%8d" format result.maxGcTime,
+        "%5d" format result.maxGcCount,
         "%3.1fX" format (firstBest / result.bestMs))
     }
     out.println()
   }
 
+  private def getGcMetrics: (Any, Any) = {
+    val gcBeans = ManagementFactory.getGarbageCollectorMXBeans
+    val gcTime = gcBeans.map(_.getCollectionTime).sum
+    val gcCount = gcBeans.map(_.getCollectionCount).sum
+    (gcTime, gcCount)
+  }
+
   /**
    * Runs a single function `f` for iters, returning the average time the function took and
    * the rate of the function.
@@ -127,17 +139,23 @@ class Benchmark(
     val minIters = if (overrideNumIters != 0) overrideNumIters else minNumIters
     val runTimes = ArrayBuffer[Long]()
     var totalTime = 0L
+    var maxGcCount = 0L
+    var maxGcTime = 0L
     for (i <- 0 until minIters) {
       val timer = new ToolsTimer(i)
+      val (gcTimeBefore, gcCountBefore) = getGcMetrics
       f(timer)
       val runTime = timer.totalTime()
       runTimes += runTime
       totalTime += runTime
-
+      val (gcTimeAfter, gcCountAfter) = getGcMetrics
+      maxGcTime = math.max(maxGcTime, gcTimeAfter.asInstanceOf[Long] - gcTimeBefore.asInstanceOf[Long])
+      maxGcCount = math.max(maxGcCount, gcCountAfter.asInstanceOf[Long] - gcCountBefore.asInstanceOf[Long])
       if (outputPerIteration) {
         // scalastyle:off
         println("*"*80)
         println(s"Iteration $i took ${NANOSECONDS.toMicros(runTime)} microseconds")
+        println(s"Iteration $i GC time ${gcTimeAfter.asInstanceOf[Long] - gcTimeBefore.asInstanceOf[Long]}")
         println("*"*80)
         // scalastyle:on
       }
@@ -153,12 +171,13 @@ class Benchmark(
     val stdev = if (runTimes.size > 1) {
       math.sqrt(runTimes.map(time => (time - avg) * (time - avg)).sum / (runTimes.size - 1))
     } else 0
-    Benchmark.Result(avg / 1000000.0,  best / 1000000.0, stdev / 1000000.0)
+    Benchmark.Result(avg / 1000000.0,  best / 1000000.0, stdev / 1000000.0, maxGcTime, maxGcCount)
   }
 }
 
 
 object Benchmark {
   case class Case(name: String, fn: ToolsTimer => Unit, numIters: Int)
-  case class Result(avgMs: Double, bestMs: Double, stdevMs: Double)
+  case class Result(avgMs: Double, bestMs: Double, stdevMs: Double,
+      maxGcTime: Long = 0, maxGcCount: Long = 0)
 }

From 4eb9997c068e4e3deb9c11decec21f2eee4627b4 Mon Sep 17 00:00:00 2001
From: Sayed Bilal Bari <sbari@nvidia.com>
Date: Wed, 3 Jul 2024 15:10:33 -0500
Subject: [PATCH 2/5] Review comment changes

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>
---
 .../rapids/tool/benchmarks/Benchmark.scala    | 52 +++++++++++--------
 .../tool/util/MemoryMetricsTracker.scala      | 45 ++++++++++++++++
 2 files changed, 74 insertions(+), 23 deletions(-)
 create mode 100644 core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala

diff --git a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
index 33f4ca74f..8ecf6b574 100644
--- a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
+++ b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
@@ -17,16 +17,14 @@
 package org.apache.spark.rapids.tool.benchmarks
 
 import java.io.{OutputStream, PrintStream}
-import java.lang.management.ManagementFactory
 
-import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable`
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.duration.NANOSECONDS
 
 import org.apache.commons.io.output.TeeOutputStream
 
-import org.apache.spark.sql.rapids.tool.util.{RuntimeUtil, ToolsTimer}
+import org.apache.spark.sql.rapids.tool.util.{MemoryMetricsTracker, RuntimeUtil, ToolsTimer}
 
 /**
  * This code is mostly copied from org.apache.spark.benchmark.BenchmarkBase
@@ -103,30 +101,26 @@ class Benchmark(
     // The results are going to be processor specific so it is useful to include that.
     out.println(RuntimeUtil.getJVMOSInfo.mkString("\n"))
     val nameLen = Math.max(40, Math.max(name.length, benchmarks.map(_.name.length).max))
-    out.printf(s"%-${nameLen}s %14s %14s %11s %10s %10s %10s\n",
+    out.printf(s"%-${nameLen}s %14s %14s %11s %20s %20s %20s %20s %20s %10s\n",
       name + ":", "Best Time(ms)", "Avg Time(ms)", "Stdev(ms)","Max GC Time(ms)",
-      "Max GC Count", "Relative")
-    out.println("-" * (nameLen + 80))
+      "Max GC Count", "Avg Free Memory(MB)","Avg Used Memory(MB)", "Max Heap Memory(MB)", "Relative")
+    out.println("-" * (nameLen + 160))
     results.zip(benchmarks).foreach { case (result, benchmark) =>
-      out.printf(s"%-${nameLen}s %14s %14s %11s %10s %10s %10s\n",
+      out.printf(s"%-${nameLen}s %14s %14s %11s %20s %20s %20s %20s %20s %10s\n",
         benchmark.name,
         "%5.0f" format result.bestMs,
         "%4.0f" format result.avgMs,
         "%5.0f" format result.stdevMs,
-        "%8d" format result.maxGcTime,
-        "%5d" format result.maxGcCount,
+        "%8d" format result.memoryParams.maxGCTime,
+        "%5d" format result.memoryParams.maxGCTime,
+        "%5.0f" format result.memoryParams.avgFreeHeapMemory,
+        "%5.0f" format result.memoryParams.avgUsedHeapMemory,
+        "%5d" format result.memoryParams.maxHeapMemory,
         "%3.1fX" format (firstBest / result.bestMs))
     }
     out.println()
   }
 
-  private def getGcMetrics: (Any, Any) = {
-    val gcBeans = ManagementFactory.getGarbageCollectorMXBeans
-    val gcTime = gcBeans.map(_.getCollectionTime).sum
-    val gcCount = gcBeans.map(_.getCollectionCount).sum
-    (gcTime, gcCount)
-  }
-
   /**
    * Runs a single function `f` for iters, returning the average time the function took and
    * the rate of the function.
@@ -139,23 +133,31 @@ class Benchmark(
     val minIters = if (overrideNumIters != 0) overrideNumIters else minNumIters
     val runTimes = ArrayBuffer[Long]()
     var totalTime = 0L
+    //For tracking maximum GC over iterations
     var maxGcCount = 0L
     var maxGcTime = 0L
+    var totalFreeMemory = 0L
+    var totalUsedMemory = 0L
+    var maxHeapMemory = 0L
     for (i <- 0 until minIters) {
       val timer = new ToolsTimer(i)
-      val (gcTimeBefore, gcCountBefore) = getGcMetrics
+      val memoryTracker = new MemoryMetricsTracker
       f(timer)
       val runTime = timer.totalTime()
       runTimes += runTime
       totalTime += runTime
-      val (gcTimeAfter, gcCountAfter) = getGcMetrics
-      maxGcTime = math.max(maxGcTime, gcTimeAfter.asInstanceOf[Long] - gcTimeBefore.asInstanceOf[Long])
-      maxGcCount = math.max(maxGcCount, gcCountAfter.asInstanceOf[Long] - gcCountBefore.asInstanceOf[Long])
+      val gcCount = memoryTracker.getTotalGCCount
+      val gcTime = memoryTracker.getTotalGCTime
+      maxGcTime = math.max(maxGcTime, gcTime)
+      maxGcCount = math.max(maxGcCount, gcCount)
+      totalFreeMemory += memoryTracker.getCurrentFreeHeapMemory
+      totalUsedMemory += memoryTracker.getCurrentUsedHeapMemory
+      maxHeapMemory = if (maxHeapMemory == 0L) memoryTracker.getCurrentMaxHeapMemory else maxHeapMemory
       if (outputPerIteration) {
         // scalastyle:off
         println("*"*80)
         println(s"Iteration $i took ${NANOSECONDS.toMicros(runTime)} microseconds")
-        println(s"Iteration $i GC time ${gcTimeAfter.asInstanceOf[Long] - gcTimeBefore.asInstanceOf[Long]}")
+        println(s"Iteration $i GC time $gcTime ms, GC count $gcCount")
         println("*"*80)
         // scalastyle:on
       }
@@ -171,13 +173,17 @@ class Benchmark(
     val stdev = if (runTimes.size > 1) {
       math.sqrt(runTimes.map(time => (time - avg) * (time - avg)).sum / (runTimes.size - 1))
     } else 0
-    Benchmark.Result(avg / 1000000.0,  best / 1000000.0, stdev / 1000000.0, maxGcTime, maxGcCount)
+    Benchmark.Result(avg / 1000000.0,  best / 1000000.0, stdev / 1000000.0,
+      JVMMemoryParams(maxGcTime, maxGcCount, maxHeapMemory/ 1000000,
+        totalUsedMemory / (minIters*1000000), totalFreeMemory / (minIters*1000000)))
   }
 }
 
 
 object Benchmark {
   case class Case(name: String, fn: ToolsTimer => Unit, numIters: Int)
+  case class JVMMemoryParams( maxGCTime: Long, maxGCCount: Long, maxHeapMemory: Long,
+      avgUsedHeapMemory: Double, avgFreeHeapMemory: Double)
   case class Result(avgMs: Double, bestMs: Double, stdevMs: Double,
-      maxGcTime: Long = 0, maxGcCount: Long = 0)
+      memoryParams: JVMMemoryParams)
 }
diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala
new file mode 100644
index 000000000..ff618191f
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala
@@ -0,0 +1,45 @@
+package org.apache.spark.sql.rapids.tool.util
+
+import java.lang.management.ManagementFactory
+
+import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable`
+
+/**
+ * Utility class to track memory metrics.
+ * This class is used to track memory metrics such as GC count, GC time,
+ * heap memory usage, etc.
+ *
+ */
+class MemoryMetricsTracker {
+  private val startGCMetrics = getCurrentGCMetrics
+
+  private def getCurrentGCMetrics: (Long, Long) = {
+    val gcBeans = ManagementFactory.getGarbageCollectorMXBeans
+
+    (gcBeans.map(_.getCollectionCount).sum,
+      gcBeans.map(_.getCollectionTime).sum)
+  }
+
+  def getTotalGCCount: Long = {
+    val (newGcCount:Long, _) = getCurrentGCMetrics
+    newGcCount - startGCMetrics._1
+  }
+
+  def getTotalGCTime: Long = {
+    val (_, newGcTime:Long) = getCurrentGCMetrics
+    newGcTime - startGCMetrics._2
+  }
+
+  def getCurrentMaxHeapMemory: Long = {
+    ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getMax
+  }
+
+  def getCurrentUsedHeapMemory: Long = {
+    ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
+  }
+
+  def getCurrentFreeHeapMemory: Long = {
+    ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getMax -
+      ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
+  }
+}

From f1664df4e155699e1a0b23f0fce1578207d397fc Mon Sep 17 00:00:00 2001
From: Sayed Bilal Bari <sbari@nvidia.com>
Date: Wed, 3 Jul 2024 17:06:17 -0500
Subject: [PATCH 3/5] Correcting output format + refactoring

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>
---
 .../rapids/tool/benchmarks/Benchmark.scala    | 66 +++++++++----------
 ...scala => QualificationToolBenchmark.scala} | 17 +++--
 .../tool/util/MemoryMetricsTracker.scala      | 13 ----
 .../sql/rapids/tool/util/RuntimeUtil.scala    |  2 +-
 4 files changed, 44 insertions(+), 54 deletions(-)
 rename core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/{QualificationBenchmark.scala => QualificationToolBenchmark.scala} (75%)

diff --git a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
index 8ecf6b574..6da9cc6b1 100644
--- a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
+++ b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.rapids.tool.util.{MemoryMetricsTracker, RuntimeUtil,
  * This will output the average time to run each function and the rate of each function.
  */
 class Benchmark(
-    name: String,
+    name: String = "Benchmarker",
     valuesPerIteration: Long,
     minNumIters: Int,
     warmUpIterations: Int,
@@ -100,22 +100,23 @@ class Benchmark(
     val firstBest = results.head.bestMs
     // The results are going to be processor specific so it is useful to include that.
     out.println(RuntimeUtil.getJVMOSInfo.mkString("\n"))
+    out.println(s"MaxHeapMemory -> ${Runtime.getRuntime.maxMemory()} \n")
     val nameLen = Math.max(40, Math.max(name.length, benchmarks.map(_.name.length).max))
-    out.printf(s"%-${nameLen}s %14s %14s %11s %20s %20s %20s %20s %20s %10s\n",
-      name + ":", "Best Time(ms)", "Avg Time(ms)", "Stdev(ms)","Max GC Time(ms)",
-      "Max GC Count", "Avg Free Memory(MB)","Avg Used Memory(MB)", "Max Heap Memory(MB)", "Relative")
+    out.printf(s"%-${nameLen}s %14s %14s %11s %20s %18s %18s %18s %18s %10s\n",
+      name + ":", "Best Time(ms)", "Avg Time(ms)", "Stdev(ms)","Avg GC Time(ms)",
+      "Avg GC Count", "Stdev GC Count","Max GC Time(ms)","Max GC Count", "Relative")
     out.println("-" * (nameLen + 160))
     results.zip(benchmarks).foreach { case (result, benchmark) =>
-      out.printf(s"%-${nameLen}s %14s %14s %11s %20s %20s %20s %20s %20s %10s\n",
+      out.printf(s"%-${nameLen}s %14s %14s %11s %20s %18s %18s %18s %18s %10s\n",
         benchmark.name,
         "%5.0f" format result.bestMs,
         "%4.0f" format result.avgMs,
         "%5.0f" format result.stdevMs,
-        "%8d" format result.memoryParams.maxGCTime,
-        "%5d" format result.memoryParams.maxGCTime,
-        "%5.0f" format result.memoryParams.avgFreeHeapMemory,
-        "%5.0f" format result.memoryParams.avgUsedHeapMemory,
-        "%5d" format result.memoryParams.maxHeapMemory,
+        "%5.1f" format result.memoryParams.avgGCTime,
+        "%5.1f" format result.memoryParams.avgGCCount,
+        "%5.0f" format result.memoryParams.stdDevGCCount,
+        "%5d" format result.memoryParams.maxGcTime,
+        "%5d" format result.memoryParams.maxGCCount,
         "%3.1fX" format (firstBest / result.bestMs))
     }
     out.println()
@@ -132,32 +133,21 @@ class Benchmark(
     }
     val minIters = if (overrideNumIters != 0) overrideNumIters else minNumIters
     val runTimes = ArrayBuffer[Long]()
-    var totalTime = 0L
+    val gcCounts = ArrayBuffer[Long]()
+    val gcTimes = ArrayBuffer[Long]()
     //For tracking maximum GC over iterations
-    var maxGcCount = 0L
-    var maxGcTime = 0L
-    var totalFreeMemory = 0L
-    var totalUsedMemory = 0L
-    var maxHeapMemory = 0L
     for (i <- 0 until minIters) {
       val timer = new ToolsTimer(i)
       val memoryTracker = new MemoryMetricsTracker
       f(timer)
       val runTime = timer.totalTime()
       runTimes += runTime
-      totalTime += runTime
-      val gcCount = memoryTracker.getTotalGCCount
-      val gcTime = memoryTracker.getTotalGCTime
-      maxGcTime = math.max(maxGcTime, gcTime)
-      maxGcCount = math.max(maxGcCount, gcCount)
-      totalFreeMemory += memoryTracker.getCurrentFreeHeapMemory
-      totalUsedMemory += memoryTracker.getCurrentUsedHeapMemory
-      maxHeapMemory = if (maxHeapMemory == 0L) memoryTracker.getCurrentMaxHeapMemory else maxHeapMemory
+      gcCounts += memoryTracker.getTotalGCCount
+      gcTimes += memoryTracker.getTotalGCTime
       if (outputPerIteration) {
         // scalastyle:off
         println("*"*80)
         println(s"Iteration $i took ${NANOSECONDS.toMicros(runTime)} microseconds")
-        println(s"Iteration $i GC time $gcTime ms, GC count $gcCount")
         println("*"*80)
         // scalastyle:on
       }
@@ -168,22 +158,30 @@ class Benchmark(
     println("*"*80)
     // scalastyle:on
     assert(runTimes.nonEmpty)
-    val best = runTimes.min
-    val avg = runTimes.sum / runTimes.size
-    val stdev = if (runTimes.size > 1) {
-      math.sqrt(runTimes.map(time => (time - avg) * (time - avg)).sum / (runTimes.size - 1))
+    val bestRuntime = runTimes.min
+    val avgRuntime = runTimes.sum / runTimes.size
+    val stdevRunTime = if (runTimes.size > 1) {
+      math.sqrt(runTimes.map(time => (time - avgRuntime) *
+        (time - avgRuntime)).sum / (runTimes.size - 1))
     } else 0
-    Benchmark.Result(avg / 1000000.0,  best / 1000000.0, stdev / 1000000.0,
-      JVMMemoryParams(maxGcTime, maxGcCount, maxHeapMemory/ 1000000,
-        totalUsedMemory / (minIters*1000000), totalFreeMemory / (minIters*1000000)))
+    val maxGcCount = gcCounts.max
+    val stdevGcCount = if (gcCounts.size > 1) {
+      math.sqrt(gcCounts.map(gc => (gc - maxGcCount) *
+        (gc - maxGcCount)).sum / (gcCounts.size - 1))
+    } else 0
+    val avgGcCount = gcCounts.sum / minIters
+    val avgGcTime = gcTimes.sum / minIters
+    val maxGcTime = gcTimes.max
+    Benchmark.Result(avgRuntime / 1000000.0,  bestRuntime / 1000000.0, stdevRunTime / 1000000.0,
+      JVMMemoryParams(avgGcTime, avgGcCount, stdevGcCount, maxGcCount, maxGcTime))
   }
 }
 
 
 object Benchmark {
   case class Case(name: String, fn: ToolsTimer => Unit, numIters: Int)
-  case class JVMMemoryParams( maxGCTime: Long, maxGCCount: Long, maxHeapMemory: Long,
-      avgUsedHeapMemory: Double, avgFreeHeapMemory: Double)
+  case class JVMMemoryParams( avgGCTime:Double, avgGCCount:Double,
+      stdDevGCCount: Double, maxGCCount: Long, maxGcTime:Long)
   case class Result(avgMs: Double, bestMs: Double, stdevMs: Double,
       memoryParams: JVMMemoryParams)
 }
diff --git a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationBenchmark.scala b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationToolBenchmark.scala
similarity index 75%
rename from core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationBenchmark.scala
rename to core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationToolBenchmark.scala
index 17712739f..af1095f89 100644
--- a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationBenchmark.scala
+++ b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationToolBenchmark.scala
@@ -27,23 +27,28 @@ import com.nvidia.spark.rapids.tool.qualification.QualificationMain.mainInternal
  * 2. Write the benchmark code in the runBenchmark method passing relevant arguments
  * 3. Write benchmarked code inside
  */
-object QualificationBenchmark extends BenchmarkBase {
+object QualificationToolBenchmark extends BenchmarkBase {
   override def runBenchmarkSuite(iterations: Int,
     warmUpIterations: Int,
     outputFormat: String,
     mainArgs: Array[String]): Unit = {
-    runBenchmark("QualificationBenchmark") {
+    runBenchmark("Benchmark_Per_SQL_Arg_Qualification") {
       val benchmarker =
         new Benchmark(
-          "QualificationBenchmark",
-          2,
+          valuesPerIteration = 2,
           output = output,
           outputPerIteration = true,
           warmUpIterations = warmUpIterations,
           minNumIters = iterations)
-      benchmarker.addCase("QualificationBenchmark") { _ =>
+      benchmarker.addCase("Enable_Per_SQL_Arg_Qualification") { _ =>
+        val (prefix,suffix) = mainArgs.splitAt(mainArgs.length - 1)
+
+        mainInternal(new QualificationArgs(prefix :+ "--per-sql" :+ suffix.head),
+          enablePB = true)
+      }
+      benchmarker.addCase("Disable_Per_SQL_Arg_Qualification") { _ =>
         mainInternal(new QualificationArgs(mainArgs),
-          printStdout = true, enablePB = true)
+          enablePB = true)
       }
       benchmarker.run()
     }
diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala
index ff618191f..c796e413c 100644
--- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala
+++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/MemoryMetricsTracker.scala
@@ -29,17 +29,4 @@ class MemoryMetricsTracker {
     val (_, newGcTime:Long) = getCurrentGCMetrics
     newGcTime - startGCMetrics._2
   }
-
-  def getCurrentMaxHeapMemory: Long = {
-    ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getMax
-  }
-
-  def getCurrentUsedHeapMemory: Long = {
-    ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
-  }
-
-  def getCurrentFreeHeapMemory: Long = {
-    ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getMax -
-      ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
-  }
 }
diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/RuntimeUtil.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/RuntimeUtil.scala
index abe2b55cd..d39323c7a 100644
--- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/RuntimeUtil.scala
+++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/RuntimeUtil.scala
@@ -68,7 +68,7 @@ object RuntimeUtil extends Logging {
   def getJVMOSInfo: Map[String, String] = {
     Map(
       "jvm.name" -> System.getProperty("java.vm.name"),
-      "jvm.version" -> System.getProperty("java.vm.version"),
+      "jvm.version" -> System.getProperty("java.version"),
       "os.name" -> System.getProperty("os.name"),
       "os.version" -> System.getProperty("os.version")
     )

From 58a7d9f89e869a0ede0bb58661ac46aa26cf44eb Mon Sep 17 00:00:00 2001
From: Sayed Bilal Bari <sbari@nvidia.com>
Date: Fri, 5 Jul 2024 13:13:04 -0500
Subject: [PATCH 4/5] Output Formatting Changes

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>
---
 .../rapids/tool/benchmarks/Benchmark.scala    | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
index 6da9cc6b1..36abc3a6c 100644
--- a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
+++ b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
@@ -99,8 +99,14 @@ class Benchmark(
 
     val firstBest = results.head.bestMs
     // The results are going to be processor specific so it is useful to include that.
-    out.println(RuntimeUtil.getJVMOSInfo.mkString("\n"))
-    out.println(s"MaxHeapMemory -> ${Runtime.getRuntime.maxMemory()} \n")
+    val jvmInfo = RuntimeUtil.getJVMOSInfo
+    out.printf(s"%-26s ->   %-30s \n","JVM Name", jvmInfo("jvm.name"))
+    out.printf(s"%-26s ->   %-30s \n","Java Version",jvmInfo("jvm.version"))
+    out.printf(s"%-26s ->   %-30s \n","OS Name",jvmInfo("os.name"))
+    out.printf(s"%-26s ->   %-30s \n","OS Version",jvmInfo("os.version"))
+    out.printf(s"%-26s ->   %-30s \n","MaxHeapMemory", ("%6d" format Runtime.getRuntime.maxMemory()/1024/1024)+"MB")
+    out.printf(s"%-26s ->   %-30s \n","Total Warm Up Iterations","%2d" format warmUpIterations)
+    out.printf(s"%-26s ->   %-30s \n \n","Total Runtime Iterations","%2d" format minNumIters)
     val nameLen = Math.max(40, Math.max(name.length, benchmarks.map(_.name.length).max))
     out.printf(s"%-${nameLen}s %14s %14s %11s %20s %18s %18s %18s %18s %10s\n",
       name + ":", "Best Time(ms)", "Avg Time(ms)", "Stdev(ms)","Avg GC Time(ms)",
@@ -117,7 +123,7 @@ class Benchmark(
         "%5.0f" format result.memoryParams.stdDevGCCount,
         "%5d" format result.memoryParams.maxGcTime,
         "%5d" format result.memoryParams.maxGCCount,
-        "%3.1fX" format (firstBest / result.bestMs))
+        "%3.2fX" format (firstBest / result.bestMs))
     }
     out.println()
   }
@@ -163,12 +169,16 @@ class Benchmark(
     val stdevRunTime = if (runTimes.size > 1) {
       math.sqrt(runTimes.map(time => (time - avgRuntime) *
         (time - avgRuntime)).sum / (runTimes.size - 1))
-    } else 0
+    } else {
+      0
+    }
     val maxGcCount = gcCounts.max
     val stdevGcCount = if (gcCounts.size > 1) {
       math.sqrt(gcCounts.map(gc => (gc - maxGcCount) *
         (gc - maxGcCount)).sum / (gcCounts.size - 1))
-    } else 0
+    } else {
+      0
+    }
     val avgGcCount = gcCounts.sum / minIters
     val avgGcTime = gcTimes.sum / minIters
     val maxGcTime = gcTimes.max

From 94f7f935c0d3135ca96bd24e06504d13a0130b3b Mon Sep 17 00:00:00 2001
From: Sayed Bilal Bari <sbari@nvidia.com>
Date: Fri, 5 Jul 2024 13:59:59 -0500
Subject: [PATCH 5/5] Formatting + Making qual bench single threaded

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>
---
 .../spark/rapids/tool/benchmarks/Benchmark.scala   | 14 +++++++-------
 ...scala => SingleThreadedQualToolBenchmark.scala} | 10 +++++-----
 2 files changed, 12 insertions(+), 12 deletions(-)
 rename core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/{QualificationToolBenchmark.scala => SingleThreadedQualToolBenchmark.scala} (87%)

diff --git a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
index 36abc3a6c..40dca59a9 100644
--- a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
+++ b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/Benchmark.scala
@@ -100,13 +100,13 @@ class Benchmark(
     val firstBest = results.head.bestMs
     // The results are going to be processor specific so it is useful to include that.
     val jvmInfo = RuntimeUtil.getJVMOSInfo
-    out.printf(s"%-26s ->   %-30s \n","JVM Name", jvmInfo("jvm.name"))
-    out.printf(s"%-26s ->   %-30s \n","Java Version",jvmInfo("jvm.version"))
-    out.printf(s"%-26s ->   %-30s \n","OS Name",jvmInfo("os.name"))
-    out.printf(s"%-26s ->   %-30s \n","OS Version",jvmInfo("os.version"))
-    out.printf(s"%-26s ->   %-30s \n","MaxHeapMemory", ("%6d" format Runtime.getRuntime.maxMemory()/1024/1024)+"MB")
-    out.printf(s"%-26s ->   %-30s \n","Total Warm Up Iterations","%2d" format warmUpIterations)
-    out.printf(s"%-26s ->   %-30s \n \n","Total Runtime Iterations","%2d" format minNumIters)
+    out.printf(s"%-26s :   %s \n","JVM Name", jvmInfo("jvm.name"))
+    out.printf(s"%-26s :   %s \n","Java Version", jvmInfo("jvm.version"))
+    out.printf(s"%-26s :   %s \n","OS Name", jvmInfo("os.name"))
+    out.printf(s"%-26s :   %s \n","OS Version", jvmInfo("os.version"))
+    out.printf(s"%-26s :   %s MB \n","MaxHeapMemory", (Runtime.getRuntime.maxMemory()/1024/1024).toString)
+    out.printf(s"%-26s :   %s \n","Total Warm Up Iterations", warmUpIterations.toString)
+    out.printf(s"%-26s :   %s \n \n","Total Runtime Iterations", minNumIters.toString)
     val nameLen = Math.max(40, Math.max(name.length, benchmarks.map(_.name.length).max))
     out.printf(s"%-${nameLen}s %14s %14s %11s %20s %18s %18s %18s %18s %10s\n",
       name + ":", "Best Time(ms)", "Avg Time(ms)", "Stdev(ms)","Avg GC Time(ms)",
diff --git a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationToolBenchmark.scala b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/SingleThreadedQualToolBenchmark.scala
similarity index 87%
rename from core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationToolBenchmark.scala
rename to core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/SingleThreadedQualToolBenchmark.scala
index af1095f89..12b3f3c6d 100644
--- a/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/QualificationToolBenchmark.scala
+++ b/core/src/main/scala/org/apache/spark/rapids/tool/benchmarks/SingleThreadedQualToolBenchmark.scala
@@ -27,7 +27,7 @@ import com.nvidia.spark.rapids.tool.qualification.QualificationMain.mainInternal
  * 2. Write the benchmark code in the runBenchmark method passing relevant arguments
  * 3. Write benchmarked code inside
  */
-object QualificationToolBenchmark extends BenchmarkBase {
+object SingleThreadedQualToolBenchmark extends BenchmarkBase {
   override def runBenchmarkSuite(iterations: Int,
     warmUpIterations: Int,
     outputFormat: String,
@@ -40,14 +40,14 @@ object QualificationToolBenchmark extends BenchmarkBase {
           outputPerIteration = true,
           warmUpIterations = warmUpIterations,
           minNumIters = iterations)
+      val (prefix,suffix) = mainArgs.splitAt(mainArgs.length - 1)
       benchmarker.addCase("Enable_Per_SQL_Arg_Qualification") { _ =>
-        val (prefix,suffix) = mainArgs.splitAt(mainArgs.length - 1)
-
-        mainInternal(new QualificationArgs(prefix :+ "--per-sql" :+ suffix.head),
+        mainInternal(new QualificationArgs(prefix :+ "--per-sql" :+ "--num-threads"
+          :+ "1" :+ suffix.head),
           enablePB = true)
       }
       benchmarker.addCase("Disable_Per_SQL_Arg_Qualification") { _ =>
-        mainInternal(new QualificationArgs(mainArgs),
+        mainInternal(new QualificationArgs(prefix :+ "--num-threads" :+ "1" :+ suffix.head),
           enablePB = true)
       }
       benchmarker.run()