diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml
index b9b9e11c9..12bcb9d79 100644
--- a/.github/workflows/mvn-verify-check.yml
+++ b/.github/workflows/mvn-verify-check.yml
@@ -22,7 +22,9 @@ on:
jobs:
build:
runs-on: ubuntu-latest
-
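+    # run mvn verify against multiple Spark versions (selected via -Dbuildver below)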
+ strategy:
+ matrix:
+ spark-version: ['311', '320', '330', '341']
steps:
- uses: actions/checkout@v3
@@ -32,5 +34,5 @@ jobs:
distribution: adopt
java-version: 8
- - name: Run mvn verify
- run: cd core && mvn verify
+ - name: Run mvn verify with Spark ${{ matrix.spark-version }}
+ run: cd core && mvn -Dbuildver=${{ matrix.spark-version }} verify
diff --git a/.github/workflows/python-unit-test.yml b/.github/workflows/python-unit-test.yml
index e10911978..f29f3caea 100644
--- a/.github/workflows/python-unit-test.yml
+++ b/.github/workflows/python-unit-test.yml
@@ -22,20 +22,23 @@ on:
jobs:
build:
runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ['3.8', '3.9', '3.10']
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v3
- - name: Set up Python
- uses: actions/setup-python@v2
- with:
- python-version: '3.8'
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
- - name: Install tox
- run: |
- python -m pip install --upgrade pip
- python -m pip install tox
- python -m pip install --pre tox-gh-actions
+ - name: Install tox
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install tox
+ python -m pip install --pre tox-gh-actions
- - name: Run tox test
- run: cd user_tools && tox -e pylint,flake8,python3.8 -- tests/test_diagnostic.py
+ - name: Run tox test
+ run: cd user_tools && tox
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e6b776e4f..002bf95ba 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -16,7 +16,9 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
+ with:
+ token: ${{ secrets.NVAUTO_TOKEN }}
- name: Build Changelog
id: build_changelog
@@ -25,7 +27,7 @@ jobs:
configuration: ".github/workflows/configuration.json" # Configuration file for the changelog builder (optional)z
outputFile: "CHANGELOG_BODY.md"
env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ GITHUB_TOKEN: ${{ secrets.NVAUTO_TOKEN }}
- name: Commit and Push Changelog
if: steps.build_changelog.outputs.changes > 0
@@ -38,15 +40,15 @@ jobs:
cat CURRENT_CHANGELOG.md CHANGELOG_BODY.md >> TEMP_CHANGELOG.md
cat TEMP_CHANGELOG.md CHANGELOG.md > NEW_CHANGELOG.md
- git config user.name "GitHub Actions"
- git config user.email "actions@github.com"
+ git config user.name ${{ secrets.NVAUTO_USER }}
+ git config user.email ${{ secrets.NVAUTO_EMAIL }}
git fetch origin main
git checkout main
mv NEW_CHANGELOG.md CHANGELOG.md
git add CHANGELOG.md
- git commit -m "Update changelogs"
- git push origin main
+ git commit -s -m "Update changelogs"
+ git push -f https://nvauto:${{ secrets.NVAUTO_TOKEN }}@github.com/${GITHUB_REPOSITORY}.git main
- name: Set Version Number
id: set_version
diff --git a/core/pom.xml b/core/pom.xml
index 14af8997a..0a93b6cea 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -23,7 +23,7 @@
rapids-4-spark-tools_2.12
RAPIDS Accelerator for Apache Spark tools
RAPIDS Accelerator for Apache Spark tools
- 23.06.4-SNAPSHOT
+ 23.08.0-SNAPSHOT
jar
http://github.com/NVIDIA/spark-rapids-tools
@@ -327,7 +327,7 @@
3.3.2
3.3.3-SNAPSHOT
3.4.0
- 3.4.1-SNAPSHOT
+ 3.4.1
3.5.0-SNAPSHOT
2.12
4.3.0
@@ -361,7 +361,7 @@
2.1.1
2.2.0
2.3.0
- 2.4.0rc1
+ 2.4.0
${delta10x.version}
1.8
3.11.0
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala
index 8af8dacb2..6a974cf21 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala
@@ -763,8 +763,6 @@ class AutoTuner(
&& appInfoProvider.getRedundantReadSize > DEF_READ_SIZE_THRESHOLD) {
appendRecommendation("spark.rapids.filecache.enabled", "true")
appendComment("Enable file cache only if Spark local disks bandwidth is > 1 GB/s")
- } else {
- null
}
}
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala
index e20affa40..59c55a58e 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala
@@ -1154,7 +1154,16 @@ class QualificationSuite extends BaseTestSuite {
try {
val lines = inputSource.getLines.toSeq
// 1 for header, 1 for values
- assert(lines.size == 6)
+
+ val expLinesSize =
+ if (ToolUtils.isSpark340OrLater()) {
+ 8
+ } else if (!ToolUtils.isSpark320OrLater()) {
+ 6
+ } else {
+ 7
+ }
+ assert(lines.size == expLinesSize)
assert(lines.head.contains("App ID,Unsupported Type,"))
assert(lines(1).contains("\"Read\",\"JSON\",\"Types not supported - bigint:int\""))
} finally {
diff --git a/user_tools/custom_speedup_factors/README.md b/user_tools/custom_speedup_factors/README.md
index 118c81099..d2f3afa2d 100644
--- a/user_tools/custom_speedup_factors/README.md
+++ b/user_tools/custom_speedup_factors/README.md
@@ -24,16 +24,16 @@ spark_rapids_user_tools onprem profiling --csv --eventlogs CPU-3k --local_folder
spark_rapids_user_tools onprem profiling --csv --eventlogs GPU-3k --local_folder GPU-3k-profile
```
3. Speedup factor generation
- 1. Run the speedup factor generation script, passing the CPU and GPU profiler output.
+ 1. Run the speedup factor generation script, passing the CPU and GPU profiler output along with a CSV output filename.
```
-python generate_speedup_factors.py --cpu CPU-3k-profile/rapids_4_spark_profile --gpu GPU-3k-profile/rapids_4_spark_profile
+python generate_speedup_factors.py --cpu CPU-3k-profile/rapids_4_spark_profile --gpu GPU-3k-profile/rapids_4_spark_profile --output newScores.csv
```
-The output will showcase what operators were detected in the benchmarks to be used as custom speedups. You can then update values from the default [operatorsScore.csv](https://github.com/NVIDIA/spark-rapids-tools/blob/dev/core/src/main/resources/operatorsScore.csv) file to create your own version with the custom speedup factors generated by the output.
+The script writes the new speedup factor scores to the CSV file specified by the `--output` argument.
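+
+The generated file uses the same two-column layout as the bundled `defaultScores.csv`: a `CPUOperator,Score` header followed by one row per operator. As a rough illustration of the expected shape (the operator names are real, but the values below are placeholders rather than measured results):
+```
+CPUOperator,Score
+FilterExec,2.4
+ShuffleExchangeExec,3.1
+HashAggregateExec,4.2
+```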
## Running Workload Qualification with Custom Speedup Factors
Now that you have a custom *operatorsScore.csv* file, you can run the Spark RAPIDS qualification tool using it to get estimations applicable for your environment. Here is the command to run with a custom speedup factor file:
```
-spark_rapids_user_tools onprem qualification --speedup-factor-file operatorsScore.csv --eventlogs
+spark_rapids_user_tools onprem qualification --speedup-factor-file newScores.csv --eventlogs
```
diff --git a/user_tools/custom_speedup_factors/defaultScores.csv b/user_tools/custom_speedup_factors/defaultScores.csv
new file mode 100644
index 000000000..b958090b8
--- /dev/null
+++ b/user_tools/custom_speedup_factors/defaultScores.csv
@@ -0,0 +1,18 @@
+CPUOperator,Score
+AggregateInPandasExec,1.2
+ArrowEvalPythonExec,1.2
+FlatMapGroupsInPandasExec,1.2
+MapInPandasExec,1.2
+WindowInPandasExec,1.2
+KMeans-pyspark,8.86
+KMeans-scala,1
+PCA-pyspark,2.24
+PCA-scala,2.69
+LinearRegression-pyspark,2
+LinearRegression-scala,1
+RandomForestClassifier-pyspark,6.31
+RandomForestClassifier-scala,1
+RandomForestRegressor-pyspark,3.66
+RandomForestRegressor-scala,1
+XGBoost-pyspark,1
+XGBoost-scala,3.31
diff --git a/user_tools/custom_speedup_factors/generate_speedup_factors.py b/user_tools/custom_speedup_factors/generate_speedup_factors.py
index 5224c588a..8ce04e566 100644
--- a/user_tools/custom_speedup_factors/generate_speedup_factors.py
+++ b/user_tools/custom_speedup_factors/generate_speedup_factors.py
@@ -27,12 +27,14 @@
parser = argparse.ArgumentParser(description="Speedup Factor Analysis")
parser.add_argument("--cpu", type=str, help="Directory of CPU profiler logs", required=True)
parser.add_argument("--gpu", type=str, help="Directory of GPU profiler logs", required=True)
+parser.add_argument("--output", type=str, help="Filename for custom speedup factors", required=True)
parser.add_argument("--verbose", action="store_true", help="flag to generate full verbose output for logging raw node results")
parser.add_argument("--chdir", action="store_true", help="flag to change to work dir that's the script located")
args = parser.parse_args()
cpu_dir = args.cpu
gpu_dir = args.gpu
+output = args.output
verbose = args.verbose
cpu_stage_log = {}
@@ -55,13 +57,26 @@
mapping_info = mapping_info.groupby(['SQL Node'])['Child Node'].apply(','.join).reset_index()
# - process sql_plan_metrics_for_application.csv
- # - load in "duration" (CPU) or "op time" (GPU)
+ # - load in "duration" (CPU)
# - replace WholeStageCodegen (CPU only) with list of operators from mapping lookup file
# - mapping_info.parent = sql_times.nodeName
cpu_sql_info = pd.read_csv(cpu_dir + "/" + app + "/sql_plan_metrics_for_application.csv")
cpu_sql_times = cpu_sql_info[cpu_sql_info["name"] == "duration"]
cpu_sql_combined = cpu_sql_times.set_index('nodeName').join(mapping_info.set_index('SQL Node'), how='left')
+ # - parse WholeStageCodegen durations with child node mapping
+ cpu_sql_times_df = cpu_sql_combined[['Child Node', 'max_value']]
+
+ for index, row in cpu_sql_times_df.iterrows():
+ operators = str(row['Child Node']).split(',')
+ duration = row['max_value']/len(operators)/1000.0
+ for operator in operators:
+ if operator in cpu_stage_log[app_name]:
+ cpu_stage_log[app_name][operator] = cpu_stage_log[app_name][operator] + duration
+ else:
+ cpu_stage_log[app_name][operator] = duration
+
+ # - parse top-level execs from sql_to_stage_information.csv
cpu_stage_info = pd.read_csv(cpu_dir + "/" + app + "/sql_to_stage_information.csv")
cpu_stage_times = cpu_stage_info[['Stage Duration', 'SQL Nodes(IDs)']]
@@ -92,12 +107,8 @@
app_name = app_info.loc[0]["appName"]
gpu_stage_log[app_name] = {}
- # - process sql_plan_metrics_for_application.csv
- # - load in "duration" (CPU) or "op time" (GPU)
- # - mapping_info.parent = sql_times.nodeName
- gpu_sql_info = pd.read_csv(gpu_dir + "/" + app + "/sql_plan_metrics_for_application.csv")
- gpu_sql_times = gpu_sql_info[gpu_sql_info["name"] == "op time"]
-
+ # - process sql_to_stage_information.csv to get stage durations
+ # - split up duration by operators listed in each stage
gpu_stage_info = pd.read_csv(gpu_dir + "/" + app + "/sql_to_stage_information.csv")
gpu_stage_times = gpu_stage_info[['Stage Duration', 'SQL Nodes(IDs)']]
@@ -111,41 +122,76 @@
else:
gpu_stage_log[app_name][op_key] = duration
-# Sum up SQL operators for each
-stage_totals = {}
+cpu_stage_totals = {}
+gpu_stage_totals = {}
cpu_stage_total = 0.0
gpu_stage_total = 0.0
+# Sum up SQL operators for each operator found in CPU and GPU
for app_key in cpu_stage_log:
for op_key in cpu_stage_log[app_key]:
- if op_key not in stage_totals:
- stage_totals[op_key] = cpu_stage_log[app_key][op_key]
+ if op_key not in cpu_stage_totals:
+ cpu_stage_totals[op_key] = cpu_stage_log[app_key][op_key]
else:
- stage_totals[op_key] = stage_totals[op_key] + cpu_stage_log[app_key][op_key]
+ cpu_stage_totals[op_key] = cpu_stage_totals[op_key] + cpu_stage_log[app_key][op_key]
cpu_stage_total = cpu_stage_total + cpu_stage_log[app_key][op_key]
for app_key in gpu_stage_log:
for op_key in gpu_stage_log[app_key]:
- if op_key not in stage_totals:
- stage_totals[op_key] = gpu_stage_log[app_key][op_key]
+ if op_key not in gpu_stage_totals:
+ gpu_stage_totals[op_key] = gpu_stage_log[app_key][op_key]
else:
- stage_totals[op_key] = stage_totals[op_key] + gpu_stage_log[app_key][op_key]
+ gpu_stage_totals[op_key] = gpu_stage_totals[op_key] + gpu_stage_log[app_key][op_key]
gpu_stage_total = gpu_stage_total + gpu_stage_log[app_key][op_key]
+# Create dictionary of execs where speedup factors can be calculated
+scores_dict = {}
+
+if 'Filter' in cpu_stage_totals and 'GpuFilter' in gpu_stage_totals:
+ scores_dict["FilterExec"] = str(round(cpu_stage_totals['Filter'] / gpu_stage_totals['GpuFilter'], 2))
+if 'SortMergeJoin' in cpu_stage_totals and 'GpuShuffledHashJoin' in gpu_stage_totals:
+ scores_dict["SortExec"] = str(round(cpu_stage_totals['SortMergeJoin'] / gpu_stage_totals['GpuShuffledHashJoin'], 2))
+if 'BroadcastHashJoin' in cpu_stage_totals and 'GpuBroadcastHashJoin' in gpu_stage_totals:
+ scores_dict["BroadcastHashJoinExec"] = str(round(cpu_stage_totals['BroadcastHashJoin'] / gpu_stage_totals['GpuBroadcastHashJoin'], 2))
+if 'Exchange' in cpu_stage_totals and 'GpuColumnarExchange' in gpu_stage_totals:
+ scores_dict["ShuffleExchangeExec"] = str(round(cpu_stage_totals['Exchange'] / gpu_stage_totals['GpuColumnarExchange'], 2))
+if 'HashAggregate' in cpu_stage_totals and 'GpuHashAggregate' in gpu_stage_totals:
+ scores_dict["HashAggregateExec"] = str(round(cpu_stage_totals['HashAggregate'] / gpu_stage_totals['GpuHashAggregate'], 2))
+if all(cpu_keys in cpu_stage_totals for cpu_keys in ('SortMergeJoin', 'Sort' )) and all(gpu_keys in gpu_stage_totals for gpu_keys in ('GpuShuffledHashJoin', 'GpuSort')):
+ scores_dict["SortMergeJoinExec"] = str(round((cpu_stage_totals['SortMergeJoin'] + cpu_stage_totals['Sort']) / (gpu_stage_totals['GpuShuffledHashJoin'] + gpu_stage_totals['GpuSort']), 2))
+
+overall_speedup = str(round(cpu_stage_total/gpu_stage_total, 2))
+
# Print out node metrics (if verbose)
if verbose:
- print("# Operator metrics ")
- for key in stage_totals:
- print(key + "," + str(stage_totals[key]))
- print("CPU Total," + str(cpu_stage_total))
- print("GPU Total," + str(gpu_stage_total))
-
-# Print out speedup factors
-print("# Speedup Factors ")
-print("FilterExec," + str(round(stage_totals['Filter'] / stage_totals['GpuFilter'], 2)))
-print("SortExec," + str(round(stage_totals['SortMergeJoin'] / stage_totals['GpuShuffledHashJoin'], 2)))
-print("BroadcastHashJoinExec," + str(round(stage_totals['BroadcastHashJoin'] / stage_totals['GpuBroadcastHashJoin'], 2)))
-print("ShuffleExchangeExec," + str(round(stage_totals['Exchange'] / stage_totals['GpuColumnarExchange'], 2)))
-print("HashAggregateExec," + str(round(stage_totals['HashAggregate'] / stage_totals['GpuHashAggregate'], 2)))
-print("SortMergeJoinExec," + str(round((stage_totals['SortMergeJoin']+stage_totals['Sort']) / (stage_totals['GpuShuffledHashJoin']+stage_totals['GpuSort']), 2)))
+ print("# CPU Operator Metrics")
+ for key in cpu_stage_totals:
+ print(key + " = " + str(cpu_stage_totals[key]))
+ print("# GPU Operator Metrics")
+ for key in gpu_stage_totals:
+ print(key + " = " + str(gpu_stage_totals[key]))
+ print("# Summary Metrics")
+ print("CPU Total = " + str(cpu_stage_total))
+ print("GPU Total = " + str(gpu_stage_total))
+ print("Overall speedup = " + overall_speedup)
+
+ # Print out individual exec speedup factors
+ print("# Speedup Factors ")
+ for key in scores_dict:
+ print(f"{key} = {scores_dict[key]}")
+
+# Load in list of operators and set initial values to default speedup
+scores_df = pd.read_csv("operatorsList.csv")
+scores_df["Score"] = overall_speedup
+
+# Update operators that are found in benchmark
+for key in scores_dict:
+ scores_df.loc[scores_df['CPUOperator'] == key, 'Score'] = scores_dict[key]
+
+# Add in hard-coded defaults
+defaults_df = pd.read_csv("defaultScores.csv")
+
+# Generate output CSV file
+final_df = pd.concat([scores_df, defaults_df])
+final_df.to_csv(output, index=False)
diff --git a/user_tools/custom_speedup_factors/operatorsList.csv b/user_tools/custom_speedup_factors/operatorsList.csv
new file mode 100644
index 000000000..9e04f5c67
--- /dev/null
+++ b/user_tools/custom_speedup_factors/operatorsList.csv
@@ -0,0 +1,247 @@
+CPUOperator
+CoalesceExec
+CollectLimitExec
+ExpandExec
+FileSourceScanExec
+FilterExec
+GenerateExec
+GlobalLimitExec
+LocalLimitExec
+ProjectExec
+RangeExec
+SampleExec
+SortExec
+SubqueryBroadcastExec
+TakeOrderedAndProjectExec
+UnionExec
+CustomShuffleReaderExec
+HashAggregateExec
+ObjectHashAggregateExec
+SortAggregateExec
+InMemoryTableScanExec
+DataWritingCommandExec
+ExecutedCommandExec
+BatchScanExec
+BroadcastExchangeExec
+ShuffleExchangeExec
+BroadcastHashJoinExec
+BroadcastNestedLoopJoinExec
+CartesianProductExec
+ShuffledHashJoinExec
+SortMergeJoinExec
+FlatMapCoGroupsInPandasExec
+WindowExec
+HiveTableScanExec
+Abs
+Acos
+Acosh
+Add
+AggregateExpression
+Alias
+And
+ApproximatePercentile
+ArrayContains
+ArrayExcept
+ArrayExists
+ArrayIntersect
+ArrayMax
+ArrayMin
+ArrayRemove
+ArrayRepeat
+ArrayTransform
+ArrayUnion
+ArraysOverlap
+ArraysZip
+Asin
+Asinh
+AtLeastNNonNulls
+Atan
+Atanh
+AttributeReference
+Average
+BRound
+BitLength
+BitwiseAnd
+BitwiseNot
+BitwiseOr
+BitwiseXor
+CaseWhen
+Cbrt
+Ceil
+CheckOverflow
+Coalesce
+CollectList
+CollectSet
+Concat
+ConcatWs
+Contains
+Cos
+Cosh
+Cot
+Count
+CreateArray
+CreateMap
+CreateNamedStruct
+CurrentRow$
+DateAdd
+DateAddInterval
+DateDiff
+DateFormatClass
+DateSub
+DayOfMonth
+DayOfWeek
+DayOfYear
+DenseRank
+Divide
+ElementAt
+EndsWith
+EqualNullSafe
+EqualTo
+Exp
+Explode
+Expm1
+First
+Floor
+FromUTCTimestamp
+FromUnixTime
+GetArrayItem
+GetArrayStructFields
+GetJsonObject
+GetMapValue
+GetStructField
+GetTimestamp
+GreaterThan
+GreaterThanOrEqual
+Greatest
+HiveGenericUDF
+HiveSimpleUDF
+Hour
+Hypot
+If
+In
+InSet
+InitCap
+InputFileBlockLength
+InputFileBlockStart
+InputFileName
+IntegralDivide
+IsNaN
+IsNotNull
+IsNull
+KnownFloatingPointNormalized
+KnownNotNull
+Lag
+LambdaFunction
+Last
+LastDay
+Lead
+Least
+Length
+LessThan
+LessThanOrEqual
+Like
+Literal
+Log
+Log10
+Log1p
+Log2
+Logarithm
+Lower
+MakeDecimal
+MapConcat
+MapEntries
+MapFilter
+MapKeys
+MapValues
+Max
+Md5
+Min
+Minute
+MonotonicallyIncreasingID
+Month
+Multiply
+Murmur3Hash
+NaNvl
+NamedLambdaVariable
+NormalizeNaNAndZero
+Not
+NthValue
+OctetLength
+Or
+PercentRank
+PivotFirst
+Pmod
+PosExplode
+Pow
+PreciseTimestampConversion
+PromotePrecision
+PythonUDF
+Quarter
+RLike
+RaiseError
+Rand
+Rank
+RegExpExtract
+RegExpExtractAll
+RegExpReplace
+Remainder
+ReplicateRows
+Reverse
+Rint
+Round
+RowNumber
+ScalaUDF
+ScalarSubquery
+Second
+Sequence
+ShiftLeft
+ShiftRight
+ShiftRightUnsigned
+Signum
+Sin
+Sinh
+Size
+SortArray
+SortOrder
+SparkPartitionID
+SpecifiedWindowFrame
+Sqrt
+StartsWith
+StddevPop
+StddevSamp
+StringInstr
+StringLPad
+StringLocate
+StringRPad
+StringRepeat
+StringReplace
+StringSplit
+StringToMap
+StringTrim
+StringTrimLeft
+StringTrimRight
+Substring
+SubstringIndex
+Subtract
+Sum
+Tan
+Tanh
+TimeAdd
+ToDegrees
+ToRadians
+ToUnixTimestamp
+TransformKeys
+TransformValues
+UnaryMinus
+UnaryPositive
+UnboundedFollowing$
+UnboundedPreceding$
+UnixTimestamp
+UnscaledValue
+Upper
+VariancePop
+VarianceSamp
+WeekDay
+WindowExpression
+WindowSpecDefinition
+Year
diff --git a/user_tools/docs/index.md b/user_tools/docs/index.md
index d37d95ead..48481b004 100644
--- a/user_tools/docs/index.md
+++ b/user_tools/docs/index.md
@@ -74,7 +74,8 @@ The following table summarizes the commands supported for each cloud platform:
| EMR | qualification | spark_rapids_user_tools \ | 23.02+ |
| | | emr qualification [ARGS] | |
| +---------------+-----------------------------------------+----------+
-| | profiling | N/A | TBD |
+| | profiling | spark_rapids_user_tools \ | 23.08+ |
+| | | emr profiling [ARGS] | |
| +---------------+-----------------------------------------+----------+
| | bootstrap | spark_rapids_user_tools \ | 23.02+ |
| | | emr bootstrap [ARGS] | |
diff --git a/user_tools/docs/user-tools-aws-emr.md b/user_tools/docs/user-tools-aws-emr.md
index d65f6fe91..da464f591 100644
--- a/user_tools/docs/user-tools-aws-emr.md
+++ b/user_tools/docs/user-tools-aws-emr.md
@@ -161,6 +161,148 @@ The command creates a directory with UUID that contains the following:
└── ui
```
+## Profiling command
+
+### Local deployment
+
+```
+spark_rapids_user_tools emr profiling [options]
+spark_rapids_user_tools emr profiling -- --help
+```
+
+The local deployment runs on the local development machine. It requires:
+1. Installing and configuring the AWS CLI
+2. Java 1.8+ development environment
+3. Internet access to download JAR dependencies from mvn: `spark-*.jar`, `hadoop-aws-*.jar`, and `aws-java-sdk-bundle*.jar`
+4. Dependencies are cached on the local disk to reduce the overhead of the download.
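+
+A quick way to sanity-check these prerequisites before running the command (the reported versions will vary per environment):
+
+```
+# verify the AWS CLI is installed and a profile/credentials are configured
+aws --version
+aws sts get-caller-identity
+# verify a Java 1.8+ runtime is available on the PATH
+java -version
+```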
+
+#### Command options
+
+| Option | Description | Default | Required |
+|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
+| **gpu_cluster**      | The EMR cluster on which the Spark applications were executed. The argument can be an EMR cluster name or a valid path to the cluster's properties file (json format) generated by the AWS CLI command `aws emr describe-cluster` | If missing, then the argument worker_info has to be provided.                                                                                                                                                                     | N        |
+| **worker_info**      | A path pointing to a yaml file containing the system information of a worker node. It is assumed that all workers are homogeneous. The format of the file is described in the following section.                                  | None                                                                                                                                                                                                                              | N        |
+| **eventlogs**        | A comma-separated list of S3 URLs pointing to event logs or an S3 directory                                                                                                                                                        | Reads the Spark property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included in the output of `aws emr describe-cluster`. Note that the wrapper will raise an exception if the property is not set.   | N        |
+| **remote_folder**    | The S3 folder where the wrapper's output is copied. If missing, the output will only be available on the local disk                                                                                                               | N/A                                                                                                                                                                                                                               | N        |
+| **local_folder** | Local work-directory path to store the output and to be used as root directory for temporary folders/files. The final output will go into a subdirectory named `prof-${EXEC_ID}` where `exec_id` is an auto-generated unique identifier of the execution. | If the argument is NONE, the default value is the env variable `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY` if any; or the current working directory. | N |
+| **profile** | A named AWS profile to get the settings/credentials of the AWS account. | "DEFAULT" | N |
+| **jvm_heap_size** | The maximum heap size of the JVM in gigabytes | 24 | N |
+| **tools_jar**        | Path to a bundled jar including the RAPIDS tools. The path can be a local filesystem path or a remote S3 URL                                                                                                                       | Downloads the latest `rapids-4-spark-tools_*.jar` from the mvn repo                                                                                                                                                               | N        |
+| **verbose** | True or False to enable verbosity to the wrapper script | False if `RAPIDS_USER_TOOLS_LOG_DEBUG` is not set | N |
+| **rapids_options**** | A list of valid [Profiling tool options](../../core/docs/spark-profiling-tool.md#qualification-tool-options). Note that (`output-directory`, `auto-tuner`, `combined`) flags are ignored | N/A | N |
+
+If the CLI is not given a `gpu_cluster` argument, then a valid path to a yaml file must be
+provided through the `worker_info` argument.
+The `worker_info` file is a yaml file that contains the HW description of the workers. It must contain
+the following properties:
+- `system.numCores`: number of cores of a single worker node
+- `system.memory`: RAM size in MiB of a single node
+- `system.numWorkers`: number of workers
+- `gpu.name`: the accelerator installed on the worker node
+- `gpu.memory`: memory size of the accelerator in MiB (e.g., 16 GB for an NVIDIA T4)
+- `softwareProperties`: Spark default-configurations of the target cluster
+
+An example of valid `worker_info.yaml`:
+
+ ```
+ system:
+ numCores: 32
+ memory: 212992MiB
+ numWorkers: 5
+ gpu:
+ memory: 15109MiB
+ count: 4
+ name: T4
+ softwareProperties:
+ spark.driver.maxResultSize: 7680m
+ spark.driver.memory: 15360m
+ spark.executor.cores: '8'
+ spark.executor.instances: '2'
+ spark.executor.memory: 47222m
+ spark.executorEnv.OPENBLAS_NUM_THREADS: '1'
+ spark.scheduler.mode: FAIR
+ spark.sql.cbo.enabled: 'true'
+ spark.ui.port: '0'
+ spark.yarn.am.memory: 640m
+ ```
+
+#### Use case scenario
+
+A typical workflow to successfully run the `profiling` command in local mode is described as follows:
+
+1. Store the Apache Spark event logs in an S3 folder.
+2. The user sets up their development machine:
+    1. configures Java
+    2. installs the AWS CLI and configures the profile and the credentials so that the AWS CLI
+       commands can access the S3 resources `LOGS_BUCKET`.
+    3. installs `spark_rapids_user_tools`
+3. If the results of the wrapper need to be stored on S3, then another S3 URI is required, e.g. `REMOTE_FOLDER=s3://OUT_BUCKET/`
+4. Depending on the accessibility of the cluster properties, the user chooses one of the two cases below (_"Case-A"_ and _"Case-B"_) to trigger the CLI.
+
+For each successful execution, the wrapper generates a new directory in the format of
+`prof__<0x%08X>`. The directory contains `profiling_summary.log` in addition to
+the actual output folder of the RAPIDS Profiling tool. The directory is mirrored to the S3 folder if the
+argument `--remote_folder` is a valid S3 path.
+
+ ```
+ ./prof__<0x%08X>/profiling_summary.log
+ ./prof__<0x%08X>/rapids_4_spark_profile/
+ ```
+
+**Case-A: A gpu-cluster property file is accessible:**
+
+The cluster properties are considered accessible if one of the following conditions applies:
+
+1. The cluster is listed by the `aws emr list-clusters` command. In this case, the CLI is triggered by providing
+ `--gpu_cluster $CLUSTER_NAME`
+
+ ```
+ # run the command using the GPU cluster name
+ export RAPIDS_USER_TOOLS_CACHE_FOLDER=my_cache_folder
+ export EVENTLOGS=s3://LOGS_BUCKET/eventlogs/
+ export CLUSTER_NAME=my-emr-gpu-cluster
+ export REMOTE_FOLDER=s3://OUT_BUCKET/wrapper_output
+
+ spark_rapids_user_tools emr profiling \
+ --eventlogs $EVENTLOGS \
+ --gpu_cluster $CLUSTER_NAME \
+ --remote_folder $REMOTE_FOLDER
+ ```
+2. The cluster properties file is accessible on local disk or a valid S3 path.
+
+ ```
+ $> export CLUSTER_PROPS_FILE=cluster-props.json
+ $> aws emr describe-cluster --cluster-id $(aws emr list-clusters --query 'Clusters[?Name==$CLUSTER_NAME].Id' --output text) > $CLUSTER_PROPS_FILE
+ ```
+ Trigger the CLI by providing the path to the properties file `--gpu_cluster $CLUSTER_PROPS_FILE`
+
+ ```
+ $> spark_rapids_user_tools emr profiling \
+ --eventlogs $EVENTLOGS \
+ --gpu_cluster $CLUSTER_PROPS_FILE \
+ --remote_folder $REMOTE_FOLDER
+ ```
+
+**Case-B: GPU cluster information is missing:**
+
+In this case, users can write a simple yaml file that describes the shape of the worker nodes.
+This is relevant for the following scenarios:
+1. Users who might want to experiment with different configurations before deciding on the final
+ cluster shape.
+2. Users who have no access to the properties of the cluster.
+
+The CLI is triggered by providing the location where the yaml file is stored `--worker_info $WORKER_INFO_PATH`
+
+ ```
+ # First, create a yaml file as described in previous section
+ $> export WORKER_INFO_PATH=worker-info.yaml
+ # Run the profiling cmd
+ $> spark_rapids_user_tools emr profiling \
+ --eventlogs $EVENTLOGS \
+ --worker_info $WORKER_INFO_PATH \
+ --remote_folder $REMOTE_FOLDER
+ ```
+
## Bootstrap command
```
diff --git a/user_tools/src/spark_rapids_pytools/__init__.py b/user_tools/src/spark_rapids_pytools/__init__.py
index f678a18d6..ebfe23cb0 100644
--- a/user_tools/src/spark_rapids_pytools/__init__.py
+++ b/user_tools/src/spark_rapids_pytools/__init__.py
@@ -16,5 +16,5 @@
from spark_rapids_pytools.build import get_version
-VERSION = '23.06.4'
+VERSION = '23.08.0'
__version__ = get_version(VERSION)
diff --git a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py
index 082ff4738..c390b9d88 100644
--- a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py
+++ b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py
@@ -21,6 +21,7 @@
from spark_rapids_pytools.rapids.diagnostic import Diagnostic
from spark_rapids_pytools.rapids.qualification import QualFilterApp, QualificationAsLocal, \
QualGpuClusterReshapeType
+from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal
class CliEmrLocalMode: # pylint: disable=too-few-public-methods
@@ -119,6 +120,77 @@ def qualification(cpu_cluster: str = None,
wrapper_options=wrapper_qual_options,
rapids_options=rapids_options).launch()
+ @staticmethod
+ def profiling(gpu_cluster: str = None,
+ worker_info: str = None,
+ eventlogs: str = None,
+ profile: str = None,
+ local_folder: str = None,
+ remote_folder: str = None,
+ tools_jar: str = None,
+ jvm_heap_size: int = 24,
+ verbose: bool = False,
+ **rapids_options) -> None:
+ """
+        The Profiling tool analyzes both CPU and GPU generated event logs and generates information
+ which can be used for debugging and profiling Apache Spark applications.
+
+ :param gpu_cluster: The EMR-cluster on which the Spark applications were executed. The argument
+            can be an EMR cluster name or a valid path to the cluster's properties file (json format)
+ generated by the AWS CLI. If missing, then the argument worker_info has to be provided.
+ :param worker_info: A path pointing to a yaml file containing the system information of a
+            worker node. It is assumed that all workers are homogeneous.
+ If missing, the wrapper pulls the worker info from the "gpu_cluster".
+ :param eventlogs: Event log filenames or S3 storage directories
+            containing event logs (comma separated). If missing, the wrapper reads the Spark
+ property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included
+ in the output of `aws emr describe-cluster`. Note that the wrapper will raise an exception
+ if the property is not set.
+ :param profile: A named AWS profile to get the settings/credentials of the AWS account.
+ :param local_folder: Local work-directory path to store the output and to be used as root
+ directory for temporary folders/files. The final output will go into a subdirectory called
+ ${local_folder}/prof-${EXEC_ID} where exec_id is an auto-generated unique identifier of the
+ execution. If the argument is NONE, the default value is the env variable
+ RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY if any; or the current working directory.
+        :param remote_folder: An S3 folder where the output is uploaded at the end of execution.
+            If no value is provided, the output will only be available on the local disk.
+        :param tools_jar: Path to a bundled jar including the RAPIDS tools. The path can be a local
+            filesystem path or a remote S3 URL. If missing, the wrapper downloads the latest
+            rapids-4-spark-tools_*.jar from the maven repo.
+ :param verbose: True or False to enable verbosity to the wrapper script.
+ :param jvm_heap_size: The maximum heap size of the JVM in gigabytes.
+ :param rapids_options: A list of valid Profiling tool options.
+ Note that the wrapper ignores ["output-directory", "worker-info"] flags, and it does not support
+ multiple "spark-property" arguments.
+ For more details on Profiling tool options, please visit
+ https://nvidia.github.io/spark-rapids/docs/spark-profiling-tool.html#profiling-tool-options
+ """
+ if verbose:
+ # when debug is set to true set it in the environment.
+ ToolLogging.enable_debug_mode()
+ wrapper_prof_options = {
+ 'platformOpts': {
+ 'profile': profile,
+ 'deployMode': DeployMode.LOCAL,
+ },
+ 'migrationClustersProps': {
+ 'gpuCluster': gpu_cluster
+ },
+ 'jobSubmissionProps': {
+ 'remoteFolder': remote_folder,
+ 'platformArgs': {
+ 'jvmMaxHeapSize': jvm_heap_size
+ }
+ },
+ 'eventlogs': eventlogs,
+ 'toolsJar': tools_jar,
+ 'autoTunerFileInput': worker_info
+ }
+ ProfilingAsLocal(platform_type=CloudPlatform.EMR,
+ output_folder=local_folder,
+ wrapper_options=wrapper_prof_options,
+ rapids_options=rapids_options).launch()
+
@staticmethod
def bootstrap(cluster: str,
profile: str = None,
@@ -211,5 +283,6 @@ class EMRWrapper: # pylint: disable=too-few-public-methods
def __init__(self):
self.qualification = CliEmrLocalMode.qualification
+ self.profiling = CliEmrLocalMode.profiling
self.bootstrap = CliEmrLocalMode.bootstrap
self.diagnostic = CliEmrLocalMode.diagnostic
diff --git a/user_tools/tox.ini b/user_tools/tox.ini
index 0aa43cd25..9e8f6b87c 100644
--- a/user_tools/tox.ini
+++ b/user_tools/tox.ini
@@ -11,6 +11,12 @@ envlist =
flake8
isolated_build = True
+[gh-actions]
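+# tox-gh-actions (installed in the python-unit-test workflow) uses this mapping to pick the tox envs for the job's Python version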
+python =
+ 3.8: python3.8, pylint, flake8
+ 3.9: python3.9, pylint, flake8
+ 3.10: python3.10, pylint, flake8
+
[testenv]
deps =
pytest-cov