
config executor cores #23

Merged (6 commits), Jun 6, 2024
Changes shown below are from 2 of the 6 commits.
src/spark/utils.py (15 changes: 12 additions & 3 deletions)
@@ -10,6 +10,9 @@
HADOOP_AWS_VER = os.getenv('HADOOP_AWS_VER')
DELTA_SPARK_VER = os.getenv('DELTA_SPARK_VER')
SCALA_VER = os.getenv('SCALA_VER')
+# The default number of CPU cores that each Spark executor will use.
+# If not specified, Spark will typically use all available cores on the worker nodes.
+DEFAULT_EXECUTOR_CORES = 1


def _get_jars(jar_names: list) -> str:
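
For orientation (not part of this diff): spark.executor.cores is the standard Spark property behind this constant. On a cluster manager such as standalone or YARN it caps each executor at that many cores, while leaving it unset lets standalone executors use all available worker cores. A minimal sketch of the equivalent raw setting, outside the helpers in this file:

    from pyspark import SparkConf

    # Same effect as passing DEFAULT_EXECUTOR_CORES through the helpers below,
    # or as "spark-submit --conf spark.executor.cores=1".
    conf = SparkConf().set("spark.executor.cores", "1")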
@@ -29,11 +32,14 @@
return ", ".join(jars)


-def _get_delta_lake_conf(jars_str: str) -> dict:
+def _get_delta_lake_conf(
+jars_str: str,
+executor_cores: int) -> dict:
"""
Helper function to get Delta Lake specific Spark configuration.

:param jars_str: A comma-separated string of JAR file paths
+:param executor_cores: The number of CPU cores that each Spark executor will use
Review comment (Member):
So this only applies if Delta Lake is true and the user doesn't call the get base conf method. It seems like it should always apply.

Reply (Collaborator, Author):
👍

:return: A dictionary of Delta Lake specific Spark configuration

@@ -50,6 +56,7 @@
"spark.hadoop.fs.s3a.path.style.access": "true",
"spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
"spark.sql.catalogImplementation": "hive",
"spark.executor.cores": executor_cores,
}


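For illustration only (not part of the PR): assuming the helper simply assembles and returns the dictionary shown above, the new argument surfaces like this. The jar file names and version numbers here are made up:

    conf = _get_delta_lake_conf(
        "delta-spark_2.12-3.2.0.jar, hadoop-aws-3.3.4.jar",  # hypothetical jars_str
        executor_cores=2)
    assert conf["spark.executor.cores"] == 2
    assert conf["spark.sql.catalogImplementation"] == "hive"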
@@ -76,14 +83,16 @@
app_name: str = None,
local: bool = False,
delta_lake: bool = True,
-timeout_sec: int = 4 * 60 * 60) -> SparkSession:
+timeout_sec: int = 4 * 60 * 60,
+executor_cores: int = DEFAULT_EXECUTOR_CORES) -> SparkSession:
"""
Helper to get and manage the SparkSession and keep all of our spark configuration params in one place.

:param app_name: The name of the application. If not provided, a default name will be generated.
:param local: Whether to run the spark session locally or not. Default is False.
:param delta_lake: Build the spark session with Delta Lake support. Default is True.
:param timeout_sec: The timeout in seconds to stop the Spark session forcefully. Default is 4 hours.
+:param executor_cores: The number of CPU cores that each Spark executor will use. Default is 1.

:return: A SparkSession object
"""
@@ -101,7 +110,7 @@
jar_names = [f"delta-spark_{SCALA_VER}-{DELTA_SPARK_VER}.jar",
f"hadoop-aws-{HADOOP_AWS_VER}.jar"]
jars_str = _get_jars(jar_names)
-delta_conf = _get_delta_lake_conf(jars_str)
+delta_conf = _get_delta_lake_conf(jars_str, executor_cores)

Codecov / codecov/patch: check warning on line 113 in src/spark/utils.py. Added line #L113 was not covered by tests.
for key, value in delta_conf.items():
spark_conf.set(key, value)

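On the Codecov warning: a sketch of one way the new call could be exercised without a real cluster. This is only a sketch; the import path and the patched names (SparkSession, _get_jars) are assumptions about parts of utils.py that are not visible in this diff, so it may need adapting:

    from unittest.mock import MagicMock, patch

    import spark.utils as utils  # assumed import path for src/spark/utils.py

    def test_executor_cores_is_passed_to_delta_conf():
        # Stub the session machinery and jar lookup so no Spark cluster, jars or
        # env vars are needed, and spy on _get_delta_lake_conf to check the wiring.
        with patch.object(utils, "SparkSession", MagicMock()), \
             patch.object(utils, "_get_jars", return_value="fake.jar"), \
             patch.object(utils, "_get_delta_lake_conf", return_value={}) as delta_conf:
            utils.get_spark_session(app_name="t", delta_lake=True, executor_cores=2)
        delta_conf.assert_called_once_with("fake.jar", 2)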