Release 1.4.0 (#372)
* Add Delta support (#370)
ralphrass authored Aug 22, 2024
1 parent dd8cefe commit 8a15b10
Showing 22 changed files with 503 additions and 141 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/publish.yml
@@ -4,11 +4,9 @@ on:
  paths:
    - 'setup.py'


jobs:
  Pipeline:
    if: github.ref == 'refs/heads/master'

    runs-on: ubuntu-latest

    steps:
@@ -19,7 +17,7 @@ jobs:

      - uses: actions/setup-java@v4
        with:
-          java-version: '11'
+          java-version: '17'
          distribution: microsoft

      - uses: vemonet/setup-spark@v1
3 changes: 1 addition & 2 deletions .github/workflows/staging.yml
@@ -7,7 +7,6 @@ on:
jobs:
  Pipeline:
    if: github.ref == 'refs/heads/staging'

    runs-on: ubuntu-latest

    steps:
@@ -18,7 +17,7 @@ jobs:

      - uses: actions/setup-java@v4
        with:
-          java-version: '11'
+          java-version: '17'
          distribution: microsoft

      - uses: vemonet/setup-spark@v1
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -19,7 +19,7 @@ jobs:

      - uses: actions/setup-java@v4
        with:
-          java-version: '11'
+          java-version: '17'
          distribution: microsoft

      - uses: vemonet/setup-spark@v1
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,9 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each

## [Unreleased]

## [1.4.0](https://github.com/quintoandar/butterfree/releases/tag/1.4.0)
* Add Delta support ([#370](https://github.com/quintoandar/butterfree/pull/370))

## [1.3.5](https://github.com/quintoandar/butterfree/releases/tag/1.3.5)
* Auto create feature sets ([#368](https://github.com/quintoandar/butterfree/pull/368))

2 changes: 1 addition & 1 deletion Makefile
@@ -36,7 +36,7 @@ minimum-requirements:

.PHONY: requirements
## install all requirements
-requirements: requirements-test requirements-lint dev-requirements minimum-requirements
+requirements: minimum-requirements dev-requirements requirements-test requirements-lint

.PHONY: ci-install
ci-install:
1 change: 1 addition & 0 deletions butterfree/clients/spark_client.py
@@ -30,6 +30,7 @@ def conn(self) -> SparkSession:
        """
        if not self._session:
            self._session = SparkSession.builder.getOrCreate()

        return self._session

    def read(
3 changes: 2 additions & 1 deletion butterfree/load/writers/__init__.py
@@ -1,8 +1,9 @@
"""Holds data loaders for historical and online feature store."""

+from butterfree.load.writers.delta_writer import DeltaWriter
from butterfree.load.writers.historical_feature_store_writer import (
    HistoricalFeatureStoreWriter,
)
from butterfree.load.writers.online_feature_store_writer import OnlineFeatureStoreWriter

-__all__ = ["HistoricalFeatureStoreWriter", "OnlineFeatureStoreWriter"]
+__all__ = ["HistoricalFeatureStoreWriter", "OnlineFeatureStoreWriter", "DeltaWriter"]
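With this change, `DeltaWriter` is exported alongside the existing writers, so it can be imported from the package root; for example:

```python
from butterfree.load.writers import DeltaWriter, HistoricalFeatureStoreWriter
```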
162 changes: 162 additions & 0 deletions butterfree/load/writers/delta_writer.py
@@ -0,0 +1,162 @@
from delta.tables import DeltaTable
from pyspark.sql.dataframe import DataFrame

from butterfree.clients import SparkClient
from butterfree.configs.logger import __logger

logger = __logger("delta_writer", True)


class DeltaWriter:
    """Control operations on Delta tables.

    Responsible for merging and optimizing.
    """

    @staticmethod
    def _get_full_table_name(table, database):
        if database:
            return "{}.{}".format(database, table)
        else:
            return table

    @staticmethod
    def _convert_to_delta(client: SparkClient, table: str):
        logger.info(f"Converting {table} to Delta...")
        client.conn.sql(f"CONVERT TO DELTA {table}")
        logger.info("Conversion done.")

    @staticmethod
    def merge(
        client: SparkClient,
        database: str,
        table: str,
        merge_on: list,
        source_df: DataFrame,
        when_not_matched_insert_condition: str = None,
        when_matched_update_condition: str = None,
        when_matched_delete_condition: str = None,
    ):
        """Merge a source dataframe into a Delta table.

        By default, it updates when matched and inserts when not matched
        (a simple upsert). You can change this behavior by setting:

        - when_not_matched_insert_condition: insert only when this
          condition is true;
        - when_matched_update_condition: update only when this condition
          is true. Columns in the source dataframe can be referenced as
          source.<column_name> and columns in the target table as
          target.<column_name>;
        - when_matched_delete_condition: add a delete operation, applied
          only when this condition is true. Source and target columns can
          again be referenced as source.<column_name> and
          target.<column_name>.
        """
        try:
            full_table_name = DeltaWriter._get_full_table_name(table, database)

            table_exists = client.conn.catalog.tableExists(full_table_name)

            if table_exists:
                pd_df = client.conn.sql(
                    f"DESCRIBE TABLE EXTENDED {full_table_name}"
                ).toPandas()
                provider = (
                    pd_df.reset_index()
                    .groupby(["col_name"])["data_type"]
                    .aggregate("first")
                    .Provider
                )
                table_is_delta = provider.lower() == "delta"

                if not table_is_delta:
                    DeltaWriter._convert_to_delta(client, full_table_name)

            # For schema evolution
            client.conn.conf.set(
                "spark.databricks.delta.schema.autoMerge.enabled", "true"
            )

            target_table = DeltaTable.forName(client.conn, full_table_name)
            join_condition = " AND ".join(
                [f"source.{col} = target.{col}" for col in merge_on]
            )
            merge_builder = target_table.alias("target").merge(
                source_df.alias("source"), join_condition
            )
            if when_matched_delete_condition:
                merge_builder = merge_builder.whenMatchedDelete(
                    condition=when_matched_delete_condition
                )

            merge_builder.whenMatchedUpdateAll(
                condition=when_matched_update_condition
            ).whenNotMatchedInsertAll(
                condition=when_not_matched_insert_condition
            ).execute()
        except Exception as e:
            logger.error(f"Merge operation on {full_table_name} failed: {e}")

    @staticmethod
    def vacuum(table: str, retention_hours: int, client: SparkClient):
        """Vacuum a Delta table.

        Vacuum removes unused files (files not managed by Delta plus files
        that are not in the latest table state). After a vacuum it is
        impossible to time travel to versions older than the retention
        period. The default retention is 7 days; lower values trigger a
        warning unless spark.databricks.delta.retentionDurationCheck.enabled
        is set to false.
        https://docs.databricks.com/en/sql/language-manual/delta-vacuum.html
        """
        command = f"VACUUM {table} RETAIN {retention_hours} HOURS"
        logger.info(f"Running vacuum with command {command}")
        client.conn.sql(command)
        logger.info(f"Vacuum successful for table {table}")

    @staticmethod
    def optimize(
        client: SparkClient,
        table: str = None,
        z_order: list = None,
        date_column: str = "timestamp",
        from_date: str = None,
        auto_compact: bool = False,
        optimize_write: bool = False,
    ):
        """Optimize a Delta table.

        Auto compaction and optimized writes REQUIRE DBR >= 14.3 LTS and
        Delta >= 3.1.0; z-ordering REQUIRES DBR >= 13.3 LTS and
        Delta >= 2.0.0. Auto compaction (recommended) mitigates the
        small-file problem (overhead due to lots of metadata). Z-order by
        columns that are commonly used in query predicates and have high
        cardinality.
        https://docs.delta.io/latest/optimizations-oss.html
        """
        if auto_compact:
            client.conn.conf.set(
                "spark.databricks.delta.autoCompact.enabled", "true"
            )

        if optimize_write:
            client.conn.conf.set(
                "spark.databricks.delta.optimizeWrite.enabled", "true"
            )

        if table:
            command = f"OPTIMIZE {table}"

            if from_date:
                command += f" WHERE {date_column} >= {from_date}"

            if z_order:
                command += f" ZORDER BY {','.join(z_order)}"

            logger.info(f"Running optimize with command {command}...")
            client.conn.sql(command)
            logger.info(f"Optimize successful for table {table}.")
36 changes: 29 additions & 7 deletions butterfree/load/writers/historical_feature_store_writer.py
@@ -14,6 +14,7 @@
from butterfree.dataframe_service import repartition_df
from butterfree.hooks import Hook
from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook
from butterfree.load.writers.delta_writer import DeltaWriter
from butterfree.load.writers.writer import Writer
from butterfree.transform import FeatureSet

@@ -92,6 +93,15 @@ class HistoricalFeatureStoreWriter(Writer):
    improve query performance. The data is stored in partition folders in AWS S3
    based on time (per year, month and day).

    >>> spark_client = SparkClient()
    >>> writer = HistoricalFeatureStoreWriter(merge_on=["id", "timestamp"])
    >>> writer.write(feature_set=feature_set,
    ...              dataframe=dataframe,
    ...              spark_client=spark_client)

    This procedure skips the plain dataframe write and performs a Delta merge
    (upsert) instead. Use it when the table already exists.
    """

    PARTITION_BY = [
@@ -114,13 +124,15 @@ def __init__(
        interval_mode: bool = False,
        check_schema_hook: Optional[Hook] = None,
        row_count_validation: bool = True,
        merge_on: list = None,
    ):
        super(HistoricalFeatureStoreWriter, self).__init__(
            db_config or MetastoreConfig(),
            debug_mode,
            interval_mode,
            False,
            row_count_validation,
            merge_on,
        )
        self.database = database or environment.get_variable(
            "FEATURE_STORE_HISTORICAL_DATABASE"
@@ -141,6 +153,7 @@ def write(
            feature_set: object processed with feature set information.
            dataframe: spark dataframe containing data from a feature set.
            spark_client: client for spark connections with external services.
            merge_on: when set, the write is performed as an upsert into a Delta table.

        If the debug_mode is set to True, a temporary table with a name in the format:
        historical_feature_store__{feature_set.name} will be created instead of writing
@@ -174,13 +187,22 @@

        s3_key = os.path.join("historical", feature_set.entity, feature_set.name)

-        spark_client.write_table(
-            dataframe=dataframe,
-            database=self.database,
-            table_name=feature_set.name,
-            partition_by=self.PARTITION_BY,
-            **self.db_config.get_options(s3_key),
-        )
+        if self.merge_on:
+            DeltaWriter.merge(
+                client=spark_client,
+                database=self.database,
+                table=feature_set.name,
+                merge_on=self.merge_on,
+                source_df=dataframe,
+            )
+        else:
+            spark_client.write_table(
+                dataframe=dataframe,
+                database=self.database,
+                table_name=feature_set.name,
+                partition_by=self.PARTITION_BY,
+                **self.db_config.get_options(s3_key),
+            )

    def _assert_validation_count(
        self, table_name: str, written_count: int, dataframe_count: int
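A sketch of the two write paths above, assuming `feature_set` and `dataframe` were produced elsewhere in a pipeline (both are hypothetical here); note that `merge_on` is a constructor argument:

```python
from butterfree.clients import SparkClient
from butterfree.load.writers import HistoricalFeatureStoreWriter

spark_client = SparkClient()

# Default path: a plain partitioned write to the historical store.
writer = HistoricalFeatureStoreWriter()
writer.write(feature_set=feature_set, dataframe=dataframe, spark_client=spark_client)

# Merge path: with merge_on set, write() delegates to DeltaWriter.merge
# and upserts into the existing Delta table instead of writing anew.
merge_writer = HistoricalFeatureStoreWriter(merge_on=["id", "timestamp"])
merge_writer.write(
    feature_set=feature_set, dataframe=dataframe, spark_client=spark_client
)
```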
2 changes: 2 additions & 0 deletions butterfree/load/writers/writer.py
@@ -27,6 +27,7 @@ def __init__(
        interval_mode: Optional[bool] = False,
        write_to_entity: Optional[bool] = False,
        row_count_validation: Optional[bool] = True,
        merge_on: Optional[list] = None,
    ) -> None:
        super().__init__()
        self.db_config = db_config
@@ -35,6 +36,7 @@
        self.interval_mode = interval_mode
        self.write_to_entity = write_to_entity
        self.row_count_validation = row_count_validation
        self.merge_on = merge_on

    def with_(
        self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any
2 changes: 2 additions & 0 deletions docs/source/butterfree.automated.rst
@@ -4,6 +4,8 @@ butterfree.automated package
Submodules
----------

butterfree.automated.feature\_set\_creation module
--------------------------------------------------

.. automodule:: butterfree.automated.feature_set_creation
    :members:
1 change: 0 additions & 1 deletion docs/source/butterfree.constants.rst
@@ -54,7 +54,6 @@ butterfree.constants.spark\_constants module
    :undoc-members:
    :show-inheritance:

.. automodule:: butterfree.constants.spark_constants
    :members:
    :undoc-members:
11 changes: 11 additions & 0 deletions docs/source/butterfree.dataframe_service.rst
@@ -4,18 +4,29 @@ butterfree.dataframe\_service package
Submodules
----------

butterfree.dataframe\_service.incremental\_strategy module
----------------------------------------------------------

.. automodule:: butterfree.dataframe_service.incremental_strategy
    :members:
    :undoc-members:
    :show-inheritance:

butterfree.dataframe\_service.partitioning module
-------------------------------------------------

.. automodule:: butterfree.dataframe_service.partitioning
    :members:
    :undoc-members:
    :show-inheritance:

butterfree.dataframe\_service.repartition module
------------------------------------------------

.. automodule:: butterfree.dataframe_service.repartition
    :members:
    :undoc-members:
    :show-inheritance:

.. automodule:: butterfree.dataframe_service.repartition
    :members: