From 8e5f17b131e24d735633c279bac2354b5ac7693e Mon Sep 17 00:00:00 2001 From: richenc <125420929+richenc@users.noreply.github.com> Date: Wed, 15 May 2024 08:23:49 -0700 Subject: [PATCH 01/11] feat(ingest/tableau): support platform instance mapping based off database server hostname (#10254) Co-authored-by: Richie Chen Co-authored-by: Gabe Lyons --- .../src/datahub/ingestion/source/tableau.py | 42 ++++++++++++++++++- .../ingestion/source/tableau_common.py | 37 ++++++++++++++++ .../ingestion/source/tableau_constant.py | 2 + .../tableau/test_tableau_ingest.py | 37 ++++++++++++++++ 4 files changed, 117 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 6d04cc0e2d1f5..e0b442387d3b6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -15,6 +15,7 @@ Union, cast, ) +from urllib.parse import urlparse import dateutil.parser as dp import tableauserverclient as TSC @@ -86,6 +87,7 @@ clean_query, custom_sql_graphql_query, dashboard_graphql_query, + database_servers_graphql_query, database_tables_graphql_query, embedded_datasource_graphql_query, get_filter_pages, @@ -345,6 +347,11 @@ class TableauConfig( description="Mappings to change generated dataset urns. Use only if you really know what you are doing.", ) + database_hostname_to_platform_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description="Mappings to change platform instance in generated dataset urns based on database. Use only if you really know what you are doing.", + ) + extract_usage_stats: bool = Field( default=False, description="[experimental] Extract usage statistics for dashboards and charts.", @@ -537,6 +544,8 @@ def __init__( self.workbook_project_map: Dict[str, str] = {} self.datasource_project_map: Dict[str, str] = {} + # This map keeps track of the database server connection hostnames. + self.database_server_hostname_map: Dict[str, str] = {} # This list keeps track of sheets in workbooks so that we retrieve those # when emitting sheets. self.sheet_ids: List[str] = [] @@ -609,6 +618,24 @@ def _populate_usage_stat_registry(self) -> None: self.tableau_stat_registry[view.id] = UsageStat(view_count=view.total_views) logger.debug("Tableau stats %s", self.tableau_stat_registry) + def _populate_database_server_hostname_map(self) -> None: + def maybe_parse_hostname(): + # If the connection string is a URL instead of a hostname, parse it + # and extract the hostname, otherwise just return the connection string. 
+ parsed_host_name = urlparse(server_connection).hostname + if parsed_host_name: + return parsed_host_name + return server_connection + + for database_server in self.get_connection_objects( + database_servers_graphql_query, c.DATABASE_SERVERS_CONNECTION + ): + database_server_id = database_server.get(c.ID) + server_connection = database_server.get(c.HOST_NAME) + host_name = maybe_parse_hostname() + if host_name: + self.database_server_hostname_map[str(database_server_id)] = host_name + def _get_all_project(self) -> Dict[str, TableauProject]: all_project_map: Dict[str, TableauProject] = {} @@ -864,7 +891,7 @@ def get_connection_objects( self, query: str, connection_type: str, - query_filter: dict, + query_filter: dict = {}, page_size_override: Optional[int] = None, ) -> Iterable[dict]: # Calls the get_connection_object_page function to get the objects, @@ -1142,6 +1169,8 @@ def get_upstream_tables( self.config.env, self.config.platform_instance_map, self.config.lineage_overrides, + self.config.database_hostname_to_platform_instance_map, + self.database_server_hostname_map, ) table_id_to_urn[table[c.ID]] = table_urn @@ -1708,8 +1737,11 @@ def parse_custom_sql( [ str, Optional[str], + Optional[str], Optional[Dict[str, str]], Optional[TableauLineageOverrides], + Optional[Dict[str, str]], + Optional[Dict[str, str]], ], Tuple[Optional[str], Optional[str], str, str], ] @@ -1753,8 +1785,11 @@ def parse_custom_sql( upstream_db, platform_instance, platform, _ = func_overridden_info( database_info[c.CONNECTION_TYPE], database_info.get(c.NAME), + database_info.get(c.ID), self.config.platform_instance_map, self.config.lineage_overrides, + self.config.database_hostname_to_platform_instance_map, + self.database_server_hostname_map, ) logger.debug( @@ -2759,6 +2794,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.config.extract_usage_stats: self._populate_usage_stat_registry() + # Populate the map of database names and database hostnames to be used later to map + # databases to platform instances. 
+ if self.config.database_hostname_to_platform_instance_map: + self._populate_database_server_hostname_map() + self._populate_projects_registry() yield from self.emit_project_containers() yield from self.emit_workbooks() diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py index fcfa434e00fee..6c75876e68787 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py @@ -206,6 +206,7 @@ class MetadataQueryException(Exception): name database { name + id } schema fullName @@ -290,6 +291,7 @@ class MetadataQueryException(Exception): name database { name + id } schema fullName @@ -315,6 +317,7 @@ class MetadataQueryException(Exception): name database { name + id } schema fullName @@ -327,6 +330,7 @@ class MetadataQueryException(Exception): connectionType database{ name + id connectionType } } @@ -347,6 +351,7 @@ class MetadataQueryException(Exception): name database { name + id } schema fullName @@ -418,6 +423,16 @@ class MetadataQueryException(Exception): } """ +database_servers_graphql_query = """ +{ + name + id + connectionType + extendedConnectionType + hostName +} +""" + # https://referencesource.microsoft.com/#system.data/System/Data/OleDb/OLEDB_Enum.cs,364 FIELD_TYPE_MAPPING = { "INTEGER": NumberTypeClass, @@ -592,6 +607,7 @@ def get_fully_qualified_table_name( @dataclass class TableauUpstreamReference: database: Optional[str] + database_id: Optional[str] schema: Optional[str] table: str @@ -603,6 +619,7 @@ def create( ) -> "TableauUpstreamReference": # Values directly from `table` object from Tableau database = t_database = d.get(c.DATABASE, {}).get(c.NAME) + database_id = d.get(c.DATABASE, {}).get(c.ID) schema = t_schema = d.get(c.SCHEMA) table = t_table = d.get(c.NAME) or "" t_full_name = d.get(c.FULL_NAME) @@ -654,6 +671,7 @@ def create( return cls( database=database, + database_id=database_id, schema=schema, table=table, connection_type=t_connection_type, @@ -679,6 +697,8 @@ def make_dataset_urn( env: str, platform_instance_map: Optional[Dict[str, str]], lineage_overrides: Optional[TableauLineageOverrides] = None, + database_hostname_to_platform_instance_map: Optional[Dict[str, str]] = None, + database_server_hostname_map: Optional[Dict[str, str]] = None, ) -> str: ( upstream_db, @@ -688,8 +708,11 @@ def make_dataset_urn( ) = get_overridden_info( connection_type=self.connection_type, upstream_db=self.database, + upstream_db_id=self.database_id, lineage_overrides=lineage_overrides, platform_instance_map=platform_instance_map, + database_hostname_to_platform_instance_map=database_hostname_to_platform_instance_map, + database_server_hostname_map=database_server_hostname_map, ) table_name = get_fully_qualified_table_name( @@ -707,8 +730,11 @@ def make_dataset_urn( def get_overridden_info( connection_type: Optional[str], upstream_db: Optional[str], + upstream_db_id: Optional[str], platform_instance_map: Optional[Dict[str, str]], lineage_overrides: Optional[TableauLineageOverrides] = None, + database_hostname_to_platform_instance_map: Optional[Dict[str, str]] = None, + database_server_hostname_map: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[str], str, str]: original_platform = platform = get_platform(connection_type) if ( @@ -729,6 +755,17 @@ def get_overridden_info( platform_instance = ( platform_instance_map.get(original_platform) if platform_instance_map else None ) + if ( + 
database_server_hostname_map is not None + and upstream_db_id is not None + and upstream_db_id in database_server_hostname_map + ): + hostname = database_server_hostname_map.get(upstream_db_id) + if ( + database_hostname_to_platform_instance_map is not None + and hostname in database_hostname_to_platform_instance_map + ): + platform_instance = database_hostname_to_platform_instance_map.get(hostname) if original_platform in ("athena", "hive", "mysql"): # Two tier databases upstream_db = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_constant.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_constant.py index e80c9d8fd1f25..9ead9a407d957 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_constant.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_constant.py @@ -23,6 +23,8 @@ CUSTOM_SQL_TABLE = "CustomSQLTable" UPSTREAM_TABLES = "upstreamTables" DATABASE_TABLES_CONNECTION = "databaseTablesConnection" +DATABASE_SERVERS_CONNECTION = "databaseServersConnection" +HOST_NAME = "hostName" FIELDS = "fields" UPSTREAM_DATA_SOURCES = "upstreamDatasources" COLUMNS = "columns" diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 36e7af700589c..57fcb0b6ee49a 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -572,6 +572,7 @@ def test_lineage_overrides(): assert ( TableauUpstreamReference( "presto_catalog", + "test-database-id", "test-schema", "test-table", "presto", @@ -586,6 +587,7 @@ def test_lineage_overrides(): assert ( TableauUpstreamReference( "presto_catalog", + "test-database-id", "test-schema", "test-table", "presto", @@ -602,6 +604,7 @@ def test_lineage_overrides(): # transform hive urn to presto urn assert ( TableauUpstreamReference( + None, None, "test-schema", "test-table", @@ -617,6 +620,40 @@ def test_lineage_overrides(): ) +def test_database_hostname_to_platform_instance_map(): + enable_logging() + # Simple - snowflake table + assert ( + TableauUpstreamReference( + "test-database-name", + "test-database-id", + "test-schema", + "test-table", + "snowflake", + ).make_dataset_urn(env=DEFAULT_ENV, platform_instance_map={}) + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,test-database-name.test-schema.test-table,PROD)" + ) + + # Finding platform instance based off hostname to platform instance mappings + assert ( + TableauUpstreamReference( + "test-database-name", + "test-database-id", + "test-schema", + "test-table", + "snowflake", + ).make_dataset_urn( + env=DEFAULT_ENV, + platform_instance_map={}, + database_hostname_to_platform_instance_map={ + "test-hostname": "test-platform-instance" + }, + database_server_hostname_map={"test-database-id": "test-hostname"}, + ) + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,test-platform-instance.test-database-name.test-schema.test-table,PROD)" + ) + + @freeze_time(FROZEN_TIME) def test_tableau_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): output_file_name: str = "tableau_mces.json" From c55c12c91820c000912ff0789239b74f6f28684c Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Wed, 15 May 2024 23:55:07 +0530 Subject: [PATCH 02/11] fix(ingestion/looker): deduplicate the view field (#10482) Co-authored-by: Pedro Silva --- .../ingestion/source/looker/lookml_source.py | 30 ++ 
.../duplicate_field_ingestion_golden.json | 488 ++++++++++++++++++ .../data.model.lkml | 7 + .../dataset_lineages.view.lkml | 50 ++ .../tests/integration/lookml/test_lookml.py | 23 + 5 files changed, 598 insertions(+) create mode 100644 metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json create mode 100644 metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/data.model.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/dataset_lineages.view.lkml diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index d1fddfdd8bc4a..9dd276d054de3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -112,6 +112,34 @@ _MODEL_FILE_EXTENSION = ".model.lkml" +def deduplicate_fields(fields: List[ViewField]) -> List[ViewField]: + # Remove duplicate fields from the given list. + # Logic: if more than one field has the same ViewField.name, keep only the field whose ViewField.field_type + # is DIMENSION_GROUP. + # Looker constraints: + # - Any field declared as a dimension or measure can be redefined as a dimension_group. + # - Any field declared as a dimension can't be redefined as a measure, and vice versa. + + dimension_group_field_names: List[str] = [ + field.name + for field in fields + if field.field_type == ViewFieldType.DIMENSION_GROUP + ] + + new_fields: List[ViewField] = [] + + for field in fields: + if ( + field.name in dimension_group_field_names + and field.field_type != ViewFieldType.DIMENSION_GROUP + ): + continue + + new_fields.append(field) + + return new_fields + + def _get_bigquery_definition( looker_connection: DBConnection, ) -> Tuple[str, Optional[str], Optional[str]]: @@ -1155,6 +1183,8 @@ def from_looker_dict( ) fields: List[ViewField] = dimensions + dimension_groups + measures + fields = deduplicate_fields(fields) + # Prep "default" values for the view, which will be overridden by the logic below. 
view_logic = looker_viewfile.raw_file_content[:max_file_snippet_length] sql_table_names: List[str] = [] diff --git a/metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json new file mode 100644 index 0000000000000..b06b59ba43654 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json @@ -0,0 +1,488 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "project_name": "lkml_samples" + }, + "name": "lkml_samples" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Folders" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "# File was added to check duplicate field issue\n\nview: dataset_lineages {\n sql_table_name: \"PUBLIC\".\"DATASET_LINEAGES\"\n ;;\n\n dimension: createdon {\n type: date\n sql: ${TABLE}.\"CREATEDON\" ;;\n }\n\n dimension_group: createdon {\n type: time\n timeframes: [\n raw,\n time,\n date,\n week,\n month,\n quarter,\n year\n ]\n sql: ${TABLE}.\"CREATEDON\" ;;\n }\n\n dimension: entity {\n type: string\n sql: ${TABLE}.\"ENTITY\" ;;\n }\n\n dimension: metadata {\n type: string\n sql: ${TABLE}.\"METADATA\" ;;\n }\n\n dimension: urn {\n type: string\n sql: ${TABLE}.\"URN\" ;;\n }\n\n 
dimension: version {\n type: number\n sql: ${TABLE}.\"VERSION\" ;;\n }\n\n measure: count {\n type: count\n drill_fields: []\n }\n}\n", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),entity)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),entity)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),metadata)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),metadata)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),urn)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),urn)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),version)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),version)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),createdon)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),createdon)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),count)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),count)" + ], + 
"confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "dataset_lineages", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "entity", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "metadata", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "urn", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "version", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "createdon", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "time", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + }, + { + "tag": "urn:li:tag:Temporal" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "count", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "count", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Measure" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "dataset_lineages.view.lkml", + "looker.model": "data" + }, + "name": "dataset_lineages", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": 
"Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Measure", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Measure" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Temporal", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Temporal" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/data.model.lkml b/metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/data.model.lkml new file mode 100644 index 0000000000000..ddad718721cfa --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/data.model.lkml @@ -0,0 +1,7 @@ +connection: "my_connection" + +include: "dataset_lineages.view.lkml" + +explore: explore_dataset_lineage { + from: dataset_lineages +} diff --git a/metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/dataset_lineages.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/dataset_lineages.view.lkml new file mode 100644 index 0000000000000..6062993f320d3 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/lkml_samples_duplicate_field/dataset_lineages.view.lkml @@ -0,0 +1,50 @@ +# File was added to check duplicate field issue + +view: dataset_lineages { + sql_table_name: "PUBLIC"."DATASET_LINEAGES" + ;; + + dimension: createdon { + type: date + sql: ${TABLE}."CREATEDON" ;; + } + + dimension_group: createdon { + type: time + timeframes: [ + raw, + time, + date, + week, + month, + quarter, + year + ] + sql: ${TABLE}."CREATEDON" ;; + } + + dimension: entity { + type: string + sql: ${TABLE}."ENTITY" ;; + } + + dimension: metadata { + type: string + sql: ${TABLE}."METADATA" ;; + } + + dimension: urn { + type: string + sql: ${TABLE}."URN" ;; + } + + dimension: version { + type: number + sql: ${TABLE}."VERSION" ;; + } + + measure: count { + type: count + drill_fields: [] + } +} diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index b0d973a060390..5e0973a007f3a 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -864,3 +864,26 @@ def test_manifest_parser(pytestconfig: pytest.Config) -> None: manifest = load_lkml(manifest_file) assert manifest + + +@freeze_time(FROZEN_TIME) +def test_duplicate_field_ingest(pytestconfig, tmp_path, mock_time): + test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" + mce_out_file = "duplicate_ingest_mces_output.json" + + new_recipe = get_default_recipe( + f"{tmp_path}/{mce_out_file}", + f"{test_resources_dir}/lkml_samples_duplicate_field", + ) + + pipeline = Pipeline.create(new_recipe) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status(raise_warnings=True) + + golden_path = test_resources_dir / "duplicate_field_ingestion_golden.json" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / mce_out_file, + golden_path=golden_path, + ) From 
7c4faf4896a2b3f776e6bae71fb31c014666d94c Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 15 May 2024 14:40:05 -0400 Subject: [PATCH 03/11] fix(graphql): Support querying Posts and Queries (#10502) --- .../types/entitytype/EntityTypeMapper.java | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java index 48750082d3495..ffb14df5e800b 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java @@ -15,37 +15,39 @@ public class EntityTypeMapper { static final Map<EntityType, String> ENTITY_TYPE_TO_NAME = ImmutableMap.<EntityType, String>builder() - .put(EntityType.DATASET, "dataset") - .put(EntityType.ROLE, "role") - .put(EntityType.CORP_USER, "corpuser") - .put(EntityType.CORP_GROUP, "corpGroup") - .put(EntityType.DATA_PLATFORM, "dataPlatform") - .put(EntityType.DASHBOARD, "dashboard") - .put(EntityType.CHART, "chart") - .put(EntityType.TAG, "tag") - .put(EntityType.DATA_FLOW, "dataFlow") - .put(EntityType.DATA_JOB, "dataJob") + .put(EntityType.DATASET, Constants.DATASET_ENTITY_NAME) + .put(EntityType.ROLE, Constants.ROLE_ENTITY_NAME) + .put(EntityType.CORP_USER, Constants.CORP_USER_ENTITY_NAME) + .put(EntityType.CORP_GROUP, Constants.CORP_GROUP_ENTITY_NAME) + .put(EntityType.DATA_PLATFORM, Constants.DATA_PLATFORM_ENTITY_NAME) + .put(EntityType.DASHBOARD, Constants.DASHBOARD_ENTITY_NAME) + .put(EntityType.CHART, Constants.CHART_ENTITY_NAME) + .put(EntityType.TAG, Constants.TAG_ENTITY_NAME) + .put(EntityType.DATA_FLOW, Constants.DATA_FLOW_ENTITY_NAME) + .put(EntityType.DATA_JOB, Constants.DATA_JOB_ENTITY_NAME) .put(EntityType.DATA_PROCESS_INSTANCE, Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME) - .put(EntityType.GLOSSARY_TERM, "glossaryTerm") - .put(EntityType.GLOSSARY_NODE, "glossaryNode") - .put(EntityType.MLMODEL, "mlModel") - .put(EntityType.MLMODEL_GROUP, "mlModelGroup") - .put(EntityType.MLFEATURE_TABLE, "mlFeatureTable") - .put(EntityType.MLFEATURE, "mlFeature") - .put(EntityType.MLPRIMARY_KEY, "mlPrimaryKey") - .put(EntityType.CONTAINER, "container") - .put(EntityType.DOMAIN, "domain") - .put(EntityType.NOTEBOOK, "notebook") - .put(EntityType.DATA_PLATFORM_INSTANCE, "dataPlatformInstance") - .put(EntityType.TEST, "test") + .put(EntityType.GLOSSARY_TERM, Constants.GLOSSARY_TERM_ENTITY_NAME) + .put(EntityType.GLOSSARY_NODE, Constants.GLOSSARY_NODE_ENTITY_NAME) + .put(EntityType.MLMODEL, Constants.ML_MODEL_ENTITY_NAME) + .put(EntityType.MLMODEL_GROUP, Constants.ML_MODEL_GROUP_ENTITY_NAME) + .put(EntityType.MLFEATURE_TABLE, Constants.ML_FEATURE_TABLE_ENTITY_NAME) + .put(EntityType.MLFEATURE, Constants.ML_FEATURE_ENTITY_NAME) + .put(EntityType.MLPRIMARY_KEY, Constants.ML_PRIMARY_KEY_ENTITY_NAME) + .put(EntityType.CONTAINER, Constants.CONTAINER_ENTITY_NAME) + .put(EntityType.DOMAIN, Constants.DOMAIN_ENTITY_NAME) + .put(EntityType.NOTEBOOK, Constants.NOTEBOOK_ENTITY_NAME) + .put(EntityType.DATA_PLATFORM_INSTANCE, Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME) + .put(EntityType.TEST, Constants.TEST_ENTITY_NAME) .put(EntityType.ER_MODEL_RELATIONSHIP, Constants.ER_MODEL_RELATIONSHIP_ENTITY_NAME) .put(EntityType.DATAHUB_VIEW, Constants.DATAHUB_VIEW_ENTITY_NAME) .put(EntityType.DATA_PRODUCT, Constants.DATA_PRODUCT_ENTITY_NAME) - 
.put(EntityType.SCHEMA_FIELD, "schemaField") + .put(EntityType.SCHEMA_FIELD, Constants.SCHEMA_FIELD_ENTITY_NAME) .put(EntityType.STRUCTURED_PROPERTY, Constants.STRUCTURED_PROPERTY_ENTITY_NAME) .put(EntityType.ASSERTION, Constants.ASSERTION_ENTITY_NAME) .put(EntityType.RESTRICTED, Constants.RESTRICTED_ENTITY_NAME) .put(EntityType.BUSINESS_ATTRIBUTE, Constants.BUSINESS_ATTRIBUTE_ENTITY_NAME) + .put(EntityType.QUERY, Constants.QUERY_ENTITY_NAME) + .put(EntityType.POST, Constants.POST_ENTITY_NAME) .build(); private static final Map ENTITY_NAME_TO_TYPE = From a847748f8c54e5559dd5abd08747f7221d299a70 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 15 May 2024 14:10:32 -0500 Subject: [PATCH 04/11] fix(ebean): fix auto-closeable ebean dao streams (#10506) --- .../upgrade/restoreindices/SendMAEStep.java | 3 +- .../upgrade/system/AbstractMCLStep.java | 100 ++++++++++-------- .../steps/BuildIndicesPreStep.java | 54 ++++++---- .../linkedin/metadata/entity/AspectDao.java | 3 +- .../metadata/entity/EntityServiceImpl.java | 53 +++++----- .../entity/cassandra/CassandraAspectDao.java | 3 +- .../metadata/entity/ebean/EbeanAspectDao.java | 44 ++++---- .../entity/ebean/PartitionedStream.java | 29 +++++ .../entity/EbeanAspectMigrationsDaoTest.java | 9 +- .../metadata/entity/EntityServiceTest.java | 55 ++++++---- .../kafka/MceConsumerApplicationTest.java | 6 +- .../elastic/OperationsController.java | 6 +- .../metadata/resources/operations/Utils.java | 3 +- .../metadata/entity/EntityService.java | 3 +- 14 files changed, 216 insertions(+), 155 deletions(-) create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/PartitionedStream.java diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java index d7a1882656245..77d988f3176f2 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java @@ -49,7 +49,8 @@ public KafkaJob(UpgradeContext context, RestoreIndicesArgs args) { @Override public RestoreIndicesResult call() { return _entityService - .streamRestoreIndices(context.opContext(), args, context.report()::addLine) + .restoreIndices(context.opContext(), args, context.report()::addLine) + .stream() .findFirst() .get(); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index 66cc90f60ed71..27e98259c8beb 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -12,6 +12,8 @@ import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityUtils; +import com.linkedin.metadata.entity.ebean.EbeanAspectV2; +import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.util.Pair; @@ -76,54 +78,58 @@ public Function executable() { args = args.urnLike(getUrnLike()); } - aspectDao - .streamAspectBatches(args) - .forEach( - batch -> { - log.info("Processing batch({}) of size {}.", 
getAspectName(), batchSize); - - List, Boolean>> futures = - EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), - batch.collect(Collectors.toList())) - .stream() - .map( - systemAspect -> - entityService.alwaysProduceMCLAsync( - opContext, - systemAspect.getUrn(), - systemAspect.getUrn().getEntityType(), - getAspectName(), - systemAspect.getAspectSpec(), - null, - systemAspect.getRecordTemplate(), - null, - systemAspect - .getSystemMetadata() - .setRunId(id()) - .setLastObserved(System.currentTimeMillis()), - AuditStampUtils.createDefaultAuditStamp(), - ChangeType.UPSERT)) - .collect(Collectors.toList()); - - futures.forEach( - f -> { - try { - f.getFirst().get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } - }); - - if (batchDelayMs > 0) { - log.info("Sleeping for {} ms", batchDelayMs); - try { - Thread.sleep(batchDelayMs); - } catch (InterruptedException e) { - throw new RuntimeException(e); + try (PartitionedStream stream = aspectDao.streamAspectBatches(args)) { + stream + .partition(args.batchSize) + .forEach( + batch -> { + log.info("Processing batch({}) of size {}.", getAspectName(), batchSize); + + List, Boolean>> futures; + + futures = + EntityUtils.toSystemAspectFromEbeanAspects( + opContext.getRetrieverContext().get(), + batch.collect(Collectors.toList())) + .stream() + .map( + systemAspect -> + entityService.alwaysProduceMCLAsync( + opContext, + systemAspect.getUrn(), + systemAspect.getUrn().getEntityType(), + getAspectName(), + systemAspect.getAspectSpec(), + null, + systemAspect.getRecordTemplate(), + null, + systemAspect + .getSystemMetadata() + .setRunId(id()) + .setLastObserved(System.currentTimeMillis()), + AuditStampUtils.createDefaultAuditStamp(), + ChangeType.UPSERT)) + .collect(Collectors.toList()); + + futures.forEach( + f -> { + try { + f.getFirst().get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }); + + if (batchDelayMs > 0) { + log.info("Sleeping for {} ms", batchDelayMs); + try { + Thread.sleep(batchDelayMs); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } } - } - }); + }); + } BootstrapStep.setUpgradeResult(opContext, getUpgradeIdUrn(), entityService); context.report().addLine("State updated: " + getUpgradeIdUrn()); diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java index 0695dbe4b1acb..c3c9981b1dd7e 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/elasticsearch/steps/BuildIndicesPreStep.java @@ -17,6 +17,7 @@ import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; import com.linkedin.metadata.entity.AspectDao; +import com.linkedin.metadata.entity.EntityAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; import com.linkedin.metadata.shared.ElasticSearchIndexed; @@ -28,6 +29,7 @@ import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import 
org.opensearch.OpenSearchStatusException; @@ -156,28 +158,34 @@ private boolean blockWrites(String indexName) throws InterruptedException, IOExc private static Set getActiveStructuredPropertiesDefinitions( AspectDao aspectDao) { - Set removedStructuredPropertyUrns = - aspectDao - .streamAspects(STRUCTURED_PROPERTY_ENTITY_NAME, STATUS_ASPECT_NAME) - .map( - entityAspect -> - Pair.of( - entityAspect.getUrn(), - RecordUtils.toRecordTemplate(Status.class, entityAspect.getMetadata()))) - .filter(status -> status.getSecond().isRemoved()) - .map(Pair::getFirst) - .collect(Collectors.toSet()); - - return aspectDao - .streamAspects(STRUCTURED_PROPERTY_ENTITY_NAME, STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME) - .map( - entityAspect -> - Pair.of( - entityAspect.getUrn(), - RecordUtils.toRecordTemplate( - StructuredPropertyDefinition.class, entityAspect.getMetadata()))) - .filter(definition -> !removedStructuredPropertyUrns.contains(definition.getKey())) - .map(Pair::getSecond) - .collect(Collectors.toSet()); + Set removedStructuredPropertyUrns; + try (Stream stream = + aspectDao.streamAspects(STRUCTURED_PROPERTY_ENTITY_NAME, STATUS_ASPECT_NAME)) { + removedStructuredPropertyUrns = + stream + .map( + entityAspect -> + Pair.of( + entityAspect.getUrn(), + RecordUtils.toRecordTemplate(Status.class, entityAspect.getMetadata()))) + .filter(status -> status.getSecond().isRemoved()) + .map(Pair::getFirst) + .collect(Collectors.toSet()); + } + + try (Stream stream = + aspectDao.streamAspects( + STRUCTURED_PROPERTY_ENTITY_NAME, STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)) { + return stream + .map( + entityAspect -> + Pair.of( + entityAspect.getUrn(), + RecordUtils.toRecordTemplate( + StructuredPropertyDefinition.class, entityAspect.getMetadata()))) + .filter(definition -> !removedStructuredPropertyUrns.contains(definition.getKey())) + .map(Pair::getSecond) + .collect(Collectors.toSet()); + } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java index e836b69ef4305..646b995f87d00 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java @@ -3,6 +3,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.metadata.aspect.batch.AspectsBatch; import com.linkedin.metadata.entity.ebean.EbeanAspectV2; +import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.ebean.Transaction; @@ -105,7 +106,7 @@ ListResult listUrns( Integer countAspect(@Nonnull final String aspectName, @Nullable String urnLike); @Nonnull - Stream> streamAspectBatches(final RestoreIndicesArgs args); + PartitionedStream streamAspectBatches(final RestoreIndicesArgs args); @Nonnull Stream streamAspects(String entityName, String aspectName); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 0093921a83f9e..353b83726611e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -48,6 +48,8 @@ import com.linkedin.metadata.aspect.plugins.validation.ValidationExceptionCollection; import com.linkedin.metadata.aspect.utils.DefaultAspectsUtil; import 
com.linkedin.metadata.config.PreProcessHooks; +import com.linkedin.metadata.entity.ebean.EbeanAspectV2; +import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.ebean.batch.AspectsBatchImpl; import com.linkedin.metadata.entity.ebean.batch.ChangeItemImpl; import com.linkedin.metadata.entity.ebean.batch.DeleteItemImpl; @@ -1248,7 +1250,7 @@ public Integer getCountAspect( @Nonnull @Override - public Stream streamRestoreIndices( + public List restoreIndices( @Nonnull OperationContext opContext, @Nonnull RestoreIndicesArgs args, @Nonnull Consumer logger) { @@ -1257,32 +1259,35 @@ public Stream streamRestoreIndices( logger.accept( String.format( "Reading rows %s through %s (0 == infinite) in batches of %s from the aspects table started.", - args.start, args.limit, args.batchSize)); + args.start, args.start + args.limit, args.batchSize)); long startTime = System.currentTimeMillis(); - return aspectDao - .streamAspectBatches(args) - .map( - batchStream -> { - long timeSqlQueryMs = System.currentTimeMillis() - startTime; - List systemAspects = - EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), - batchStream.collect(Collectors.toList())); - - RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger); - result.timeSqlQueryMs = timeSqlQueryMs; - - logger.accept("Batch completed."); - try { - TimeUnit.MILLISECONDS.sleep(args.batchDelayMs); - } catch (InterruptedException e) { - throw new RuntimeException( - "Thread interrupted while sleeping after successful batch migration."); - } - return result; - }); + try (PartitionedStream stream = aspectDao.streamAspectBatches(args)) { + return stream + .partition(args.batchSize) + .map( + batch -> { + long timeSqlQueryMs = System.currentTimeMillis() - startTime; + + List systemAspects = + EntityUtils.toSystemAspectFromEbeanAspects( + opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())); + + RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger); + result.timeSqlQueryMs = timeSqlQueryMs; + + logger.accept("Batch completed."); + try { + TimeUnit.MILLISECONDS.sleep(args.batchDelayMs); + } catch (InterruptedException e) { + throw new RuntimeException( + "Thread interrupted while sleeping after successful batch migration."); + } + return result; + }) + .collect(Collectors.toList()); + } } @Nonnull diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java index c1e9b4207def6..15c37b6c0085f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java @@ -30,6 +30,7 @@ import com.linkedin.metadata.entity.EntityAspectIdentifier; import com.linkedin.metadata.entity.ListResult; import com.linkedin.metadata.entity.ebean.EbeanAspectV2; +import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.query.ExtraInfo; import com.linkedin.metadata.query.ExtraInfoArray; @@ -491,7 +492,7 @@ public Integer countAspect(@Nonnull String aspectName, @Nullable String urnLike) } @Nonnull - public Stream> streamAspectBatches(final RestoreIndicesArgs args) { + public PartitionedStream streamAspectBatches(final RestoreIndicesArgs args) { // Not implemented return null; } diff --git 
a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java index 4d5d51cb0ce7b..9725abdf7fdc2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java @@ -8,7 +8,6 @@ import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; -import com.google.common.collect.Iterators; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.Urn; import com.linkedin.metadata.aspect.RetrieverContext; @@ -49,7 +48,6 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -61,7 +59,6 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; -import java.util.stream.StreamSupport; import javax.annotation.Nonnull; import javax.annotation.Nullable; import javax.persistence.PersistenceException; @@ -497,9 +494,15 @@ public Integer countAspect(@Nonnull String aspectName, @Nullable String urnLike) return exp.findCount(); } + /** + * Warning this inner Streams must be closed + * + * @param args + * @return + */ @Nonnull @Override - public Stream> streamAspectBatches(final RestoreIndicesArgs args) { + public PartitionedStream streamAspectBatches(final RestoreIndicesArgs args) { ExpressionList exp = _server .find(EbeanAspectV2.class) @@ -548,25 +551,24 @@ public Stream> streamAspectBatches(final RestoreIndicesArg exp = exp.setMaxRows(args.limit); } - return partition( - exp.orderBy() - .asc(EbeanAspectV2.URN_COLUMN) - .orderBy() - .asc(EbeanAspectV2.ASPECT_COLUMN) - .setFirstRow(start) - .findStream(), - args.batchSize); - } - - private static Stream> partition(Stream source, int size) { - final Iterator it = source.iterator(); - final Iterator> partIt = - Iterators.transform(Iterators.partition(it, size), List::stream); - final Iterable> iterable = () -> partIt; - - return StreamSupport.stream(iterable.spliterator(), false); + return PartitionedStream.builder() + .delegateStream( + exp.orderBy() + .asc(EbeanAspectV2.URN_COLUMN) + .orderBy() + .asc(EbeanAspectV2.ASPECT_COLUMN) + .setFirstRow(start) + .findStream()) + .build(); } + /** + * Warning the stream must be closed + * + * @param entityName + * @param aspectName + * @return + */ @Override @Nonnull public Stream streamAspects(String entityName, String aspectName) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/PartitionedStream.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/PartitionedStream.java new file mode 100644 index 0000000000000..1b7a856fb9729 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/PartitionedStream.java @@ -0,0 +1,29 @@ +package com.linkedin.metadata.entity.ebean; + +import com.google.common.collect.Iterators; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; +import javax.annotation.Nonnull; +import lombok.Builder; +import lombok.experimental.Accessors; + +@Builder +@Accessors(fluent = true) +public class PartitionedStream implements AutoCloseable { + @Nonnull private final Stream delegateStream; + + public Stream> partition(int size) { + final Iterator it = delegateStream.iterator(); + final 
Iterator> partIt = + Iterators.transform(Iterators.partition(it, size), List::stream); + final Iterable> iterable = () -> partIt; + return StreamSupport.stream(iterable.spliterator(), false); + } + + @Override + public void close() { + delegateStream.close(); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java index 683120929c8ec..5ab7e686cb671 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanAspectMigrationsDaoTest.java @@ -53,9 +53,12 @@ public void testStreamAspects() throws AssertionError { List ingestedUrns = ingestedAspects.keySet().stream().map(Urn::toString).collect(Collectors.toList()); - Stream aspectStream = - _migrationsDao.streamAspects(CORP_USER_ENTITY_NAME, CORP_USER_KEY_ASPECT_NAME); - List aspectList = aspectStream.collect(Collectors.toList()); + List aspectList; + try (Stream stream = + _migrationsDao.streamAspects(CORP_USER_ENTITY_NAME, CORP_USER_KEY_ASPECT_NAME)) { + aspectList = stream.collect(Collectors.toList()); + } + assertEquals(ingestedUrns.size(), aspectList.size()); Set urnsFetched = aspectList.stream().map(EntityAspect::getUrn).collect(Collectors.toSet()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index feacc24423edb..6086f02b713bb 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -84,6 +84,7 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.junit.Assert; import org.mockito.ArgumentCaptor; import org.mockito.Mockito; @@ -1706,9 +1707,7 @@ public void testRestoreIndices() throws Exception { args.batchDelayMs(1L); args.numThreads(1); args.urn(urnStr); - _entityServiceImpl - .streamRestoreIndices(opContext, args, obj -> {}) - .collect(Collectors.toList()); + _entityServiceImpl.restoreIndices(opContext, args, obj -> {}); ArgumentCaptor mclCaptor = ArgumentCaptor.forClass(MetadataChangeLog.class); @@ -1907,16 +1906,22 @@ public void testStructuredPropertyIngestProposal() throws Exception { assertEquals( _entityServiceImpl.getAspect(opContext, firstPropertyUrn, definitionAspectName, 0), structuredPropertyDefinition); - Set defs = - _aspectDao - .streamAspects( - STRUCTURED_PROPERTY_ENTITY_NAME, STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME) - .map( - entityAspect -> - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), entityAspect) - .get() - .getAspect(StructuredPropertyDefinition.class)) - .collect(Collectors.toSet()); + + Set defs; + try (Stream stream = + _aspectDao.streamAspects( + STRUCTURED_PROPERTY_ENTITY_NAME, STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)) { + defs = + stream + .map( + entityAspect -> + EntityUtils.toSystemAspect( + opContext.getRetrieverContext().get(), entityAspect) + .get() + .getAspect(StructuredPropertyDefinition.class)) + .collect(Collectors.toSet()); + } + assertEquals(defs.size(), 1); assertEquals(defs, Set.of(structuredPropertyDefinition)); @@ -1983,16 +1988,20 @@ public void testStructuredPropertyIngestProposal() throws Exception { assertEquals( _entityServiceImpl.getAspect(opContext, secondPropertyUrn, definitionAspectName, 
0), secondDefinition); - defs = - _aspectDao - .streamAspects( - STRUCTURED_PROPERTY_ENTITY_NAME, STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME) - .map( - entityAspect -> - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), entityAspect) - .get() - .getAspect(StructuredPropertyDefinition.class)) - .collect(Collectors.toSet()); + try (Stream stream = + _aspectDao.streamAspects( + STRUCTURED_PROPERTY_ENTITY_NAME, STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)) { + defs = + stream + .map( + entityAspect -> + EntityUtils.toSystemAspect( + opContext.getRetrieverContext().get(), entityAspect) + .get() + .getAspect(StructuredPropertyDefinition.class)) + .collect(Collectors.toSet()); + } + assertEquals(defs.size(), 2); assertEquals(defs, Set.of(secondDefinition, structuredPropertyDefinition)); diff --git a/metadata-jobs/mce-consumer-job/src/test/java/com/linkedin/metadata/kafka/MceConsumerApplicationTest.java b/metadata-jobs/mce-consumer-job/src/test/java/com/linkedin/metadata/kafka/MceConsumerApplicationTest.java index 7b3716a894683..30bfeadb021a7 100644 --- a/metadata-jobs/mce-consumer-job/src/test/java/com/linkedin/metadata/kafka/MceConsumerApplicationTest.java +++ b/metadata-jobs/mce-consumer-job/src/test/java/com/linkedin/metadata/kafka/MceConsumerApplicationTest.java @@ -8,7 +8,7 @@ import com.linkedin.metadata.entity.restoreindices.RestoreIndicesResult; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.jobs.common.health.kafka.KafkaHealthIndicator; -import java.util.stream.Stream; +import java.util.List; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.web.client.TestRestTemplate; @@ -32,8 +32,8 @@ public class MceConsumerApplicationTest extends AbstractTestNGSpringContextTests public void testRestliServletConfig() { RestoreIndicesResult mockResult = new RestoreIndicesResult(); mockResult.setRowsMigrated(100); - when(_mockEntityService.streamRestoreIndices(any(OperationContext.class), any(), any())) - .thenReturn(Stream.of(mockResult)); + when(_mockEntityService.restoreIndices(any(OperationContext.class), any(), any())) + .thenReturn(List.of(mockResult)); String response = this.restTemplate.postForObject( diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java index a0191acfe5fed..1718beeaeaba3 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java @@ -298,11 +298,7 @@ public ResponseEntity> restoreIndices( .gePitEpochMs(gePitEpochMs) .lePitEpochMs(lePitEpochMs); - return ResponseEntity.of( - Optional.of( - entityService - .streamRestoreIndices(opContext, args, log::info) - .collect(Collectors.toList()))); + return ResponseEntity.of(Optional.of(entityService.restoreIndices(opContext, args, log::info))); } @Tag(name = "RestoreIndices") diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java index 78db69a91df5f..2c411f9ad960e 100644 --- 
a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java @@ -74,7 +74,8 @@ public static String restoreIndices( Map result = new HashMap<>(); result.put("args", args); result.put("result", entityService - .streamRestoreIndices(opContext, args, log::info) + .restoreIndices(opContext, args, log::info) + .stream() .map(RestoreIndicesResult::toString) .collect(Collectors.joining("\n"))); return result.toString(); diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java index 5250f06bddae0..0794ba72ff692 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java @@ -29,7 +29,6 @@ import java.util.Set; import java.util.concurrent.Future; import java.util.function.Consumer; -import java.util.stream.Stream; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -298,7 +297,7 @@ Integer getCountAspect( @Nonnull OperationContext opContext, @Nonnull String aspectName, @Nullable String urnLike); // TODO: Extract this to a different service, doesn't need to be here - Stream streamRestoreIndices( + List restoreIndices( @Nonnull OperationContext opContext, @Nonnull RestoreIndicesArgs args, @Nonnull Consumer logger); From c1e7574b5793e6fe59492f368e16b5135243244e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 15 May 2024 13:07:22 -0700 Subject: [PATCH 05/11] feat(ingest/airflow): support BigQueryInsertJobOperator (#10452) --- docs/lineage/airflow.md | 11 +++- .../src/datahub_airflow_plugin/_extractors.py | 60 ++++++++++++++++--- 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index d501ea407c072..f0952309c328a 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -8,7 +8,7 @@ If you're looking to schedule DataHub ingestion using Airflow, see the guide on The DataHub Airflow plugin supports: -- Automatic column-level lineage extraction from various operators e.g. SQL operators (including `MySqlOperator`, `PostgresOperator`, `SnowflakeOperator`, and more), `S3FileTransformOperator`, and more. +- Automatic column-level lineage extraction from various operators e.g. SQL operators (including `MySqlOperator`, `PostgresOperator`, `SnowflakeOperator`, `BigQueryInsertJobOperator`, and more), `S3FileTransformOperator`, and more. - Airflow DAG and tasks, including properties, ownership, and tags. - Task run information, including task successes and failures. - Manual lineage annotations using `inlets` and `outlets` on Airflow operators. @@ -166,6 +166,7 @@ Supported operators: - `SQLExecuteQueryOperator`, including any subclasses. Note that in newer versions of Airflow (generally Airflow 2.5+), most SQL operators inherit from this class. - `AthenaOperator` and `AWSAthenaOperator` - `BigQueryOperator` and `BigQueryExecuteQueryOperator` +- `BigQueryInsertJobOperator` (incubating) - `MySqlOperator` - `PostgresOperator` - `RedshiftSQLOperator` @@ -224,6 +225,14 @@ class DbtOperator(BaseOperator): If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. 
Reference the [Airflow docs](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) for more details. +### Custom Extractors + +Note: these are only supported in the v2 plugin. + +You can also create a custom extractor to extract lineage from any operator. This is useful if you're using a built-in Airflow operator for which we don't support automatic lineage extraction. + +See this [example PR](https://github.com/datahub-project/datahub/pull/10452) which adds a custom extractor for the `BigQueryInsertJobOperator` operator. + ## Emit Lineage Directly If you can't use the plugin or annotate inlets/outlets, you can also emit lineage using the `DatahubEmitterOperator`. diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py index 197ae5298aa83..f91c77591d35b 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py @@ -59,6 +59,10 @@ def __init__(self): for operator in _sql_operator_overrides: self.task_to_extractor.extractors[operator] = GenericSqlExtractor + self.task_to_extractor.extractors[ + "BigQueryInsertJobOperator" + ] = BigQueryInsertJobOperatorExtractor + self._graph: Optional["DataHubGraph"] = None @contextlib.contextmanager @@ -78,7 +82,7 @@ def _patch_extractors(self): unittest.mock.patch.object( SnowflakeExtractor, "default_schema", - property(snowflake_default_schema), + property(_snowflake_default_schema), ) ) @@ -166,12 +170,6 @@ def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata: task_name = f"{self.operator.dag_id}.{self.operator.task_id}" sql = self.operator.sql - run_facets = {} - job_facets = {"sql": SqlJobFacet(query=self._normalize_sql(sql))} - - # Prepare to run the SQL parser. - graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None) - default_database = getattr(self.operator, "database", None) if not default_database: default_database = self.database @@ -185,6 +183,31 @@ def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata: # Run the SQL parser. scheme = self.scheme platform = OL_SCHEME_TWEAKS.get(scheme, scheme) + + return _parse_sql_into_task_metadata( + self, + sql, + platform=platform, + default_database=default_database, + default_schema=default_schema, + ) + + +def _parse_sql_into_task_metadata( + self: "BaseExtractor", + sql: str, + platform: str, + default_database: Optional[str], + default_schema: Optional[str], +) -> TaskMetadata: + task_name = f"{self.operator.dag_id}.{self.operator.task_id}" + + run_facets = {} + job_facets = {"sql": SqlJobFacet(query=self._normalize_sql(sql))} + + # Prepare to run the SQL parser. 
+ graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None) + self.log.debug( "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s", "with graph client" if graph else "in offline mode", @@ -232,7 +255,28 @@ def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata: ) -def snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]: +class BigQueryInsertJobOperatorExtractor(BaseExtractor): + def extract(self) -> Optional[TaskMetadata]: + from airflow.providers.google.cloud.operators.bigquery import ( + BigQueryInsertJobOperator, # type: ignore + ) + + operator: "BigQueryInsertJobOperator" = self.operator + sql = operator.configuration.get("query") + if not sql: + self.log.warning("No query found in BigQueryInsertJobOperator") + return None + + return _parse_sql_into_task_metadata( + self, + sql, + platform="bigquery", + default_database=operator.project_id, + default_schema=None, + ) + + +def _snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]: if hasattr(self.operator, "schema") and self.operator.schema is not None: return self.operator.schema return ( From 047bfcc4126827178e0d9d02e5b62bd6b9610473 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 15 May 2024 13:08:27 -0700 Subject: [PATCH 06/11] fix(ingest): avoid using `_inner_dict` in urn iterator (#10492) --- metadata-ingestion/setup.cfg | 1 + metadata-ingestion/src/datahub/utilities/urns/urn_iter.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index 25ece8ac11ef0..16af43abe3be7 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -94,6 +94,7 @@ filterwarnings = ignore:pkg_resources is deprecated as an API:DeprecationWarning ignore:Did not recognize type:sqlalchemy.exc.SAWarning ignore::datahub.configuration.pydantic_migration_helpers.PydanticDeprecatedSince20 + ignore::datahub.configuration.common.ConfigurationWarning [coverage:run] # Because of some quirks in the way setup.cfg, coverage.py, pytest-cov, diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py index 3389a6fb05ee8..5bef17119675e 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py @@ -118,14 +118,15 @@ def _modify_at_path( assert isinstance(model, list) model[path[0]] = new_value elif isinstance(model, DictWrapper): - model._inner_dict[path[0]] = new_value + setattr(model, path[0], new_value) else: # MCPW setattr(model, path[0], new_value) elif isinstance(path[0], int): assert isinstance(model, list) _modify_at_path(model[path[0]], path[1:], new_value) elif isinstance(model, DictWrapper): - _modify_at_path(model._inner_dict[path[0]], path[1:], new_value) + item = getattr(model, path[0]) + _modify_at_path(item, path[1:], new_value) else: # MCPW _modify_at_path(getattr(model, path[0]), path[1:], new_value) From 56054e8b143b39680ca0152672042a7cbcae8fec Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 15 May 2024 13:11:48 -0700 Subject: [PATCH 07/11] fix(ingest/snowflake): use block sampling more conservatively (#10494) --- .../source/snowflake/snowflake_profiler.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 
4a73d26e11eaf..5e6ade29344eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -102,15 +102,26 @@ def get_batch_kwargs( # We are using fraction-based sampling here, instead of fixed-size sampling because # Fixed-size sampling can be slower than equivalent fraction-based sampling # as per https://docs.snowflake.com/en/sql-reference/constructs/sample#performance-considerations - - sample_method = "BERNOULLI" - if table.rows_count > self.config.profiling.sample_size * 1000: - # If the table is significantly larger than the sample size, we use BLOCK - # sampling for better performance. - sample_method = "BLOCK" - - sample_pc = 100 * self.config.profiling.sample_size / table.rows_count - custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE {sample_method} ({sample_pc:.8f})' + estimated_block_row_count = 500_000 + block_profiling_min_rows = 100 * estimated_block_row_count + + tablename = f'"{db_name}"."{schema_name}"."{table.name}"' + sample_pc = self.config.profiling.sample_size / table.rows_count + + overgeneration_factor = 1000 + if ( + table.rows_count > block_profiling_min_rows + and table.rows_count + > self.config.profiling.sample_size * overgeneration_factor + ): + # If the table is significantly larger than the sample size, do a first pass + # using block sampling to improve performance. We generate a table 1000 times + # larger than the target sample size, and then use normal sampling for the + # final size reduction. + tablename = f"(SELECT * FROM {tablename} TABLESAMPLE BLOCK ({100 * overgeneration_factor * sample_pc:.8f}))" + sample_pc = 1 / overgeneration_factor + + custom_sql = f"select * from {tablename} TABLESAMPLE BERNOULLI ({100 * sample_pc:.8f})" return { **super().get_batch_kwargs(table, schema_name, db_name), # Lowercase/Mixedcase table names in Snowflake do not work by default. 
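For context on the block-sampling change above, here is a small worked example (not from the patch; the row counts are hypothetical) of the two-pass sampling arithmetic: a 1-billion-row table profiled with a 10,000-row target sample gets a ~1% `BLOCK` pre-pass followed by a 0.1% `BERNOULLI` pass.

```python
# Hypothetical numbers illustrating the math used by the patch above.
rows_count = 1_000_000_000          # table size
sample_size = 10_000                # profiling target sample
overgeneration_factor = 1000

sample_pc = sample_size / rows_count                  # 1e-05
block_pc = 100 * overgeneration_factor * sample_pc    # 1.0 -> TABLESAMPLE BLOCK (1.00000000)
# The cheap block-level pass keeps roughly sample_size * 1000 = ~10M rows.
bernoulli_pc = 100 * (1 / overgeneration_factor)      # 0.1 -> TABLESAMPLE BERNOULLI (0.10000000)
# The row-level second pass then reduces the ~10M rows back to ~10,000.
print(f"BLOCK ({block_pc:.8f}) then BERNOULLI ({bernoulli_pc:.8f})")
```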
From 334f4311195aea32af9321420f330397a23f1ede Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 15 May 2024 13:15:02 -0700 Subject: [PATCH 08/11] feat(sdk): add DataHubGraph.get_timeseries_values() method (#10501) --- .../src/datahub/ingestion/graph/client.py | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 859b150757cdf..c6b2c8aad82e9 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -419,26 +419,47 @@ def get_latest_timeseries_value( {"field": k, "value": v, "condition": "EQUAL"} for k, v in filter_criteria_map.items() ] + filter = {"or": [{"and": filter_criteria}]} + + values = self.get_timeseries_values( + entity_urn=entity_urn, aspect_type=aspect_type, filter=filter, limit=1 + ) + if not values: + return None + + assert len(values) == 1, len(values) + return values[0] + + def get_timeseries_values( + self, + entity_urn: str, + aspect_type: Type[Aspect], + filter: Dict[str, Any], + limit: int = 10, + ) -> List[Aspect]: query_body = { "urn": entity_urn, "entity": guess_entity_type(entity_urn), "aspect": aspect_type.ASPECT_NAME, - "limit": 1, - "filter": {"or": [{"and": filter_criteria}]}, + "limit": limit, + "filter": filter, } end_point = f"{self.config.server}/aspects?action=getTimeseriesAspectValues" resp: Dict = self._post_generic(end_point, query_body) - values: list = resp.get("value", {}).get("values") - if values: - assert len(values) == 1, len(values) - aspect_json: str = values[0].get("aspect", {}).get("value") + + values: Optional[List] = resp.get("value", {}).get("values") + aspects: List[Aspect] = [] + for value in values or []: + aspect_json: str = value.get("aspect", {}).get("value") if aspect_json: - return aspect_type.from_obj(json.loads(aspect_json), tuples=False) + aspects.append( + aspect_type.from_obj(json.loads(aspect_json), tuples=False) + ) else: raise GraphError( f"Failed to find {aspect_type} in response {aspect_json}" ) - return None + return aspects def get_entity_raw( self, entity_urn: str, aspects: Optional[List[str]] = None From 14eaa06ae7b0aec36537b6e91d83f97b33f71b78 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 15 May 2024 15:35:37 -0500 Subject: [PATCH 09/11] fix(mcp): fix mcp key aspect (#10503) --- .../entity/ebean/batch/ChangeItemImpl.java | 23 ++++++++++++------- .../metadata/entity/EntityServiceTest.java | 16 +++++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java index 2f3bce6e75e14..30e9251982f10 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java @@ -99,14 +99,21 @@ public SystemAspect getSystemAspect(@Nullable Long version) { @Nonnull public MetadataChangeProposal getMetadataChangeProposal() { - final MetadataChangeProposal mcp = new MetadataChangeProposal(); - mcp.setEntityUrn(getUrn()); - mcp.setChangeType(getChangeType()); - mcp.setEntityType(getEntitySpec().getName()); - mcp.setAspectName(getAspectName()); - 
mcp.setAspect(GenericRecordUtils.serializeAspect(getRecordTemplate())); - mcp.setSystemMetadata(getSystemMetadata()); - return mcp; + if (metadataChangeProposal != null) { + return metadataChangeProposal; + } else { + final MetadataChangeProposal mcp = new MetadataChangeProposal(); + mcp.setEntityUrn(getUrn()); + mcp.setChangeType(getChangeType()); + mcp.setEntityType(getEntitySpec().getName()); + mcp.setAspectName(getAspectName()); + mcp.setAspect(GenericRecordUtils.serializeAspect(getRecordTemplate())); + mcp.setSystemMetadata(getSystemMetadata()); + mcp.setEntityKeyAspect( + GenericRecordUtils.serializeAspect( + EntityKeyUtils.convertUrnToEntityKey(getUrn(), entitySpec.getKeyAspectSpec()))); + return mcp; + } } public static class ChangeItemImplBuilder { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 6086f02b713bb..45d4fe4f46c99 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -58,6 +58,7 @@ import com.linkedin.metadata.service.UpdateIndicesService; import com.linkedin.metadata.snapshot.CorpUserSnapshot; import com.linkedin.metadata.snapshot.Snapshot; +import com.linkedin.metadata.utils.EntityKeyUtils; import com.linkedin.metadata.utils.GenericRecordUtils; import com.linkedin.mxe.GenericAspect; import com.linkedin.mxe.MetadataChangeLog; @@ -534,6 +535,11 @@ public void testReingestAspectsGetLatestAspects() throws Exception { initialChangeLog.setAspect(aspect); initialChangeLog.setSystemMetadata(metadata1); + initialChangeLog.setEntityKeyAspect( + GenericRecordUtils.serializeAspect( + EntityKeyUtils.convertUrnToEntityKey( + entityUrn, + _testEntityRegistry.getEntitySpec(entityUrn.getEntityType()).getKeyAspectSpec()))); final MetadataChangeLog restateChangeLog = new MetadataChangeLog(); restateChangeLog.setEntityType(entityUrn.getEntityType()); @@ -596,6 +602,11 @@ public void testReingestLineageAspect() throws Exception { initialChangeLog.setAspect(aspect); initialChangeLog.setSystemMetadata(metadata1); + initialChangeLog.setEntityKeyAspect( + GenericRecordUtils.serializeAspect( + EntityKeyUtils.convertUrnToEntityKey( + entityUrn, + _testEntityRegistry.getEntitySpec(entityUrn.getEntityType()).getKeyAspectSpec()))); final MetadataChangeLog restateChangeLog = new MetadataChangeLog(); restateChangeLog.setEntityType(entityUrn.getEntityType()); @@ -607,6 +618,11 @@ public void testReingestLineageAspect() throws Exception { restateChangeLog.setSystemMetadata(metadata1); restateChangeLog.setPreviousAspectValue(aspect); restateChangeLog.setPreviousSystemMetadata(simulatePullFromDB(metadata1, SystemMetadata.class)); + restateChangeLog.setEntityKeyAspect( + GenericRecordUtils.serializeAspect( + EntityKeyUtils.convertUrnToEntityKey( + entityUrn, + _testEntityRegistry.getEntitySpec(entityUrn.getEntityType()).getKeyAspectSpec()))); Map latestAspects = _entityServiceImpl.getLatestAspectsForUrn( From bc9250c9040a80be06e926818e15ac2ac3bbe472 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 15 May 2024 22:30:47 -0700 Subject: [PATCH 10/11] fix(ingest): fix bug in incremental lineage (#10515) --- .../api/incremental_lineage_helper.py | 20 +++-- .../test_incremental_lineage_helper.py | 39 ++++++++++ ...test_incremental_lineage_pass_through.json | 73 +++++++++++++++++++ 3 files changed, 125 insertions(+), 7 deletions(-) create mode 100644 
metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_pass_through.json diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py index f7ec22e1ec9c6..29e1f63dd452e 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py @@ -4,6 +4,7 @@ from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import datahub_guid, set_aspect +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.schema_classes import ( ChartInfoClass, @@ -105,21 +106,26 @@ def auto_incremental_lineage( for wu in stream: urn = wu.get_urn() - lineage_aspect: Optional[UpstreamLineageClass] = wu.get_aspect_of_type( - UpstreamLineageClass - ) if isinstance(wu.metadata, MetadataChangeEventClass): - set_aspect( - wu.metadata, None, UpstreamLineageClass - ) # we'll handle upstreamLineage separately below + lineage_aspect = wu.get_aspect_of_type(UpstreamLineageClass) + set_aspect(wu.metadata, None, UpstreamLineageClass) if len(wu.metadata.proposedSnapshot.aspects) > 0: yield wu - if lineage_aspect: + if lineage_aspect and lineage_aspect.upstreams: + yield convert_upstream_lineage_to_patch( + urn, lineage_aspect, wu.metadata.systemMetadata + ) + elif isinstance(wu.metadata, MetadataChangeProposalWrapper) and isinstance( + wu.metadata.aspect, UpstreamLineageClass + ): + lineage_aspect = wu.metadata.aspect if lineage_aspect.upstreams: yield convert_upstream_lineage_to_patch( urn, lineage_aspect, wu.metadata.systemMetadata ) + else: + yield wu class IncrementalLineageConfigMixin(ConfigModel): diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py index 0f98054ab1d38..2d43b24e10763 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -1,9 +1,11 @@ from typing import List +import datahub.emitter.mce_builder as builder import datahub.metadata.schema_classes as models from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage +from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.sink.file import write_metadata_file from tests.test_helpers import mce_helpers @@ -143,3 +145,40 @@ def test_incremental_column_lineage(tmp_path, pytestconfig): mce_helpers.check_golden_file( pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file ) + + +def test_incremental_lineage_pass_through(tmp_path, pytestconfig): + test_resources_dir = pytestconfig.rootpath / "tests/unit/api/source_helpers" + test_file = tmp_path / "test_incremental_lineage_pass_through.json" + golden_file = test_resources_dir / "test_incremental_lineage_pass_through.json" + + urn = builder.make_dataset_urn("bigquery", "downstream") + dataset_mce = builder.make_lineage_mce( + [ + builder.make_dataset_urn("bigquery", "upstream1"), + builder.make_dataset_urn("bigquery", "upstream2"), + ], + urn, + ) + props = models.DatasetPropertiesClass(name="downstream") + assert 
isinstance(dataset_mce.proposedSnapshot, models.DatasetSnapshotClass) + dataset_mce.proposedSnapshot.aspects.append(props) + + ownership = MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=models.OwnershipClass(owners=[]), + systemMetadata=system_metadata, + ) + + processed_wus = auto_incremental_lineage( + incremental_lineage=True, + stream=auto_workunit([dataset_mce, ownership]), + ) + + write_metadata_file( + test_file, + [wu.metadata for wu in processed_wus], + ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file + ) diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_pass_through.json b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_pass_through.json new file mode 100644 index 0000000000000..61913a7b7a91a --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_pass_through.json @@ -0,0 +1,73 @@ +[ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,downstream,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "downstream", + "tags": [] + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,downstream,PROD)", + "changeType": "PATCH", + "aspectName": "upstreamLineage", + "aspect": { + "json": [ + { + "op": "add", + "path": "/upstreams/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream1,PROD)", + "value": { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream1,PROD)", + "type": "TRANSFORMED" + } + }, + { + "op": "add", + "path": "/upstreams/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream2,PROD)", + "value": { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream2,PROD)", + "type": "TRANSFORMED" + } + } + ] + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,downstream,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "run-id", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file From 3d5735cbc5899bf294ab3858a78cb9094dea3136 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 15 May 2024 22:31:05 -0700 Subject: [PATCH 11/11] chore(ingest): run pyupgrade for python 3.8 (#10513) --- .../datahub/api/entities/datajob/dataflow.py | 6 +- .../datahub/api/entities/datajob/datajob.py | 6 +- .../api/entities/dataproduct/dataproduct.py | 6 +- .../datahub/api/entities/dataset/dataset.py | 5 +- .../src/datahub/api/entities/forms/forms.py | 6 +- .../structuredproperties.py | 2 +- .../src/datahub/cli/config_utils.py | 2 +- .../src/datahub/cli/docker_check.py | 2 +- .../src/datahub/cli/docker_cli.py | 12 +-- .../src/datahub/cli/ingest_cli.py | 2 +- .../src/datahub/cli/quickstart_versioning.py | 4 +- .../src/datahub/configuration/common.py | 15 ++- .../src/datahub/configuration/git.py | 2 +- .../datahub/configuration/source_common.py | 6 +- .../src/datahub/emitter/request_helper.py | 2 +- .../src/datahub/ingestion/api/report.py | 4 +- 
.../ingestion/extractor/json_ref_patch.py | 2 +- .../ingestion/extractor/schema_util.py | 3 +- .../src/datahub/ingestion/graph/client.py | 9 +- .../src/datahub/ingestion/source/aws/glue.py | 2 +- .../aws/sagemaker_processors/job_classes.py | 4 +- .../ingestion/source/bigquery_v2/bigquery.py | 4 +- .../ingestion/source/bigquery_v2/lineage.py | 28 +++--- .../datahub/ingestion/source/csv_enricher.py | 22 ++--- .../data_lake_common/data_lake_utils.py | 3 +- .../ingestion/source/dbt/dbt_common.py | 4 +- .../datahub/ingestion/source/dbt/dbt_core.py | 2 +- .../ingestion/source/delta_lake/source.py | 5 +- .../src/datahub/ingestion/source/file.py | 2 +- .../ingestion/source/fivetran/fivetran.py | 2 +- .../source/fivetran/fivetran_log_api.py | 2 +- .../ingestion/source/identity/azure_ad.py | 4 +- .../datahub/ingestion/source/identity/okta.py | 11 +-- .../datahub/ingestion/source/kafka_connect.py | 10 +- .../src/datahub/ingestion/source/ldap.py | 2 +- .../ingestion/source/looker/lkml_patched.py | 2 +- .../ingestion/source/looker/looker_config.py | 3 +- .../ingestion/source/looker/looker_source.py | 7 +- .../ingestion/source/looker/looker_usage.py | 6 +- .../ingestion/source/looker/lookml_source.py | 2 +- .../source/metadata/business_glossary.py | 5 +- .../src/datahub/ingestion/source/mongodb.py | 2 +- .../source/powerbi/m_query/resolver.py | 2 +- .../ingestion/source/powerbi/powerbi.py | 6 +- .../powerbi/rest_api_wrapper/data_classes.py | 2 +- .../powerbi/rest_api_wrapper/data_resolver.py | 12 +-- .../powerbi_report_server/report_server.py | 10 +- .../report_server_domain.py | 8 +- .../src/datahub/ingestion/source/pulsar.py | 4 +- .../ingestion/source/qlik_sense/qlik_api.py | 2 +- .../ingestion/source/qlik_sense/qlik_sense.py | 2 +- .../ingestion/source/redshift/lineage_v2.py | 4 +- .../src/datahub/ingestion/source/s3/config.py | 4 +- .../datahub/ingestion/source/salesforce.py | 10 +- .../ingestion/source/schema/json_schema.py | 4 +- .../ingestion/source/sigma/sigma_api.py | 2 +- .../source/snowflake/snowflake_usage_v2.py | 2 +- .../source/snowflake/snowflake_utils.py | 2 +- .../source/snowflake/snowflake_v2.py | 2 +- .../ingestion/source/sql/clickhouse.py | 4 +- .../src/datahub/ingestion/source/sql/hive.py | 6 +- .../datahub/ingestion/source/sql/oracle.py | 8 +- .../ingestion/source/sql/sql_common.py | 2 +- .../datahub/ingestion/source/sql/sql_utils.py | 3 +- .../src/datahub/ingestion/source/sql/trino.py | 4 +- .../datahub/ingestion/source/sql/vertica.py | 2 +- .../src/datahub/ingestion/source/tableau.py | 3 +- .../source/unity/hive_metastore_proxy.py | 2 +- .../datahub/ingestion/source/unity/source.py | 2 +- .../transformer/pattern_cleanup_ownership.py | 6 +- .../integrations/great_expectations/action.py | 16 +--- .../datahub/sql_parsing/schema_resolver.py | 2 +- .../datahub/sql_parsing/sqlglot_lineage.py | 4 +- .../src/datahub/telemetry/telemetry.py | 6 +- .../utilities/file_backed_collections.py | 3 +- .../datahub/utilities/hive_schema_to_avro.py | 12 +-- .../integration/bigquery_v2/test_bigquery.py | 2 +- .../tests/integration/dbt/test_dbt.py | 6 +- .../tests/integration/git/test_git_clone.py | 22 ++--- .../tests/integration/iceberg/test_iceberg.py | 4 +- .../kafka-connect/test_kafka_connect.py | 8 +- .../integration/kafka/test_kafka_state.py | 8 +- .../tests/integration/lookml/test_lookml.py | 2 +- .../integration/metabase/test_metabase.py | 2 +- .../tests/integration/mode/test_mode.py | 2 +- .../tests/integration/oracle/common.py | 2 +- .../tests/integration/snowflake/common.py | 46 ++++----- 
.../snowflake/test_snowflake_failures.py | 2 +- .../integration/sql_server/test_sql_server.py | 4 +- .../tests/test_helpers/mce_helpers.py | 96 ++++++++----------- .../api/source_helpers/test_source_helpers.py | 2 +- .../tests/unit/config/test_config_loader.py | 32 +++---- .../state/test_stateful_ingestion.py | 2 +- .../tests/unit/test_pipeline.py | 2 +- .../tests/unit/test_snowflake_shares.py | 2 +- .../test_advanced_thread_executor.py | 6 +- .../tests/unit/utilities/test_ratelimiter.py | 2 +- 97 files changed, 297 insertions(+), 350 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py index acd708ee81a5c..cb2c536bbab20 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py @@ -80,9 +80,9 @@ def __post_init__(self): ) def generate_ownership_aspect(self): - owners = set([builder.make_user_urn(owner) for owner in self.owners]) | set( - [builder.make_group_urn(owner) for owner in self.group_owners] - ) + owners = {builder.make_user_urn(owner) for owner in self.owners} | { + builder.make_group_urn(owner) for owner in self.group_owners + } ownership = OwnershipClass( owners=[ OwnerClass( diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 0ad786d68643d..69cbcc4c3e45b 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -70,9 +70,9 @@ def __post_init__(self): ) def generate_ownership_aspect(self) -> Iterable[OwnershipClass]: - owners = set([builder.make_user_urn(owner) for owner in self.owners]) | set( - [builder.make_group_urn(owner) for owner in self.group_owners] - ) + owners = {builder.make_user_urn(owner) for owner in self.owners} | { + builder.make_group_urn(owner) for owner in self.group_owners + } ownership = OwnershipClass( owners=[ OwnerClass( diff --git a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py index 61bda90447c62..408d6bc7256c6 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py +++ b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py @@ -276,7 +276,7 @@ def from_yaml( cls, file: Path, graph: DataHubGraph, - ) -> "DataProduct": + ) -> DataProduct: with open(file) as fp: yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip) orig_dictionary = yaml.load(fp) @@ -291,7 +291,7 @@ def from_yaml( return parsed_data_product @classmethod - def from_datahub(cls, graph: DataHubGraph, id: str) -> "DataProduct": + def from_datahub(cls, graph: DataHubGraph, id: str) -> DataProduct: data_product_properties: Optional[ DataProductPropertiesClass ] = graph.get_aspect(id, DataProductPropertiesClass) @@ -384,7 +384,7 @@ def _patch_ownership( patches_drop[i] = o # Figure out what if any are new owners to add - new_owners_to_add = set(o for o in new_owner_type_map) - set(owners_matched) + new_owners_to_add = {o for o in new_owner_type_map} - set(owners_matched) if new_owners_to_add: for new_owner in new_owners_to_add: new_owner_type = new_owner_type_map[new_owner] diff --git a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py index 4e74a410b5f64..c71bced38f8aa 100644 --- 
a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py +++ b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py @@ -242,7 +242,7 @@ def generate_mcp( if self.schema_metadata: if self.schema_metadata.file: - with open(self.schema_metadata.file, "r") as schema_fp: + with open(self.schema_metadata.file) as schema_fp: schema_string = schema_fp.read() schema_metadata = SchemaMetadataClass( schemaName=self.name or self.id or self.urn or "", @@ -377,8 +377,7 @@ def generate_mcp( type="COPY", ) ) - for patch_event in patch_builder.build(): - yield patch_event + yield from patch_builder.build() logger.info(f"Created dataset {self.urn}") diff --git a/metadata-ingestion/src/datahub/api/entities/forms/forms.py b/metadata-ingestion/src/datahub/api/entities/forms/forms.py index fd260e3171ed8..5ac08b6e64ed4 100644 --- a/metadata-ingestion/src/datahub/api/entities/forms/forms.py +++ b/metadata-ingestion/src/datahub/api/entities/forms/forms.py @@ -106,7 +106,7 @@ def create(file: str) -> None: emitter: DataHubGraph with get_default_graph() as emitter: - with open(file, "r") as fp: + with open(file) as fp: forms: List[dict] = yaml.safe_load(fp) for form_raw in forms: form = Forms.parse_obj(form_raw) @@ -204,7 +204,7 @@ def validate_prompts(self, emitter: DataHubGraph) -> List[FormPromptClass]: def upload_entities_for_form(self, emitter: DataHubGraph) -> Union[None, Exception]: if self.entities and self.entities.urns: formatted_entity_urns = ", ".join( - ['"{}"'.format(value) for value in self.entities.urns] + [f'"{value}"' for value in self.entities.urns] ) query = UPLOAD_ENTITIES_FOR_FORMS.format( form_urn=self.urn, entity_urns=formatted_entity_urns @@ -281,7 +281,7 @@ def add_owners(self, emitter: DataHubGraph) -> Union[None, Exception]: @staticmethod def format_form_filter(field: str, urns: List[str]) -> str: - formatted_urns = ", ".join(['"{}"'.format(urn) for urn in urns]) + formatted_urns = ", ".join([f'"{urn}"' for urn in urns]) return FIELD_FILTER_TEMPLATE.format(field=field, values=formatted_urns) @staticmethod diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index eacbff4b31d93..ed97948de9034 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -98,7 +98,7 @@ def create(file: str) -> None: emitter: DataHubGraph with get_default_graph() as emitter: - with open(file, "r") as fp: + with open(file) as fp: structuredproperties: List[dict] = yaml.safe_load(fp) for structuredproperty_raw in structuredproperties: structuredproperty = StructuredProperties.parse_obj( diff --git a/metadata-ingestion/src/datahub/cli/config_utils.py b/metadata-ingestion/src/datahub/cli/config_utils.py index 7877a6bf6df59..8cddc41551038 100644 --- a/metadata-ingestion/src/datahub/cli/config_utils.py +++ b/metadata-ingestion/src/datahub/cli/config_utils.py @@ -84,7 +84,7 @@ def ensure_datahub_config() -> None: def get_client_config(as_dict: bool = False) -> Union[Optional[DatahubConfig], dict]: - with open(DATAHUB_CONFIG_PATH, "r") as stream: + with open(DATAHUB_CONFIG_PATH) as stream: try: config_json = yaml.safe_load(stream) if as_dict: diff --git a/metadata-ingestion/src/datahub/cli/docker_check.py b/metadata-ingestion/src/datahub/cli/docker_check.py index b80c2f3df01da..ff3965455d163 100644 --- 
a/metadata-ingestion/src/datahub/cli/docker_check.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -203,7 +203,7 @@ def check_docker_quickstart() -> QuickstartStatus: all_containers = set() for config_file in config_files: - with open(config_file, "r") as config_file: + with open(config_file) as config_file: all_containers.update( yaml.safe_load(config_file).get("services", {}).keys() ) diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index e35d4a5c93c2d..707a9cab076e6 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -76,7 +76,7 @@ class Architectures(Enum): m2 = "m2" -@functools.lru_cache() +@functools.lru_cache def _docker_subprocess_env() -> Dict[str, str]: # platform.machine() is equivalent to `uname -m`, as per https://stackoverflow.com/a/45124927/5004662 DOCKER_COMPOSE_PLATFORM: str = "linux/" + platform.machine() @@ -316,7 +316,7 @@ def _restore( assert os.path.exists( resolved_restore_file ), f"File {resolved_restore_file} does not exist" - with open(resolved_restore_file, "r") as fp: + with open(resolved_restore_file) as fp: result = subprocess.run( [ "bash", @@ -324,8 +324,7 @@ def _restore( f"docker exec -i {DOCKER_COMPOSE_PROJECT_NAME}-mysql-1 bash -c 'mysql -uroot -pdatahub datahub '", ], stdin=fp, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, ) if result.returncode != 0: logger.error("Failed to run MySQL restore") @@ -381,7 +380,7 @@ def _restore( ) env_fp.flush() if logger.isEnabledFor(logging.DEBUG): - with open(env_fp.name, "r") as env_fp_reader: + with open(env_fp.name) as env_fp_reader: logger.debug(f"Env file contents: {env_fp_reader.read()}") # continue to issue the restore indices command @@ -401,8 +400,7 @@ def _restore( + "acryldata/datahub-upgrade:${DATAHUB_VERSION:-head}" + " -u RestoreIndices -a clean", ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, ) logger.info( f"Index restore command finished with status {result.returncode}" diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 2e66b18e48145..453f1d2934372 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -588,6 +588,6 @@ def rollback( for row in unsafe_entities: writer.writerow([row.get("urn")]) - except IOError as e: + except OSError as e: logger.exception(f"Unable to save rollback failure report: {e}") sys.exit(f"Unable to write reports to {report_dir}") diff --git a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py index 493869ac77bb8..9739af5127f4d 100644 --- a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py +++ b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py @@ -55,7 +55,7 @@ def fetch_quickstart_config(cls) -> "QuickstartVersionMappingConfig": "LOCAL_QUICKSTART_MAPPING_FILE is set, will try to read from local file." 
) path = os.path.expanduser(LOCAL_QUICKSTART_MAPPING_FILE) - with open(path, "r") as f: + with open(path) as f: config_raw = yaml.safe_load(f) return cls.parse_obj(config_raw) @@ -70,7 +70,7 @@ def fetch_quickstart_config(cls) -> "QuickstartVersionMappingConfig": ) try: path = os.path.expanduser(DEFAULT_LOCAL_CONFIG_PATH) - with open(path, "r") as f: + with open(path) as f: config_raw = yaml.safe_load(f) except Exception: logger.debug("Couldn't read from local file either.") diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 7aaa1706a6420..a5971258bcdaa 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -2,13 +2,24 @@ import unittest.mock from abc import ABC, abstractmethod from enum import auto -from typing import IO, Any, ClassVar, Dict, List, Optional, Type, TypeVar, Union +from typing import ( + IO, + Any, + ClassVar, + Dict, + List, + Optional, + Type, + TypeVar, + Union, + runtime_checkable, +) import pydantic from cached_property import cached_property from pydantic import BaseModel, Extra, ValidationError from pydantic.fields import Field -from typing_extensions import Protocol, runtime_checkable +from typing_extensions import Protocol from datahub.configuration._config_enum import ConfigEnum from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index 3c76c8da0d571..d237cd9ddd306 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -101,7 +101,7 @@ def deploy_key_filled_from_deploy_key_file( if v is None: deploy_key_file = values.get("deploy_key_file") if deploy_key_file is not None: - with open(deploy_key_file, "r") as fp: + with open(deploy_key_file) as fp: deploy_key = SecretStr(fp.read()) return deploy_key return v diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 4b982db2715c2..a792201f9defe 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -10,9 +10,9 @@ DEFAULT_ENV = FabricTypeClass.PROD # Get all the constants from the FabricTypeClass. It's not an enum, so this is a bit hacky but works. 
-ALL_ENV_TYPES: Set[str] = set( - [value for name, value in vars(FabricTypeClass).items() if not name.startswith("_")] -) +ALL_ENV_TYPES: Set[str] = { + value for name, value in vars(FabricTypeClass).items() if not name.startswith("_") +} class PlatformInstanceConfigMixin(ConfigModel): diff --git a/metadata-ingestion/src/datahub/emitter/request_helper.py b/metadata-ingestion/src/datahub/emitter/request_helper.py index 5263ba1912592..4e1ec026648b8 100644 --- a/metadata-ingestion/src/datahub/emitter/request_helper.py +++ b/metadata-ingestion/src/datahub/emitter/request_helper.py @@ -25,4 +25,4 @@ def make_curl_command( ), url, ] - return " ".join(shlex.quote(fragment) for fragment in fragments) + return shlex.join(fragments) diff --git a/metadata-ingestion/src/datahub/ingestion/api/report.py b/metadata-ingestion/src/datahub/ingestion/api/report.py index 08b20d9e85691..4a74d6cbc6268 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/report.py +++ b/metadata-ingestion/src/datahub/ingestion/api/report.py @@ -5,12 +5,12 @@ from dataclasses import dataclass from datetime import datetime, timedelta from enum import Enum -from typing import Any, Optional +from typing import Any, Optional, runtime_checkable import humanfriendly import pydantic from pydantic import BaseModel -from typing_extensions import Literal, Protocol, runtime_checkable +from typing_extensions import Literal, Protocol from datahub.ingestion.api.report_helpers import format_datetime_relative from datahub.utilities.lossy_collections import LossyList diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_ref_patch.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_ref_patch.py index daf43bd87ba60..2224a096f5387 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_ref_patch.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_ref_patch.py @@ -15,7 +15,7 @@ def title_swapping_callback(self: JsonRef) -> dict: try: base_doc = self.loader(uri) except Exception as e: - raise self._error("%s: %s" % (e.__class__.__name__, str(e)), cause=e) from e + raise self._error(f"{e.__class__.__name__}: {str(e)}", cause=e) from e base_doc = _replace_refs( base_doc, **{**self._ref_kwargs, "base_uri": uri, "recursing": False} ) diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py index df0b732833fbe..d5af4f7a2389c 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py @@ -436,8 +436,7 @@ def gen_items_from_list_tuple_or_scalar( val: Any, ) -> Iterable[avro.schema.Schema]: if isinstance(val, (list, tuple)): - for i in val: - yield i + yield from val else: yield val diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index c6b2c8aad82e9..be3aa2e80780a 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -324,6 +324,7 @@ def get_ownership(self, entity_urn: str) -> Optional[OwnershipClass]: def get_schema_metadata(self, entity_urn: str) -> Optional[SchemaMetadataClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=SchemaMetadataClass) + @deprecated(reason="Use get_aspect directly.") def get_domain_properties(self, entity_urn: str) -> Optional[DomainPropertiesClass]: return self.get_aspect(entity_urn=entity_urn, 
aspect_type=DomainPropertiesClass) @@ -343,11 +344,9 @@ def get_glossary_terms(self, entity_urn: str) -> Optional[GlossaryTermsClass]: def get_domain(self, entity_urn: str) -> Optional[DomainsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass) + @deprecated(reason="Use get_aspect directly.") def get_browse_path(self, entity_urn: str) -> Optional[BrowsePathsClass]: - return self.get_aspect( - entity_urn=entity_urn, - aspect_type=BrowsePathsClass, - ) + return self.get_aspect(entity_urn=entity_urn, aspect_type=BrowsePathsClass) def get_usage_aspects_from_urn( self, entity_urn: str, start_timestamp: int, end_timestamp: int @@ -1095,7 +1094,7 @@ def delete_references_to_urn( related_aspects = response.get("relatedAspects", []) return reference_count, related_aspects - @functools.lru_cache() + @functools.lru_cache def _make_schema_resolver( self, platform: str, diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 0ac13b256eb03..062ab381d40b7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1009,7 +1009,7 @@ def _transform_extraction(self) -> Iterable[MetadataWorkUnit]: # in Glue, it's possible for two buckets to have files of different extensions # if this happens, we append the extension in the URN so the sources can be distinguished # see process_dataflow_node() for details - s3_formats: DefaultDict[str, Set[Optional[str]]] = defaultdict(lambda: set()) + s3_formats: DefaultDict[str, Set[Optional[str]]] = defaultdict(set) for dag in dags.values(): if dag is not None: for s3_name, extension in self.get_dataflow_s3_names(dag): diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/job_classes.py b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/job_classes.py index 442c5eb2e0a8f..6e0e352db4af7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/job_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/job_classes.py @@ -1,6 +1,4 @@ -from typing import Dict - -from typing_extensions import Final +from typing import Dict, Final from datahub.metadata.schema_classes import JobStatusClass diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 27ad2008dae00..eecc0f4372969 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -223,7 +223,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): } def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): - super(BigqueryV2Source, self).__init__(config, ctx) + super().__init__(config, ctx) self.config: BigQueryV2Config = config self.report: BigQueryV2Report = BigQueryV2Report() self.classification_handler = ClassificationHandler(self.config, self.report) @@ -340,7 +340,7 @@ def metadata_read_capability_test( ) -> CapabilityReport: for project_id in project_ids: try: - logger.info((f"Metadata read capability test for project {project_id}")) + logger.info(f"Metadata read capability test for project {project_id}") client: bigquery.Client = config.get_bigquery_client() assert client bigquery_data_dictionary = BigQuerySchemaApi( diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index c8c1e7c893c6c..c41207ec67f62 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -551,22 +551,20 @@ def lineage_via_catalog_lineage_api( # Only builds lineage map when the table has upstreams logger.debug("Found %d upstreams for table %s", len(upstreams), table) if upstreams: - lineage_map[destination_table_str] = set( - [ - LineageEdge( - table=str( - BigQueryTableRef( - table_identifier=BigqueryTableIdentifier.from_string_name( - source_table - ) + lineage_map[destination_table_str] = { + LineageEdge( + table=str( + BigQueryTableRef( + table_identifier=BigqueryTableIdentifier.from_string_name( + source_table ) - ), - column_mapping=frozenset(), - auditStamp=curr_date, - ) - for source_table in upstreams - ] - ) + ) + ), + column_mapping=frozenset(), + auditStamp=curr_date, + ) + for source_table in upstreams + } return lineage_map except Exception as e: self.error( diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index ec3d1715aaece..d998c37d32ed2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -154,9 +154,7 @@ def get_resource_glossary_terms_work_unit( # If we want to overwrite or there are no existing terms, create a new GlossaryTerms object current_terms = GlossaryTermsClass(term_associations, get_audit_stamp()) else: - current_term_urns: Set[str] = set( - [term.urn for term in current_terms.terms] - ) + current_term_urns: Set[str] = {term.urn for term in current_terms.terms} term_associations_filtered: List[GlossaryTermAssociationClass] = [ association for association in term_associations @@ -192,7 +190,7 @@ def get_resource_tags_work_unit( # If we want to overwrite or there are no existing tags, create a new GlobalTags object current_tags = GlobalTagsClass(tag_associations) else: - current_tag_urns: Set[str] = set([tag.tag for tag in current_tags.tags]) + current_tag_urns: Set[str] = {tag.tag for tag in current_tags.tags} tag_associations_filtered: List[TagAssociationClass] = [ association for association in tag_associations @@ -453,9 +451,9 @@ def process_sub_resource_row( field_match = True if has_terms: if field_info.glossaryTerms and not self.should_overwrite: - current_term_urns = set( - [term.urn for term in field_info.glossaryTerms.terms] - ) + current_term_urns = { + term.urn for term in field_info.glossaryTerms.terms + } term_associations_filtered = [ association for association in term_associations @@ -472,9 +470,9 @@ def process_sub_resource_row( if has_tags: if field_info.globalTags and not self.should_overwrite: - current_tag_urns = set( - [tag.tag for tag in field_info.globalTags.tags] - ) + current_tag_urns = { + tag.tag for tag in field_info.globalTags.tags + } tag_associations_filtered = [ association for association in tag_associations @@ -631,9 +629,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: f"Cannot read remote file {self.config.filename}, error:{e}" ) else: - with open( - pathlib.Path(self.config.filename), mode="r", encoding="utf-8-sig" - ) as f: + with open(pathlib.Path(self.config.filename), encoding="utf-8-sig") as f: rows = list(csv.DictReader(f, delimiter=self.config.delimiter)) 
for row in rows: diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py index b04718a9eabba..5393dd4835d8c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py @@ -58,8 +58,7 @@ def create_emit_containers( ) self.processed_containers.append(container_key.guid()) logger.debug(f"Creating container with key: {container_key}") - for wu in container_wus: - yield wu + yield from container_wus def gen_folder_key(self, abs_path): return FolderKey( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index d7a3aba7065ca..ebba664a811c7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1853,7 +1853,7 @@ def get_transformed_tags_by_prefix( entity_urn: str, tags_prefix_filter: str, ) -> List[TagAssociationClass]: - tag_set = set([new_tag.tag for new_tag in new_tags]) + tag_set = {new_tag.tag for new_tag in new_tags} if self.ctx.graph: existing_tags_class = self.ctx.graph.get_tags(entity_urn) @@ -1868,7 +1868,7 @@ def get_transformed_tags_by_prefix( def get_transformed_terms( self, new_terms: List[GlossaryTermAssociation], entity_urn: str ) -> List[GlossaryTermAssociation]: - term_id_set = set([term.urn for term in new_terms]) + term_id_set = {term.urn for term in new_terms} if self.ctx.graph: existing_terms_class = self.ctx.graph.get_glossary_terms(entity_urn) if existing_terms_class and existing_terms_class.terms: diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index 0fc35ddd281c8..750fee227b97a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -481,7 +481,7 @@ def load_file_as_json( ) return json.loads(response["Body"].read().decode("utf-8")) else: - with open(uri, "r") as f: + with open(uri) as f: return json.load(f) def loadManifestAndCatalog( diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py index 39066b0c26553..4f427aa203c20 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py @@ -151,7 +151,7 @@ def delta_type_to_hive_type(self, field_type: Any) -> str: and create the native datatype """ parsed_struct += ( - "{0}:{1}".format( + "{}:{}".format( field.get("name"), self.delta_type_to_hive_type(field.get("type")), ) @@ -343,8 +343,7 @@ def process_folder(self, path: str) -> Iterable[MetadataWorkUnit]: delta_table = read_delta_table(path, self.storage_options, self.source_config) if delta_table: logger.debug(f"Delta table found at: {path}") - for wu in self.ingest_table(delta_table, path.rstrip("/")): - yield wu + yield from self.ingest_table(delta_table, path.rstrip("/")) else: for folder in self.get_folders(path): yield from self.process_folder(folder) diff --git a/metadata-ingestion/src/datahub/ingestion/source/file.py b/metadata-ingestion/src/datahub/ingestion/source/file.py index 590aa59f7b5b6..49cc314426eb5 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/file.py +++ b/metadata-ingestion/src/datahub/ingestion/source/file.py @@ -256,7 +256,7 @@ def _iterate_file(self, path: str) -> Iterable[Tuple[int, Any]]: file_read_mode = self.config.read_mode if file_read_mode == FileReadMode.BATCH: - with open(path, "r") as f: + with open(path) as f: parse_start_time = datetime.datetime.now() obj_list = json.load(f) parse_end_time = datetime.datetime.now() diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 91b0101c10451..c8ae779b602b8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -66,7 +66,7 @@ class FivetranSource(StatefulIngestionSourceBase): platform: str = "fivetran" def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): - super(FivetranSource, self).__init__(config, ctx) + super().__init__(config, ctx) self.config = config self.report = FivetranSourceReport() diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index d210941bccba1..a9eb59f929799 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -76,7 +76,7 @@ def _initialize_fivetran_variables( ) def _query(self, query: str) -> List[Dict]: - logger.debug("Query : {}".format(query)) + logger.debug(f"Query : {query}") resp = self.engine.execute(query) return [row for row in resp] diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index 7e3ff7d4fb84c..2bd05ca11e234 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -263,7 +263,7 @@ def create(cls, config_dict, ctx): return cls(config, ctx) def __init__(self, config: AzureADConfig, ctx: PipelineContext): - super(AzureADSource, self).__init__(config, ctx) + super().__init__(config, ctx) self.config = config self.report = AzureADSourceReport( filtered_tracking=self.config.filtered_tracking @@ -488,7 +488,7 @@ def _get_azure_ad_group_members(self, azure_ad_group: dict) -> Iterable[List]: yield from self._get_azure_ad_data(kind=kind) def _get_azure_ad_data(self, kind: str) -> Iterable[List]: - headers = {"Authorization": "Bearer {}".format(self.token)} + headers = {"Authorization": f"Bearer {self.token}"} # 'ConsistencyLevel': 'eventual'} url = self.config.graph_url + kind while True: diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index 5c1edce7da6c9..49b6422902299 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -289,7 +289,7 @@ def create(cls, config_dict, ctx): return cls(config, ctx) def __init__(self, config: OktaConfig, ctx: PipelineContext): - super(OktaSource, self).__init__(config, ctx) + super().__init__(config, ctx) self.config = config self.report = OktaSourceReport() self.okta_client = self._create_okta_client() @@ -465,8 +465,7 @@ def _get_okta_groups( "okta_groups", f"Failed to fetch Groups from Okta API: {err}" ) if 
groups: - for group in groups: - yield group + yield from groups if resp and resp.has_next(): sleep(self.config.delay_seconds) try: @@ -504,8 +503,7 @@ def _get_okta_group_users( f"Failed to fetch Users of Group {group.profile.name} from Okta API: {err}", ) if users: - for user in users: - yield user + yield from users if resp and resp.has_next(): sleep(self.config.delay_seconds) try: @@ -542,8 +540,7 @@ def _get_okta_users(self, event_loop: asyncio.AbstractEventLoop) -> Iterable[Use "okta_users", f"Failed to fetch Users from Okta API: {err}" ) if users: - for user in users: - yield user + yield from users if resp and resp.has_next(): sleep(self.config.delay_seconds) try: diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index 4e5ea9154359b..cf70eb95762c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -263,12 +263,12 @@ def __init__( KNOWN_NONTOPICROUTING_TRANSFORMS = ( KAFKA_NONTOPICROUTING_TRANSFORMS + [ - "org.apache.kafka.connect.transforms.{}".format(t) + f"org.apache.kafka.connect.transforms.{t}" for t in KAFKA_NONTOPICROUTING_TRANSFORMS ] + CONFLUENT_NONTOPICROUTING_TRANSFORMS + [ - "io.confluent.connect.transforms.{}".format(t) + f"io.confluent.connect.transforms.{t}" for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS ] ) @@ -314,9 +314,9 @@ def get_parser( transform = {"name": name} transforms.append(transform) for key in self.connector_manifest.config.keys(): - if key.startswith("transforms.{}.".format(name)): + if key.startswith(f"transforms.{name}."): transform[ - key.replace("transforms.{}.".format(name), "") + key.replace(f"transforms.{name}.", "") ] = self.connector_manifest.config[key] return self.JdbcParser( @@ -729,7 +729,7 @@ def _extract_lineages(self): source_platform = parser.source_platform server_name = parser.server_name database_name = parser.database_name - topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) + topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" if not self.connector_manifest.topic_names: return lineages diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 72985688273f6..1368a5b83fe6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -205,7 +205,7 @@ class LDAPSource(StatefulIngestionSourceBase): def __init__(self, ctx: PipelineContext, config: LDAPSourceConfig): """Constructor.""" - super(LDAPSource, self).__init__(config, ctx) + super().__init__(config, ctx) self.config = config # ensure prior defaults are in place diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lkml_patched.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lkml_patched.py index 6506682b8ed8d..a44d7e5215c35 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lkml_patched.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lkml_patched.py @@ -24,5 +24,5 @@ def load_lkml(path: Union[str, pathlib.Path]) -> dict: # Using this method instead of lkml.load directly ensures # that our patches to lkml are applied. 
- with open(path, "r") as file: + with open(path) as file: return lkml.load(file) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index ec4d8b78b0d06..8de213cfabaf0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -1,11 +1,10 @@ import dataclasses import os import re -from typing import Any, Dict, List, Optional, Union, cast +from typing import Any, ClassVar, Dict, List, Optional, Union, cast import pydantic from pydantic import Field, validator -from typing_extensions import ClassVar from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index dfa374fe0d779..c4ba3146031af 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -644,9 +644,7 @@ def _make_chart_metadata_events( customProperties={ "upstream_fields": ( ",".join( - sorted( - set(field.name for field in dashboard_element.input_fields) - ) + sorted({field.name for field in dashboard_element.input_fields}) ) if dashboard_element.input_fields else "" @@ -969,8 +967,7 @@ def _make_dashboard_and_chart_mces( dashboard_events = self._make_dashboard_metadata_events( looker_dashboard, list(chart_urns) ) - for dashboard_event in dashboard_events: - yield dashboard_event + yield from dashboard_events def get_ownership( self, looker_dashboard: LookerDashboard diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index e119e88a24bd7..c97025d75229b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -273,7 +273,7 @@ def _fill_user_stat_aspect( logger.debug("Entering fill user stat aspect") # We first resolve all the users using a threadpool to warm up the cache - user_ids = set([self._get_user_identifier(row) for row in user_wise_rows]) + user_ids = {self._get_user_identifier(row) for row in user_wise_rows} start_time = datetime.datetime.now() with concurrent.futures.ThreadPoolExecutor( max_workers=self.config.max_threads @@ -507,7 +507,7 @@ def append_user_stat( user_urn: Optional[str] = user.get_urn(self.config.strip_user_ids_from_email) if user_urn is None: - logger.warning("user_urn not found for the user {}".format(user)) + logger.warning(f"user_urn not found for the user {user}") return dashboard_stat_aspect.userCounts.append( @@ -614,7 +614,7 @@ def append_user_stat( user_urn: Optional[str] = user.get_urn(self.config.strip_user_ids_from_email) if user_urn is None: - logger.warning("user_urn not found for the user {}".format(user)) + logger.warning(f"user_urn not found for the user {user}") return chart_stat_aspect.userCounts.append( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 9dd276d054de3..4a872f8b1a025 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ 
-670,7 +670,7 @@ def _load_viewfile( return self.viewfile_cache[path] try: - with open(path, "r") as file: + with open(path) as file: raw_file_content = file.read() except Exception as e: self.reporter.report_failure(path, f"failed to load view file: {e}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py index 0edc8d9752983..d3c4e2e3cd80e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py @@ -520,12 +520,11 @@ def get_workunits_internal( materialize_all_node_urns(glossary_config, self.config.enable_auto_id) path_vs_id = populate_path_vs_id(glossary_config) - for event in auto_workunit( + yield from auto_workunit( get_mces( glossary_config, path_vs_id, ingestion_config=self.config, ctx=self.ctx ) - ): - yield event + ) def get_report(self): return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index cd78d1c030957..af6b44677dffa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -71,7 +71,7 @@ # See https://docs.mongodb.com/manual/reference/local-database/ and # https://docs.mongodb.com/manual/reference/config-database/ and # https://stackoverflow.com/a/48273736/5004662. -DENY_DATABASE_LIST = set(["admin", "config", "local"]) +DENY_DATABASE_LIST = {"admin", "config", "local"} class HostingEnvironment(Enum): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 72f9c2167cab9..4d58916c57118 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -889,7 +889,7 @@ def get_datasource_server( return ( data_access_func_detail.identifier_accessor.items["Name"] if data_access_func_detail.identifier_accessor is not None - else str() + else "" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 607f314342375..16f174525254d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -580,7 +580,7 @@ def tile_custom_properties(tile: powerbi_data_classes.Tile) -> dict: ) # Browse path - browse_path = BrowsePathsClass(paths=["/powerbi/{}".format(workspace.name)]) + browse_path = BrowsePathsClass(paths=[f"/powerbi/{workspace.name}"]) browse_path_mcp = self.new_mcp( entity_type=Constant.CHART, entity_urn=chart_urn, @@ -990,7 +990,7 @@ def to_chart_mcps( ) # Browse path - browse_path = BrowsePathsClass(paths=["/powerbi/{}".format(workspace.name)]) + browse_path = BrowsePathsClass(paths=[f"/powerbi/{workspace.name}"]) browse_path_mcp = self.new_mcp( entity_type=Constant.CHART, entity_urn=chart_urn, @@ -1195,7 +1195,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource): platform: str = "powerbi" def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): - super(PowerBiDashboardSource, self).__init__(config, ctx) + super().__init__(config, ctx) self.source_config = config self.reporter = 
PowerBiDashboardSourceReport() self.dataplatform_instance_resolver = create_dataplatform_instance_resolver( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py index 0d41ab00c66f5..ce4dd9a7a0c0f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py @@ -268,7 +268,7 @@ def new_powerbi_dataset(workspace_id: str, raw_instance: dict) -> PowerBIDataset return PowerBIDataset( id=raw_instance["id"], name=raw_instance.get("name"), - description=raw_instance.get("description", str()), + description=raw_instance.get("description", ""), webUrl="{}/details".format(raw_instance.get("webUrl")) if raw_instance.get("webUrl") is not None else None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index 3aeffa60bc28e..fadd7a48b62f7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -63,7 +63,7 @@ def __init__( self.__access_token_expiry_time: Optional[datetime] = None self.__tenant_id = tenant_id # Test connection by generating access token - logger.info("Trying to connect to {}".format(self._get_authority_url())) + logger.info(f"Trying to connect to {self._get_authority_url()}") # Power-Bi Auth (Service Principal Auth) self.__msal_client = msal.ConfidentialClientApplication( client_id, @@ -72,7 +72,7 @@ def __init__( ) self.get_access_token() - logger.info("Connected to {}".format(self._get_authority_url())) + logger.info(f"Connected to {self._get_authority_url()}") self._request_session = requests.Session() # set re-try parameter for request_session self._request_session.mount( @@ -124,7 +124,7 @@ def get_users(self, workspace_id: str, entity: str, entity_id: str) -> List[User pass def _get_authority_url(self): - return "{}{}".format(DataResolverBase.AUTHORITY, self.__tenant_id) + return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}" def get_authorization_header(self): return {Constant.Authorization: self.get_access_token()} @@ -193,7 +193,7 @@ def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: id=instance.get(Constant.ID), isReadOnly=instance.get(Constant.IS_READ_ONLY), displayName=instance.get(Constant.DISPLAY_NAME), - description=instance.get(Constant.DESCRIPTION, str()), + description=instance.get(Constant.DESCRIPTION, ""), embedUrl=instance.get(Constant.EMBED_URL), webUrl=instance.get(Constant.WEB_URL), workspace_id=workspace.id, @@ -276,7 +276,7 @@ def fetch_reports(): name=raw_instance.get(Constant.NAME), webUrl=raw_instance.get(Constant.WEB_URL), embedUrl=raw_instance.get(Constant.EMBED_URL), - description=raw_instance.get(Constant.DESCRIPTION, str()), + description=raw_instance.get(Constant.DESCRIPTION, ""), pages=self._get_pages_by_report( workspace=workspace, report_id=raw_instance[Constant.ID] ), @@ -809,7 +809,7 @@ def get_modified_workspaces(self, modified_since: str) -> List[str]: # Return scan_id of Scan created for the given workspace workspace_ids = [row["id"] for row in res.json()] - logger.debug("modified workspace_ids: {}".format(workspace_ids)) + logger.debug(f"modified workspace_ids: 
{workspace_ids}") return workspace_ids def get_dataset_parameters( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py index b793929faa691..d6c7076d49507 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py @@ -143,7 +143,7 @@ class PowerBiReportServerAPI: def __init__(self, config: PowerBiReportServerAPIConfig) -> None: self.__config: PowerBiReportServerAPIConfig = config self.__auth: HttpNtlmAuth = HttpNtlmAuth( - "{}\\{}".format(self.__config.workstation_name, self.__config.username), + f"{self.__config.workstation_name}\\{self.__config.username}", self.__config.password, ) @@ -153,14 +153,14 @@ def get_auth_credentials(self): def requests_get(self, url_http: str, url_https: str, content_type: str) -> Any: try: - LOGGER.info("Request to Report URL={}".format(url_https)) + LOGGER.info(f"Request to Report URL={url_https}") response = requests.get( url=url_https, auth=self.get_auth_credentials, verify=True, ) except ConnectionError: - LOGGER.info("Request to Report URL={}".format(url_http)) + LOGGER.info(f"Request to Report URL={url_http}") response = requests.get( url=url_http, auth=self.get_auth_credentials, @@ -406,7 +406,7 @@ def to_datahub_user(self, user: CorpUser) -> List[MetadataChangeProposalWrapper] """ user_mcps = [] if user: - LOGGER.info("Converting user {} to datahub's user".format(user.username)) + LOGGER.info(f"Converting user {user.username} to datahub's user") # Create an URN for User user_urn = builder.make_user_urn(user.get_urn_part()) @@ -449,7 +449,7 @@ def to_datahub_user(self, user: CorpUser) -> List[MetadataChangeProposalWrapper] def to_datahub_work_units(self, report: Report) -> List[EquableMetadataWorkUnit]: mcps = [] user_mcps = [] - LOGGER.info("Converting Dashboard={} to DataHub Dashboard".format(report.name)) + LOGGER.info(f"Converting Dashboard={report.name} to DataHub Dashboard") # Convert user to CorpUser user_info = report.user_info.owner_to_add if user_info: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py index ee87d93774b3d..b65ae5cd2994c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py @@ -39,10 +39,10 @@ def validate_diplay_name(cls, value, values): # noqa: N805 return "" def get_urn_part(self): - return "reports.{}".format(self.id) + return f"reports.{self.id}" def get_web_url(self, base_reports_url: str) -> str: - return "{}powerbi{}".format(base_reports_url, self.path) + return f"{base_reports_url}powerbi{self.path}" def get_browse_path( self, base_folder: str, workspace: str, env: str, report_directory: str @@ -57,7 +57,7 @@ class DataSet(CatalogItem): query_execution_time_out: int = Field(alias="QueryExecutionTimeOut") def get_urn_part(self): - return "datasets.{}".format(self.id) + return f"datasets.{self.id}" def __members(self): return (self.id,) @@ -339,7 +339,7 @@ class CorpUser(BaseModel): global_tags: Optional[GlobalTags] = Field(None, alias="globalTags") def get_urn_part(self): - return "{}".format(self.username) + return f"{self.username}" def 
__members(self): return (self.username,) diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py index 00a49cd897d6f..7671e23928430 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) -class PulsarTopic(object): +class PulsarTopic: __slots__ = ["topic_parts", "fullname", "type", "tenant", "namespace", "topic"] def __init__(self, topic): @@ -65,7 +65,7 @@ def __init__(self, topic): self.topic = topic_parts[5] -class PulsarSchema(object): +class PulsarSchema: __slots__ = [ "schema_version", "schema_name", diff --git a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py index 66a18873d86df..d7a040ff5f0a0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_api.py @@ -36,7 +36,7 @@ def __init__(self, config: QlikSourceConfig) -> None: ) self.rest_api_url = f"https://{self.config.tenant_hostname}/api/v1" # Test connection by fetching list of api keys - logger.info("Trying to connect to {}".format(self.rest_api_url)) + logger.info(f"Trying to connect to {self.rest_api_url}") self.session.get(f"{self.rest_api_url}/api-keys").raise_for_status() def _log_http_error(self, message: str) -> Any: diff --git a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_sense.py b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_sense.py index a5b9adae0376c..b9fd2a9c4fe22 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_sense.py +++ b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/qlik_sense.py @@ -112,7 +112,7 @@ class QlikSenseSource(StatefulIngestionSourceBase, TestableSource): platform: str = "qlik-sense" def __init__(self, config: QlikSourceConfig, ctx: PipelineContext): - super(QlikSenseSource, self).__init__(config, ctx) + super().__init__(config, ctx) self.config = config self.reporter = QlikSourceReport() try: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 797b309f528cc..2c7ebb613c57a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -94,7 +94,7 @@ def build( db_schemas: Dict[str, Dict[str, RedshiftSchema]], ) -> None: # Assume things not in `all_tables` as temp tables. - self.known_urns = set( + self.known_urns = { DatasetUrn.create_from_ids( self.platform, f"{db}.{schema}.{table.name}", @@ -104,7 +104,7 @@ def build( for db, schemas in all_tables.items() for schema, tables in schemas.items() for table in tables - ) + } self.aggregator.is_temp_table = lambda urn: urn not in self.known_urns # Handle all the temp tables up front. diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index 55e340e2850d5..921ab27564250 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -111,9 +111,9 @@ def check_path_specs_and_infer_platform( raise ValueError("path_specs must not be empty") # Check that all path specs have the same platform. 
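The s3 config hunk a little further down replaces set([... for ...]) with a set comprehension, which builds the set directly instead of creating a throwaway list first; the same rewrite appears in the mongodb, redshift, and test-helper hunks. A minimal sketch, with plain strings standing in for the real PathSpec objects:

    # set comprehension vs. set() over a list comprehension: same result, no intermediate list
    path_specs = ["s3://bucket/data/*.parquet", "file:///tmp/data/*.csv"]  # hypothetical inputs
    guessed_platforms = {"s3" if p.startswith("s3://") else "file" for p in path_specs}
    assert guessed_platforms == {"s3", "file"}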
- guessed_platforms = set( + guessed_platforms = { "s3" if path_spec.is_s3 else "file" for path_spec in path_specs - ) + } if len(guessed_platforms) > 1: raise ValueError( f"Cannot have multiple platforms in path_specs: {guessed_platforms}" diff --git a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py index b25f67d6f5ef1..946fdcedc571f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py +++ b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py @@ -353,7 +353,7 @@ def get_custom_object_details(self, sObjectDeveloperName: str) -> dict: self.base_url + "tooling/query/?q=SELECT Description, Language, ManageableState, " + "CreatedDate, CreatedBy.Username, LastModifiedDate, LastModifiedBy.Username " - + "FROM CustomObject where DeveloperName='{0}'".format(sObjectDeveloperName) + + f"FROM CustomObject where DeveloperName='{sObjectDeveloperName}'" ) custom_objects_response = self.sf._call_salesforce("GET", query_url).json() if len(custom_objects_response["records"]) > 0: @@ -656,7 +656,7 @@ def get_schema_metadata_workunit( + "Precision, Scale, Length, Digits ,FieldDefinition.IsIndexed, IsUnique," + "IsCompound, IsComponent, ReferenceTo, FieldDefinition.ComplianceGroup," + "RelationshipName, IsNillable, FieldDefinition.Description, InlineHelpText " - + "FROM EntityParticle WHERE EntityDefinitionId='{0}'".format( + + "FROM EntityParticle WHERE EntityDefinitionId='{}'".format( sObject["DurableId"] ) ) @@ -665,16 +665,14 @@ def get_schema_metadata_workunit( "GET", sObject_fields_query_url ).json() - logger.debug( - "Received Salesforce {sObject} fields response".format(sObject=sObjectName) - ) + logger.debug(f"Received Salesforce {sObjectName} fields response") sObject_custom_fields_query_url = ( self.base_url + "tooling/query?q=SELECT " + "DeveloperName,CreatedDate,CreatedBy.Username,InlineHelpText," + "LastModifiedDate,LastModifiedBy.Username " - + "FROM CustomField WHERE EntityDefinitionId='{0}'".format( + + "FROM CustomField WHERE EntityDefinitionId='{}'".format( sObject["DurableId"] ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py index c7e8a15d8dfa4..635e894d18c7e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py @@ -212,7 +212,7 @@ def _load_json_schema(filename, loader, use_id_as_base_uri): """Loads the given schema file""" path = Path(filename).resolve() base_path = dirname(str(path)) - base_uri = "file://{}/".format(base_path) + base_uri = f"file://{base_path}/" with open(path) as schema_file: logger.info(f"Opening file {path}") @@ -243,7 +243,7 @@ def stringreplaceloader(match_string, replace_string, uri, **kwargs): return jsonref.jsonloader(uri, **kwargs) def __init__(self, ctx: PipelineContext, config: JsonSchemaSourceConfig): - super(JsonSchemaSource, self).__init__(ctx=ctx, config=config) + super().__init__(ctx=ctx, config=config) self.config = config self.report = StaleEntityRemovalSourceReport() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py index c335bee15931d..c2c28419ebcfd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma_api.py @@ -24,7 +24,7 @@ def 
__init__(self, config: SigmaSourceConfig) -> None: self.users: Dict[str, str] = {} self.session = requests.Session() # Test connection by generating access token - logger.info("Trying to connect to {}".format(self.config.api_url)) + logger.info(f"Trying to connect to {self.config.api_url}") self._generate_token() def _generate_token(self): diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 71e5bae5e9b76..e8b56a01944ad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -332,7 +332,7 @@ def _map_user_counts( and self.config.email_domain and user_count["user_name"] ): - user_email = "{0}@{1}".format( + user_email = "{}@{}".format( user_count["user_name"], self.config.email_domain ).lower() if not user_email or not self.config.user_email_pattern.allowed( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index af8d8824a4b17..5708b9f168c51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -37,7 +37,7 @@ def get_connection(self) -> SnowflakeConnection: class SnowflakeQueryMixin: def query(self: SnowflakeQueryProtocol, query: str) -> Any: try: - self.logger.debug("Query : {}".format(query)) + self.logger.debug(f"Query : {query}") resp = self.get_connection().cursor(DictCursor).execute(query) return resp diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 9344e030d749f..25626d434f2ef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -366,7 +366,7 @@ class SnowflakePrivilege: object_type: str def query(query): - logger.info("Query : {}".format(query)) + logger.info(f"Query : {query}") resp = conn.cursor().execute(query) return resp diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 84c1d3844a7b4..b2c40f914bddc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -286,7 +286,7 @@ def get_view_names(self, connection, schema=None, **kw): # when reflecting schema for multiple tables at once. 
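Most of this patch converts str.format (and a few %-style) calls to f-strings; the f-string evaluates the embedded expression inline and produces the same text, so log lines and generated SQL are unchanged. A small sketch mirroring the ClickHouse schema_clause rewrite just below, with a hypothetical schema name:

    # f-string and str.format spell the same string
    schema = "analytics"  # hypothetical schema name
    assert f"database = '{schema}'" == "database = '{schema}'".format(schema=schema)
    assert f"Query : {schema}" == "Query : {}".format(schema)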
@reflection.cache # type: ignore def _get_schema_column_info(self, connection, schema=None, **kw): - schema_clause = "database = '{schema}'".format(schema=schema) if schema else "1" + schema_clause = f"database = '{schema}'" if schema else "1" all_columns = defaultdict(list) result = connection.execute( text( @@ -346,7 +346,7 @@ def _get_column_info(self, name, format_type, comment): @reflection.cache # type: ignore def get_columns(self, connection, table_name, schema=None, **kw): if not schema: - query = "DESCRIBE TABLE {}".format(self._quote_table_name(table_name)) + query = f"DESCRIBE TABLE {self._quote_table_name(table_name)}" cols = self._execute(connection, query) else: cols = self._get_clickhouse_columns(connection, table_name, schema, **kw) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 003732236ba80..95ce534968df5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -74,7 +74,9 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw): coltype = _type_map[col_type] except KeyError: util.warn( - "Did not recognize type '%s' of column '%s'" % (col_type, col_name) + "Did not recognize type '{}' of column '{}'".format( + col_type, col_name + ) ) coltype = types.NullType # type: ignore result.append( @@ -112,7 +114,7 @@ def get_view_definition_patched(self, connection, view_name, schema=None, **kw): self.identifier_preparer.quote_identifier(schema), self.identifier_preparer.quote_identifier(view_name), ) - row = connection.execute("SHOW CREATE TABLE {}".format(full_table)).fetchone() + row = connection.execute(f"SHOW CREATE TABLE {full_table}").fetchone() return row[0] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index 0a67d6228e6db..dcc1340c81d7b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -226,13 +226,13 @@ def get_columns( col.default_on_null, ( SELECT id.generation_type || ',' || id.IDENTITY_OPTIONS - FROM DBA_TAB_IDENTITY_COLS%(dblink)s id + FROM DBA_TAB_IDENTITY_COLS{dblink} id WHERE col.table_name = id.table_name AND col.column_name = id.column_name AND col.owner = id.owner - ) AS identity_options""" % { - "dblink": dblink - } + ) AS identity_options""".format( + dblink=dblink + ) else: identity_cols = "NULL as default_on_null, NULL as identity_options" diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 59819db8b2dc9..3091791551827 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -326,7 +326,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource): """A Base class for all SQL Sources that use SQLAlchemy to extend""" def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str): - super(SQLAlchemySource, self).__init__(config, ctx) + super().__init__(config, ctx) self.config = config self.platform = platform self.report: SQLSourceReport = SQLSourceReport() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py index 16655d1748287..f45147223b888 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py @@ -210,8 +210,7 @@ def gen_lineage( ).as_workunit() ] - for wu in lineage_workunits: - yield wu + yield from lineage_workunits # downgrade a schema field diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py index e1c47acbc4b87..c79af14780874 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py @@ -86,7 +86,7 @@ register_custom_type(datatype.JSON, RecordTypeClass) -@functools.lru_cache() +@functools.lru_cache def gen_catalog_connector_dict(engine: Engine) -> Dict[str, str]: query = dedent( """ @@ -473,7 +473,7 @@ def _parse_struct_fields(parts): "type": "record", "name": "__struct_{}".format(str(uuid.uuid4()).replace("-", "")), "fields": fields, - "native_data_type": "ROW({})".format(parts), + "native_data_type": f"ROW({parts})", } diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index 738cc7e321764..7534f1295c528 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -123,7 +123,7 @@ def clean_host_port(cls, v): class VerticaSource(SQLAlchemySource): def __init__(self, config: VerticaConfig, ctx: PipelineContext): # self.platform = platform - super(VerticaSource, self).__init__(config, ctx, "vertica") + super().__init__(config, ctx, "vertica") self.report: SQLSourceReport = VerticaSourceReport() self.config: VerticaConfig = config diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index e0b442387d3b6..1d44fb6122a36 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -924,8 +924,7 @@ def get_connection_objects( offset += count - for obj in connection_objects.get(c.NODES) or []: - yield obj + yield from connection_objects.get(c.NODES) or [] def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: if self.tableau_project_registry: diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py index 140698a6c4b10..c99fe3b09c5bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py @@ -332,7 +332,7 @@ def _get_table_info(self, schema_name: str, table_name: str) -> dict: properties[col_name] = data_type.strip() else: # col_name == "", data_type is not None - prop_name = "{} {}".format(active_heading, data_type.rstrip()) + prop_name = f"{active_heading} {data_type.rstrip()}" properties[prop_name] = value.rstrip() except Exception as e: self.report.report_warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index f3aeb34002f3f..f1f0b5ddb4475 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -163,7 +163,7 @@ def get_report(self) -> UnityCatalogReport: return self.report def __init__(self, ctx: PipelineContext, config: 
UnityCatalogSourceConfig): - super(UnityCatalogSource, self).__init__(config, ctx) + super().__init__(config, ctx) self.config = config self.report: UnityCatalogReport = UnityCatalogReport() diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py index 1e949affd1766..8ef61ab9679e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py @@ -42,9 +42,9 @@ def _get_current_owner_urns(self, entity_urn: str) -> Set[str]: if self.ctx.graph is not None: current_ownership = self.ctx.graph.get_ownership(entity_urn=entity_urn) if current_ownership is not None: - current_owner_urns: Set[str] = set( - [owner.owner for owner in current_ownership.owners] - ) + current_owner_urns: Set[str] = { + owner.owner for owner in current_ownership.owners + } return current_owner_urns else: return set() diff --git a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py index f76d145a87043..94501b0d499b7 100644 --- a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py +++ b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py @@ -769,9 +769,7 @@ def make_dataset_urn_from_sqlalchemy_uri( ) return None schema_name = ( - schema_name - if exclude_dbname - else "{}.{}".format(url_instance.database, schema_name) + schema_name if exclude_dbname else f"{url_instance.database}.{schema_name}" ) elif data_platform == "mssql": schema_name = schema_name or "dbo" @@ -781,9 +779,7 @@ def make_dataset_urn_from_sqlalchemy_uri( ) return None schema_name = ( - schema_name - if exclude_dbname - else "{}.{}".format(url_instance.database, schema_name) + schema_name if exclude_dbname else f"{url_instance.database}.{schema_name}" ) elif data_platform in ["trino", "snowflake"]: if schema_name is None or url_instance.database is None: @@ -804,9 +800,7 @@ def make_dataset_urn_from_sqlalchemy_uri( if database_name.endswith(f"/{schema_name}"): database_name = database_name[: -len(f"/{schema_name}")] schema_name = ( - schema_name - if exclude_dbname - else "{}.{}".format(database_name, schema_name) + schema_name if exclude_dbname else f"{database_name}.{schema_name}" ) elif data_platform == "bigquery": @@ -817,7 +811,7 @@ def make_dataset_urn_from_sqlalchemy_uri( ) ) return None - schema_name = "{}.{}".format(url_instance.host, url_instance.database) + schema_name = f"{url_instance.host}.{url_instance.database}" schema_name = schema_name or url_instance.database if schema_name is None: @@ -853,7 +847,7 @@ class DecimalEncoder(json.JSONEncoder): def default(self, o): if isinstance(o, Decimal): return str(o) - return super(DecimalEncoder, self).default(o) + return super().default(o) def convert_to_string(var: Any) -> str: diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index 5e2e510533af1..ae5d83c2dfc94 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -81,7 +81,7 @@ def includes_temp_tables(self) -> bool: return False def get_urns(self) -> Set[str]: - return set(k for k, v in self._schema_cache.items() if v is not None) + return {k for k, v in self._schema_cache.items() if v is not 
None} def schema_count(self) -> int: return int( diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 911ab7136ed10..c112f5b74ac51 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -919,8 +919,8 @@ def _sqlglot_lineage_inner( # TODO: Can we generate a common WHERE clauses section? # Convert TableName to urns. - in_urns = sorted(set(table_name_urn_mapping[table] for table in tables)) - out_urns = sorted(set(table_name_urn_mapping[table] for table in modified)) + in_urns = sorted({table_name_urn_mapping[table] for table in tables}) + out_urns = sorted({table_name_urn_mapping[table] for table in modified}) column_lineage_urns = None if column_lineage: column_lineage_urns = [ diff --git a/metadata-ingestion/src/datahub/telemetry/telemetry.py b/metadata-ingestion/src/datahub/telemetry/telemetry.py index 08df9e80ecf29..69a790b3d9bc7 100644 --- a/metadata-ingestion/src/datahub/telemetry/telemetry.py +++ b/metadata-ingestion/src/datahub/telemetry/telemetry.py @@ -174,7 +174,7 @@ def update_config(self) -> bool: indent=2, ) return True - except IOError as x: + except OSError as x: if x.errno == errno.ENOENT: logger.debug( f"{CONFIG_FILE} does not exist and could not be created. Please check permissions on the parent folder." @@ -215,12 +215,12 @@ def load_config(self) -> bool: """ try: - with open(CONFIG_FILE, "r") as f: + with open(CONFIG_FILE) as f: config = json.load(f) self.client_id = config["client_id"] self.enabled = config["enabled"] & ENV_ENABLED return True - except IOError as x: + except OSError as x: if x.errno == errno.ENOENT: logger.debug( f"{CONFIG_FILE} does not exist and could not be created. Please check permissions on the parent folder." 
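The telemetry hunks above switch except IOError to except OSError; IOError has been a plain alias of OSError since Python 3.3, so the handler catches exactly the same exceptions, including the ENOENT case checked here. A minimal sketch, assuming a hypothetical missing config path:

    # IOError is an alias of OSError on Python 3, so either name works
    import errno
    import json

    assert IOError is OSError

    def read_config(path="/nonexistent/telemetry-config.json"):  # hypothetical path
        try:
            with open(path) as f:
                return json.load(f)
        except OSError as x:  # formerly `except IOError as x:`; same behaviour
            if x.errno == errno.ENOENT:
                return {}
            raise

    assert read_config() == {}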
diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index d264a3970fdde..bb2b827dc06c3 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -15,6 +15,7 @@ Any, Callable, Dict, + Final, Generic, Iterator, List, @@ -28,8 +29,6 @@ Union, ) -from typing_extensions import Final - from datahub.ingestion.api.closeable import Closeable logger: logging.Logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py index 7c0f26706ebfa..447587bea8c40 100644 --- a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py +++ b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py @@ -71,10 +71,8 @@ def _parse_datatype_string( parts = HiveColumnToAvroConverter._ignore_brackets_split(s[4:-1], ",") if len(parts) != 2: raise ValueError( - ( - "The map type string format is: 'map', " - + f"but got: {s}" - ) + "The map type string format is: 'map', " + + f"but got: {s}" ) kt = HiveColumnToAvroConverter._parse_datatype_string(parts[0]) @@ -126,10 +124,8 @@ def _parse_struct_fields_string(s: str, **kwargs: Any) -> Dict[str, object]: ) if len(name_and_type) != 2: raise ValueError( - ( - "The struct field string format is: 'field_name:field_type', " - + f"but got: {part}" - ) + "The struct field string format is: 'field_name:field_type', " + + f"but got: {part}" ) field_name = name_and_type[0].strip() diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index e79bbbe995aae..26511d9e5df1a 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -55,7 +55,7 @@ def test_bigquery_v2_ingest( tmp_path, ): test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" - mcp_golden_path = "{}/bigquery_mcp_golden.json".format(test_resources_dir) + mcp_golden_path = f"{test_resources_dir}/bigquery_mcp_golden.json" mcp_output_path = "{}/{}".format(tmp_path, "bigquery_mcp_output.json") get_datasets_for_project_id.return_value = [ diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index 5f7d65f5b2377..941315fcfa9d5 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -232,13 +232,13 @@ def test_dbt_ingest( config: DbtTestConfig = dbt_test_config test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt" - with open(test_resources_dir / "dbt_manifest.json", "r") as f: + with open(test_resources_dir / "dbt_manifest.json") as f: requests_mock.get("http://some-external-repo/dbt_manifest.json", text=f.read()) - with open(test_resources_dir / "dbt_catalog.json", "r") as f: + with open(test_resources_dir / "dbt_catalog.json") as f: requests_mock.get("http://some-external-repo/dbt_catalog.json", text=f.read()) - with open(test_resources_dir / "dbt_sources.json", "r") as f: + with open(test_resources_dir / "dbt_sources.json") as f: requests_mock.get("http://some-external-repo/dbt_sources.json", text=f.read()) config.set_paths( diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py 
b/metadata-ingestion/tests/integration/git/test_git_clone.py index cf1f649825e0c..773e84cbf7488 100644 --- a/metadata-ingestion/tests/integration/git/test_git_clone.py +++ b/metadata-ingestion/tests/integration/git/test_git_clone.py @@ -123,15 +123,13 @@ def test_git_clone_private(tmp_path): branch="d380a2b777ec6f4653626f39c68dba85893faa74", ) assert checkout_dir.exists() - assert set(os.listdir(checkout_dir)) == set( - [ - ".datahub", - "models", - "README.md", - ".github", - ".git", - "views", - "manifest_lock.lkml", - "manifest.lkml", - ] - ) + assert set(os.listdir(checkout_dir)) == { + ".datahub", + "models", + "README.md", + ".github", + ".git", + "views", + "manifest_lock.lkml", + "manifest.lkml", + } diff --git a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py index a9ab43169405d..24a636077bfdd 100644 --- a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py +++ b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py @@ -31,9 +31,7 @@ def remove_docker_image(): def spark_submit(file_path: str, args: str = "") -> None: docker = "docker" command = f"{docker} exec spark-iceberg spark-submit {file_path} {args}" - ret = subprocess.run( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) + ret = subprocess.run(command, shell=True, capture_output=True) assert ret.returncode == 0 diff --git a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py index a2015eb06b569..26f3d50c1167b 100644 --- a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py +++ b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py @@ -88,9 +88,7 @@ def test_resources_dir(pytestconfig): def loaded_kafka_connect(kafka_connect_runner): # # Setup mongo cluster command = "docker exec test_mongo mongosh test_db -f /scripts/mongo-init.js" - ret = subprocess.run( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) + ret = subprocess.run(command, shell=True, capture_output=True) assert ret.returncode == 0 # Creating MySQL source with no transformations , only topic prefix @@ -298,9 +296,7 @@ def loaded_kafka_connect(kafka_connect_runner): assert r.status_code == 201 # Created command = "docker exec test_mongo mongosh test_db -f /scripts/mongo-populate.js" - ret = subprocess.run( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) + ret = subprocess.run(command, shell=True, capture_output=True) assert ret.returncode == 0 # Creating S3 Sink source diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka_state.py b/metadata-ingestion/tests/integration/kafka/test_kafka_state.py index 6dfc0427f76c1..24e81fbf128b0 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka_state.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka_state.py @@ -40,9 +40,9 @@ def create_kafka_topics(self, topics: List[NewTopic]) -> None: for topic, f in fs.items(): try: f.result() # The result itself is None - print("Topic {} created".format(topic)) + print(f"Topic {topic} created") except Exception as e: - print("Failed to create topic {}: {}".format(topic, e)) + print(f"Failed to create topic {topic}: {e}") raise e def delete_kafka_topics(self, topics: List[str]) -> None: @@ -60,11 +60,11 @@ def delete_kafka_topics(self, topics: List[str]) -> None: for topic, f in fs.items(): try: f.result() # The result itself is None - print("Topic {} 
deleted".format(topic)) + print(f"Topic {topic} deleted") except Exception as e: # this error should be ignored when we already deleted # the topic within the test code - print("Failed to delete topic {}: {}".format(topic, e)) + print(f"Failed to delete topic {topic}: {e}") def __enter__(self): topics = [ diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 5e0973a007f3a..1c1f0fec3eebb 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -158,7 +158,7 @@ def test_lookml_explore_refinement(pytestconfig, tmp_path, mock_time): {"name": "+book", "extends__all": [["order"]]}, {"name": "+book", "extends__all": [["transaction"]]}, ], - connection=str(), + connection="", resolved_includes=[], includes=[], ) diff --git a/metadata-ingestion/tests/integration/metabase/test_metabase.py b/metadata-ingestion/tests/integration/metabase/test_metabase.py index edb23c1fb7a1c..b39550f3d048a 100644 --- a/metadata-ingestion/tests/integration/metabase/test_metabase.py +++ b/metadata-ingestion/tests/integration/metabase/test_metabase.py @@ -68,7 +68,7 @@ def get(self, url): def raise_for_status(self): if self.error_list is not None and self.url in self.error_list: - http_error_msg = "%s Client Error: %s for url: %s" % ( + http_error_msg = "{} Client Error: {} for url: {}".format( 400, "Simulate error", self.url, diff --git a/metadata-ingestion/tests/integration/mode/test_mode.py b/metadata-ingestion/tests/integration/mode/test_mode.py index cfd9751ab9f15..def7277494fe7 100644 --- a/metadata-ingestion/tests/integration/mode/test_mode.py +++ b/metadata-ingestion/tests/integration/mode/test_mode.py @@ -51,7 +51,7 @@ def get(self, url): def raise_for_status(self): if self.error_list is not None and self.url in self.error_list: - http_error_msg = "%s Client Error: %s for url: %s" % ( + http_error_msg = "{} Client Error: {} for url: {}".format( 400, "Simulate error", self.url, diff --git a/metadata-ingestion/tests/integration/oracle/common.py b/metadata-ingestion/tests/integration/oracle/common.py index c2591bd1d5b0d..79dbda8c30f89 100644 --- a/metadata-ingestion/tests/integration/oracle/common.py +++ b/metadata-ingestion/tests/integration/oracle/common.py @@ -212,7 +212,7 @@ def get_recipe_sink(self, output_path: str) -> dict: } def get_output_mce_path(self): - return "{}/{}".format(self.tmp_path, self.mces_output_file_name) + return f"{self.tmp_path}/{self.mces_output_file_name}" def get_mock_data_impl(self): return self.default_mock_data diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 7b8441a1a8150..30c4b2bec3a04 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -230,7 +230,7 @@ def default_query_results( # noqa: C901 return [ { "TABLE_SCHEMA": "TEST_SCHEMA", - "TABLE_NAME": "TABLE_{}".format(tbl_idx), + "TABLE_NAME": f"TABLE_{tbl_idx}", "TABLE_TYPE": "BASE TABLE", "CREATED": datetime(2021, 6, 8, 0, 0, 0, 0), "LAST_ALTERED": datetime(2021, 6, 8, 0, 0, 0, 0), @@ -245,7 +245,7 @@ def default_query_results( # noqa: C901 return [ { "schema_name": "TEST_SCHEMA", - "name": "VIEW_{}".format(view_idx), + "name": f"VIEW_{view_idx}", "created_on": datetime(2021, 6, 8, 0, 0, 0, 0), "comment": "Comment for View", "text": f"create view view_{view_idx} as select * from table_{view_idx}", 
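Several test hunks in this patch (kafka-connect and iceberg above, sql_server further below) shorten subprocess.run(..., stdout=subprocess.PIPE, stderr=subprocess.PIPE) to subprocess.run(..., capture_output=True); capture_output has been the documented shorthand for that pair since Python 3.7. A small sketch using the current interpreter instead of docker so it runs anywhere:

    # capture_output=True captures stdout and stderr, like the explicit PIPE pair it replaces
    import subprocess
    import sys

    ret = subprocess.run(
        [sys.executable, "-c", "print('ok')"], capture_output=True, text=True
    )
    assert ret.returncode == 0
    assert ret.stdout.strip() == "ok"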
@@ -257,13 +257,13 @@ def default_query_results( # noqa: C901 elif query in [ *[ SnowflakeQuery.columns_for_table( - "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB" + f"TABLE_{tbl_idx}", "TEST_SCHEMA", "TEST_DB" ) for tbl_idx in range(1, num_tables + 1) ], *[ SnowflakeQuery.columns_for_table( - "VIEW_{}".format(view_idx), "TEST_SCHEMA", "TEST_DB" + f"VIEW_{view_idx}", "TEST_SCHEMA", "TEST_DB" ) for view_idx in range(1, num_views + 1) ], @@ -273,7 +273,7 @@ def default_query_results( # noqa: C901 # "TABLE_CATALOG": "TEST_DB", # "TABLE_SCHEMA": "TEST_SCHEMA", # "TABLE_NAME": "TABLE_{}".format(tbl_idx), - "COLUMN_NAME": "COL_{}".format(col_idx), + "COLUMN_NAME": f"COL_{col_idx}", "ORDINAL_POSITION": col_idx, "IS_NULLABLE": "NO", "DATA_TYPE": "TEXT" if col_idx > 1 else "NUMBER", @@ -317,7 +317,7 @@ def default_query_results( # noqa: C901 [ { "columns": [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ], "objectDomain": "Table", @@ -326,7 +326,7 @@ def default_query_results( # noqa: C901 }, { "columns": [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ], "objectDomain": "Table", @@ -335,7 +335,7 @@ def default_query_results( # noqa: C901 }, { "columns": [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ], "objectDomain": "Table", @@ -348,7 +348,7 @@ def default_query_results( # noqa: C901 [ { "columns": [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ], "objectDomain": "Table", @@ -357,7 +357,7 @@ def default_query_results( # noqa: C901 }, { "columns": [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ], "objectDomain": "Table", @@ -366,7 +366,7 @@ def default_query_results( # noqa: C901 }, { "columns": [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ], "objectDomain": "Table", @@ -381,10 +381,10 @@ def default_query_results( # noqa: C901 "columns": [ { "columnId": 0, - "columnName": "COL_{}".format(col_idx), + "columnName": f"COL_{col_idx}", "directSources": [ { - "columnName": "COL_{}".format(col_idx), + "columnName": f"COL_{col_idx}", "objectDomain": "Table", "objectId": 0, "objectName": "TEST_DB.TEST_SCHEMA.TABLE_2", @@ -395,7 +395,7 @@ def default_query_results( # noqa: C901 ], "objectDomain": "Table", "objectId": 0, - "objectName": "TEST_DB.TEST_SCHEMA.TABLE_{}".format(op_idx), + "objectName": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}", } ] ), @@ -456,11 +456,11 @@ def default_query_results( # noqa: C901 ): return [ { - "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_{}".format(op_idx), + "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}", "UPSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_2", "UPSTREAM_TABLE_COLUMNS": json.dumps( [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ] ), @@ -468,10 +468,10 @@ def default_query_results( # noqa: C901 [ { "columnId": 0, - "columnName": "COL_{}".format(col_idx), + "columnName": f"COL_{col_idx}", "directSources": [ { - "columnName": 
"COL_{}".format(col_idx), + "columnName": f"COL_{col_idx}", "objectDomain": "Table", "objectId": 0, "objectName": "TEST_DB.TEST_SCHEMA.TABLE_2", @@ -519,7 +519,7 @@ def default_query_results( # noqa: C901 return [ { - "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_{}".format(op_idx), + "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}", "DOWNSTREAM_TABLE_DOMAIN": "TABLE", "UPSTREAM_TABLES": json.dumps( [ @@ -609,7 +609,7 @@ def default_query_results( # noqa: C901 ): return [ { - "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_{}".format(op_idx), + "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}", "DOWNSTREAM_TABLE_DOMAIN": "TABLE", "UPSTREAM_TABLES": json.dumps( [ @@ -690,7 +690,7 @@ def default_query_results( # noqa: C901 "VIEW_DOMAIN": "VIEW", "VIEW_COLUMNS": json.dumps( [ - {"columnId": 0, "columnName": "COL_{}".format(col_idx)} + {"columnId": 0, "columnName": f"COL_{col_idx}"} for col_idx in range(1, num_cols + 1) ] ), @@ -699,10 +699,10 @@ def default_query_results( # noqa: C901 [ { "columnId": 0, - "columnName": "COL_{}".format(col_idx), + "columnName": f"COL_{col_idx}", "directSources": [ { - "columnName": "COL_{}".format(col_idx), + "columnName": f"COL_{col_idx}", "objectDomain": "Table", "objectId": 0, "objectName": "TEST_DB.TEST_SCHEMA.TABLE_2", diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 65c259e8acdc3..9760ea1a9c72b 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -169,7 +169,7 @@ def test_snowflake_list_columns_error_causes_pipeline_warning( default_query_results, [ SnowflakeQuery.columns_for_table( - "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB" + f"TABLE_{tbl_idx}", "TEST_SCHEMA", "TEST_DB" ) for tbl_idx in range(1, NUM_TABLES + 1) ], diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index f439a322c2677..4e9b4bee8ce6b 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -23,9 +23,7 @@ def mssql_runner(docker_compose_runner, pytestconfig): # Run the setup.sql file to populate the database. 
command = "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql" - ret = subprocess.run( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) + ret = subprocess.run(command, shell=True, capture_output=True) assert ret.returncode == 0 yield docker_services diff --git a/metadata-ingestion/tests/test_helpers/mce_helpers.py b/metadata-ingestion/tests/test_helpers/mce_helpers.py index 563ccbee03c27..9ee4642bfe6eb 100644 --- a/metadata-ingestion/tests/test_helpers/mce_helpers.py +++ b/metadata-ingestion/tests/test_helpers/mce_helpers.py @@ -174,20 +174,16 @@ def get_entity_urns(events_file: str) -> Set[str]: def _get_entity_urns(events_list: List[Dict]) -> Set[str]: entity_type = "dataset" # mce urns - mce_urns = set( - [ - _get_element(x, _get_mce_urn_path_spec(entity_type)) - for x in events_list - if _get_filter(mce=True, entity_type=entity_type)(x) - ] - ) - mcp_urns = set( - [ - _get_element(x, _get_mcp_urn_path_spec()) - for x in events_list - if _get_filter(mcp=True, entity_type=entity_type)(x) - ] - ) + mce_urns = { + _get_element(x, _get_mce_urn_path_spec(entity_type)) + for x in events_list + if _get_filter(mce=True, entity_type=entity_type)(x) + } + mcp_urns = { + _get_element(x, _get_mcp_urn_path_spec()) + for x in events_list + if _get_filter(mcp=True, entity_type=entity_type)(x) + } all_urns = mce_urns.union(mcp_urns) return all_urns @@ -268,20 +264,16 @@ def assert_for_each_entity( test_output = load_json_file(file) assert isinstance(test_output, list) # mce urns - mce_urns = set( - [ - _get_element(x, _get_mce_urn_path_spec(entity_type)) - for x in test_output - if _get_filter(mce=True, entity_type=entity_type)(x) - ] - ) - mcp_urns = set( - [ - _get_element(x, _get_mcp_urn_path_spec()) - for x in test_output - if _get_filter(mcp=True, entity_type=entity_type)(x) - ] - ) + mce_urns = { + _get_element(x, _get_mce_urn_path_spec(entity_type)) + for x in test_output + if _get_filter(mce=True, entity_type=entity_type)(x) + } + mcp_urns = { + _get_element(x, _get_mcp_urn_path_spec()) + for x in test_output + if _get_filter(mcp=True, entity_type=entity_type)(x) + } all_urns = mce_urns.union(mcp_urns) # there should not be any None urns assert None not in all_urns @@ -378,20 +370,16 @@ def assert_entity_urn_not_like(entity_type: str, regex_pattern: str, file: str) test_output = load_json_file(file) assert isinstance(test_output, list) # mce urns - mce_urns = set( - [ - _get_element(x, _get_mce_urn_path_spec(entity_type)) - for x in test_output - if _get_filter(mce=True, entity_type=entity_type)(x) - ] - ) - mcp_urns = set( - [ - _get_element(x, _get_mcp_urn_path_spec()) - for x in test_output - if _get_filter(mcp=True, entity_type=entity_type)(x) - ] - ) + mce_urns = { + _get_element(x, _get_mce_urn_path_spec(entity_type)) + for x in test_output + if _get_filter(mce=True, entity_type=entity_type)(x) + } + mcp_urns = { + _get_element(x, _get_mcp_urn_path_spec()) + for x in test_output + if _get_filter(mcp=True, entity_type=entity_type)(x) + } all_urns = mce_urns.union(mcp_urns) print(all_urns) matched_urns = [u for u in all_urns if re.match(regex_pattern, u)] @@ -406,20 +394,16 @@ def assert_entity_urn_like(entity_type: str, regex_pattern: str, file: str) -> i test_output = load_json_file(file) assert isinstance(test_output, list) # mce urns - mce_urns = set( - [ - _get_element(x, _get_mce_urn_path_spec(entity_type)) - for x in test_output - if _get_filter(mce=True, entity_type=entity_type)(x) - ] - 
) - mcp_urns = set( - [ - _get_element(x, _get_mcp_urn_path_spec()) - for x in test_output - if _get_filter(mcp=True, entity_type=entity_type)(x) - ] - ) + mce_urns = { + _get_element(x, _get_mce_urn_path_spec(entity_type)) + for x in test_output + if _get_filter(mce=True, entity_type=entity_type)(x) + } + mcp_urns = { + _get_element(x, _get_mcp_urn_path_spec()) + for x in test_output + if _get_filter(mcp=True, entity_type=entity_type)(x) + } all_urns = mce_urns.union(mcp_urns) print(all_urns) matched_urns = [u for u in all_urns if re.match(regex_pattern, u)] diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py index f28c7167ca319..d995404ad69a5 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py @@ -201,7 +201,7 @@ def test_auto_browse_path_v2_by_container_hierarchy(telemetry_ping_mock): assert paths["i"] == _make_container_browse_path_entries(["one", "a"]) # Check urns emitted on demand -- not all at end - for urn in set(wu.get_urn() for wu in new_wus): + for urn in {wu.get_urn() for wu in new_wus}: try: idx = next( i diff --git a/metadata-ingestion/tests/unit/config/test_config_loader.py b/metadata-ingestion/tests/unit/config/test_config_loader.py index f9a4076e18363..25ee289ec4e4e 100644 --- a/metadata-ingestion/tests/unit/config/test_config_loader.py +++ b/metadata-ingestion/tests/unit/config/test_config_loader.py @@ -52,7 +52,7 @@ "VAR1": "stuff1", "VAR2": "stuff2", }, - set(["VAR1", "UNSET_VAR3", "VAR2"]), + {"VAR1", "UNSET_VAR3", "VAR2"}, ), ( "tests/unit/config/complex_variable_expansion.yml", @@ -107,22 +107,20 @@ "VAR10": "stuff10", "VAR11": "stuff11", }, - set( - [ - "VAR1", - "VAR2", - "VAR3", - "VAR4", - "VAR5", - "VAR6", - "VAR7", - "VAR8", - "VAR9", - "VAR10", - # VAR11 is escaped and hence not referenced - "VARNONEXISTENT", - ] - ), + { + "VAR1", + "VAR2", + "VAR3", + "VAR4", + "VAR5", + "VAR6", + "VAR7", + "VAR8", + "VAR9", + "VAR10", + # VAR11 is escaped and hence not referenced + "VARNONEXISTENT", + }, ), ], ) diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py index 783b0fe18b29a..50d9b86b3a017 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py @@ -73,7 +73,7 @@ class DummySource(StatefulIngestionSourceBase): reporter: DummySourceReport def __init__(self, config: DummySourceConfig, ctx: PipelineContext): - super(DummySource, self).__init__(config, ctx) + super().__init__(config, ctx) self.source_config = config self.reporter = DummySourceReport() # Create and register the stateful ingestion use-case handler. 
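The DummySource hunk above is one of roughly a dozen in this patch that call super().__init__(...) without arguments; on Python 3 the zero-argument form resolves the class and instance from the enclosing method, so it is equivalent to the old two-argument spelling. A minimal sketch with hypothetical stand-in classes:

    # super() with no arguments is equivalent to super(Derived, self) inside the method
    class Base:
        def __init__(self, config, ctx):
            self.config, self.ctx = config, ctx

    class Derived(Base):
        def __init__(self, config, ctx):
            super().__init__(config, ctx)  # same as super(Derived, self).__init__(config, ctx)

    d = Derived({"enabled": True}, ctx=None)
    assert d.config == {"enabled": True} and d.ctx is None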
diff --git a/metadata-ingestion/tests/unit/test_pipeline.py b/metadata-ingestion/tests/unit/test_pipeline.py index 194a396edb310..bcc0f73a5c967 100644 --- a/metadata-ingestion/tests/unit/test_pipeline.py +++ b/metadata-ingestion/tests/unit/test_pipeline.py @@ -29,7 +29,7 @@ pytestmark = pytest.mark.random_order(disabled=True) -class TestPipeline(object): +class TestPipeline: @patch("datahub.ingestion.source.kafka.KafkaSource.get_workunits", autospec=True) @patch("datahub.ingestion.sink.console.ConsoleSink.close", autospec=True) @freeze_time(FROZEN_TIME) diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py index 9e33ba6132e06..fc753f99b7e8f 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_shares.py +++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py @@ -284,7 +284,7 @@ def test_snowflake_shares_workunit_outbound_share( ] entity_urns.add(wu.get_urn()) - assert len((entity_urns)) == 6 + assert len(entity_urns) == 6 def test_snowflake_shares_workunit_inbound_and_outbound_share( diff --git a/metadata-ingestion/tests/unit/utilities/test_advanced_thread_executor.py b/metadata-ingestion/tests/unit/utilities/test_advanced_thread_executor.py index ae4616c604a61..7b51c18a85c5f 100644 --- a/metadata-ingestion/tests/unit/utilities/test_advanced_thread_executor.py +++ b/metadata-ingestion/tests/unit/utilities/test_advanced_thread_executor.py @@ -77,12 +77,12 @@ def test_backpressure_aware_executor_simple(): def task(i): return i - assert set( + assert { res.result() for res in BackpressureAwareExecutor.map( task, ((i,) for i in range(10)), max_workers=2 ) - ) == set(range(10)) + } == set(range(10)) def test_backpressure_aware_executor_advanced(): @@ -119,7 +119,7 @@ def task(x, y): assert 2 <= len(executed) <= 4 # Finally, consume the rest of the results. - assert set(r.result() for r in results) == { + assert {r.result() for r in results} == { i for i in range(10) if i != first_result.result() } diff --git a/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py b/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py index 0384e1f918881..bc915e21389a7 100644 --- a/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py +++ b/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py @@ -8,7 +8,7 @@ def test_rate_is_limited(): MAX_CALLS_PER_SEC = 5 TOTAL_CALLS = 18 - actual_calls: Dict[float, int] = defaultdict(lambda: 0) + actual_calls: Dict[float, int] = defaultdict(int) ratelimiter = RateLimiter(max_calls=MAX_CALLS_PER_SEC, period=1) for _ in range(TOTAL_CALLS):