Commit 98a82b8
Merge branch 'datahub-project:master' into master
anshbansal authored Jul 9, 2024
2 parents 8b5ec06 + b6c7fe8
Showing 88 changed files with 6,741 additions and 2,106 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/pr-labeler.yml
@@ -40,12 +40,12 @@ jobs:
"RyanHolstien",
"Kunal-kankriya",
"purnimagarg1",
"gaurav2733",
"dushayntAW",
"AvaniSiddhapuraAPT",
"akarsh991",
"shubhamjagtap639",
"mayurinehate"
"sagar-salvi-apptware",
"kushagra-apptware",
"Salman-Apptware",
"mayurinehate",
"noggi"
]'),
github.actor
)
5 changes: 5 additions & 0 deletions docs-website/docusaurus.config.js
@@ -19,6 +19,11 @@ module.exports = {
async: true,
defer: true,
},
+ {
+   src: "/scripts/rb2b.js",
+   async: true,
+   defer: true,
+ }
],
noIndex: isSaas,
customFields: {
1 change: 1 addition & 0 deletions docs-website/static/scripts/rb2b.js

(Generated file; contents not rendered.)

2 changes: 1 addition & 1 deletion docs/actions/actions/slack.md
@@ -190,7 +190,7 @@ Similar to the quickstart scenario, there are no specific software installation
If you are using the `datahub-actions` library directly from Python, or the `datahub-actions` cli directly, then you need to first install the `slack` action plugin in your Python virtualenv.

```
pip install "datahub-actions[slack]"
pip install "acryl-datahub-actions[slack]"
```
Then run the action with a configuration file that you have modified to capture your credentials and configuration.
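For reference, running the installed action is typically a one-liner; the config filename below is illustrative:

```
datahub actions -c slack_action.yaml
```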
2 changes: 1 addition & 1 deletion docs/actions/actions/teams.md
@@ -95,7 +95,7 @@ Similar to the quickstart scenario, there are no specific software installation
If you are using the `datahub-actions` library directly from Python, or the `datahub-actions` cli directly, then you need to first install the `teams` action plugin in your Python virtualenv.

```
pip install "datahub-actions[teams]"
pip install "acryl-datahub-actions[teams]"
```

Then run the action with a configuration file that you have modified to capture your credentials and configuration.
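As with the Slack plugin, the action could then be launched with something along the lines of `datahub actions -c teams_action.yaml` (filename illustrative).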
6 changes: 4 additions & 2 deletions metadata-ingestion/setup.py
@@ -166,6 +166,7 @@
"sql-metadata==2.2.2",
*sqllineage_lib,
"GitPython>2",
"python-liquid",
}

bigquery_common = {
@@ -343,6 +344,7 @@
"feast": {
"feast>=0.34.0,<1",
"flask-openid>=1.3.0",
"dask[dataframe]<2024.7.0",
},
"glue": aws_common,
# hdbcli is supported officially by SAP, sqlalchemy-hana is built on top but not officially supported
@@ -370,7 +372,7 @@
"kafka-connect": sql_common | {"requests", "JPype1"},
"ldap": {"python-ldap>=2.4"},
"looker": looker_common,
"lookml": looker_common,
"lookml": looker_common | sqlglot_lib,
"metabase": {"requests"} | sqlglot_lib,
"mlflow": {
"mlflow-skinny>=2.3.0",
@@ -511,7 +513,7 @@
"flake8-tidy-imports>=4.3.0",
"flake8-bugbear==23.3.12",
"isort>=5.7.0",
"mypy==1.0.0",
"mypy==1.10.1",
*test_api_requirements,
pytest_dep,
"pytest-asyncio>=0.16.0",
23 changes: 22 additions & 1 deletion metadata-ingestion/src/datahub/cli/cli_utils.py
@@ -13,11 +13,17 @@
from requests.models import Response
from requests.sessions import Session

+ import datahub
from datahub.cli import config_utils
from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.request_helper import make_curl_command
from datahub.emitter.serialization_helper import post_json_transform
- from datahub.metadata.schema_classes import _Aspect
+ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+     MetadataChangeEvent,
+     MetadataChangeProposal,
+ )
+ from datahub.metadata.schema_classes import SystemMetadataClass, _Aspect
from datahub.utilities.urns.urn import Urn, guess_entity_type

log = logging.getLogger(__name__)
@@ -689,3 +695,18 @@ def generate_access_token(
return token_name, response.json().get("data", {}).get("createAccessToken", {}).get(
"accessToken", None
)


+ def ensure_has_system_metadata(
+     event: Union[
+         MetadataChangeProposal, MetadataChangeProposalWrapper, MetadataChangeEvent
+     ]
+ ) -> None:
+     if event.systemMetadata is None:
+         event.systemMetadata = SystemMetadataClass()
+     metadata = event.systemMetadata
+     if metadata.properties is None:
+         metadata.properties = {}
+     props = metadata.properties
+     props["clientId"] = datahub.__package_name__
+     props["clientVersion"] = datahub.__version__
20 changes: 12 additions & 8 deletions metadata-ingestion/src/datahub/emitter/rest_emitter.py
@@ -10,7 +10,11 @@
from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import HTTPError, RequestException

- from datahub.cli.cli_utils import fixup_gms_url, get_system_auth
+ from datahub.cli.cli_utils import (
+     ensure_has_system_metadata,
+     fixup_gms_url,
+     get_system_auth,
+ )
from datahub.configuration.common import ConfigurationError, OperationalError
from datahub.emitter.generic_emitter import Emitter
from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -228,12 +232,10 @@ def emit_mce(self, mce: MetadataChangeEvent) -> None:
snapshot_fqn = (
f"com.linkedin.metadata.snapshot.{mce.proposedSnapshot.RECORD_SCHEMA.name}"
)
-         system_metadata_obj = {}
-         if mce.systemMetadata is not None:
-             system_metadata_obj = {
-                 "lastObserved": mce.systemMetadata.lastObserved,
-                 "runId": mce.systemMetadata.runId,
-             }
+         ensure_has_system_metadata(mce)
+         # To make lint happy
+         assert mce.systemMetadata is not None
+         system_metadata_obj = mce.systemMetadata.to_obj()
snapshot = {
"entity": {"value": {snapshot_fqn: mce_obj}},
"systemMetadata": system_metadata_obj,
@@ -246,7 +248,7 @@ def emit_mcp(
self, mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper]
) -> None:
url = f"{self._gms_server}/aspects?action=ingestProposal"

+         ensure_has_system_metadata(mcp)
mcp_obj = pre_json_transform(mcp.to_obj())
payload = json.dumps({"proposal": mcp_obj})

@@ -256,6 +258,8 @@ def emit_mcps(
self, mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]]
) -> None:
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+         for mcp in mcps:
+             ensure_has_system_metadata(mcp)

mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
payload = json.dumps({"proposals": mcp_objs})
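With these changes every emit path (`emit_mce`, `emit_mcp`, `emit_mcps`) stamps system metadata before the HTTP request goes out, so callers no longer need to populate it themselves. A rough sketch of the batch path, assuming a reachable GMS at a placeholder URL:

```
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import StatusClass

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")  # placeholder

mcps = [
    MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
        aspect=StatusClass(removed=False),
    )
]

# ensure_has_system_metadata() runs on each proposal inside emit_mcps,
# so clientId/clientVersion are attached automatically.
emitter.emit_mcps(mcps)
```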
@@ -629,7 +629,7 @@ def get_fields_from_schema(
logger.error(
"Failed to get fields from schema, continuing...", exc_info=e
)
-             return []
+             return
else:
raise
json_type = cls._get_type_from_schema(jsonref_schema_dict)
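This fix, and the two below, replace `return []` with a bare `return` inside generator functions. Because these functions contain `yield`, a returned value is never seen by the caller as a result; it only becomes the `StopIteration` value, so the bare form says what actually happens (and is what stricter type checkers, like the mypy version pinned above, expect). A small standalone illustration:

```
from typing import Iterable

def numbers(fail: bool) -> Iterable[int]:
    if fail:
        return  # ends iteration; `return []` here would not yield a list
    yield 1
    yield 2

assert list(numbers(fail=True)) == []   # caller just sees an empty iterator
assert list(numbers(fail=False)) == [1, 2]
```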
@@ -644,7 +644,7 @@ def _query_project_list(self) -> Iterable[BigqueryProject]:
"Maybe resourcemanager.projects.get permission is missing for the service account. "
"You can assign predefined roles/bigquery.metadataViewer role to your service account.",
)
-             return []
+             return

for project in projects:
if self.config.project_id_pattern.allowed(project.id):
@@ -221,7 +221,7 @@ def _get_dpi_workunits(
f"Status should be either SUCCESSFUL, FAILURE_WITH_TASK or CANCELED and it was "
f"{job.status}"
)
-             return []
+             return
result = status_result_map[job.status]
start_timestamp_millis = job.start_time * 1000
for mcp in dpi.generate_mcp(
(The remaining changed files were still loading and are not shown.)
