Merge branch 'datahub-project:master' into master
hsheth2 authored Nov 26, 2024
2 parents 1e0a512 + 094433c commit 1c7106f
Showing 21 changed files with 1,336 additions and 1,409 deletions.
@@ -386,7 +386,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -409,7 +413,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -462,7 +462,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -483,7 +487,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
3 changes: 3 additions & 0 deletions docs/how/updating-datahub.md
@@ -88,6 +88,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Other Notable Changes

- Downgrade to previous version is not automatically supported.
- Data Product Properties Unset side effect introduced
  - Previously, a Dataset could end up linked to multiple Data Products if the links were made directly via the REST API rather than through the UI or GraphQL. This change aligns the REST API behavior with the GraphQL behavior by introducing a side effect that enforces the 1-to-1 constraint between a Dataset and its Data Product.
  - NOTE: A pathological write pattern for Data Products can cause write-processing issues with this side effect. Constantly moving all of the Datasets associated with a Data Product back and forth between multiple Data Products will generate a high volume of writes, because each move must unset the previous associations.

## 0.14.0.2

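To make the behavior described in the note above concrete, here is a minimal, self-contained sketch of the single-Data-Product-per-Dataset enforcement; it is not the DataHub implementation, and the URN strings and helper name are made up for illustration.

    from typing import Dict, List, Tuple

    # Hypothetical in-memory index: each Dataset URN maps to at most one Data Product URN.
    AssetIndex = Dict[str, str]

    def set_data_product_assets(
        index: AssetIndex, data_product: str, datasets: List[str]
    ) -> List[Tuple[str, str]]:
        """Assign datasets to a data product and return the (dataset, previous_data_product)
        associations that had to be unset to keep each dataset in a single data product."""
        unset_ops: List[Tuple[str, str]] = []
        for dataset in datasets:
            previous = index.get(dataset)
            if previous is not None and previous != data_product:
                # This is the extra write the note warns about: moving a dataset to a new
                # data product forces an unset against its previous data product.
                unset_ops.append((dataset, previous))
            index[dataset] = data_product
        return unset_ops

    index: AssetIndex = {}
    set_data_product_assets(index, "urn:li:dataProduct:p1", ["dataset-a", "dataset-b"])
    # Moving both datasets to another data product generates two unset operations.
    print(set_data_product_assets(index, "urn:li:dataProduct:p2", ["dataset-a", "dataset-b"]))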
3 changes: 3 additions & 0 deletions docs/managed-datahub/release-notes/v_0_3_7.md
@@ -120,3 +120,6 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- (system / internal) Exclude form-prompt tests in live Metadata Tests evaluation
- (system / internal) Exclude form-prompt tests in stored Metadata Test results
- Elasticsearch reindex time limit of 8h removed
- Data Product Properties Unset side effect introduced
  - Previously, a Dataset could end up linked to multiple Data Products if the links were made directly via the REST API rather than through the UI or GraphQL. This change aligns the REST API behavior with the GraphQL behavior by introducing a side effect that enforces the 1-to-1 constraint between a Dataset and its Data Product.
  - NOTE: A pathological write pattern for Data Products can cause write-processing issues with this side effect. Constantly moving all of the Datasets associated with a Data Product back and forth between multiple Data Products will generate a high volume of writes, because each move must unset the previous associations.
2 changes: 1 addition & 1 deletion metadata-ingestion/docs/sources/sigma/sigma_pre.md
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@ This source extracts the following:
| Sigma | Datahub | Notes |
|------------------------|---------------------------------------------------------------|----------------------------------|
| `Workspace` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workspace"` |
| `Workbook` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workbook"` |
| `Workbook` | [Dashboard](../../metamodel/entities/dashboard.md) | SubType `"Sigma Workbook"` |
| `Page` | [Dashboard](../../metamodel/entities/dashboard.md) | |
| `Element` | [Chart](../../metamodel/entities/chart.md) | |
| `Dataset` | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Sigma Dataset"` |
@@ -118,7 +118,6 @@ class BigqueryTable(BaseTable):
active_billable_bytes: Optional[int] = None
long_term_billable_bytes: Optional[int] = None
partition_info: Optional[PartitionInfo] = None
columns_ignore_from_profiling: List[str] = field(default_factory=list)
external: bool = False
constraints: List[BigqueryTableConstraint] = field(default_factory=list)
table_type: Optional[str] = None
@@ -598,18 +598,6 @@ def _process_schema(
dataset_name=dataset_name,
)

# This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here
# because the profiler doesn't have access to columns
def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]:
ignore_list: List[str] = []
for column in columns:
if not column.data_type or any(
word in column.data_type.lower()
for word in ["array", "struct", "geography", "json"]
):
ignore_list.append(column.field_path)
return ignore_list

def _process_table(
self,
table: BigqueryTable,
@@ -631,15 +619,6 @@ def _process_table(
)
table.column_count = len(columns)

# We only collect profile ignore list if profiling is enabled and profile_table_level_only is false
if (
self.config.is_profiling_enabled()
and not self.config.profiling.profile_table_level_only
):
table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
columns
)

if not table.column_count:
logger.warning(
f"Table doesn't have any column or unable to get columns for table: {table_identifier}"
@@ -166,12 +166,6 @@ def get_workunits(
normalized_table_name = BigqueryTableIdentifier(
project_id=project_id, dataset=dataset, table=table.name
).get_table_name()
for column in table.columns_ignore_from_profiling:
# Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
# We also filter columns without data type as it means that column is part of a complex type.
self.config.profile_pattern.deny.append(
f"^{normalized_table_name}.{column}$"
)

if table.external and not self.config.profiling.profile_external_tables:
self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1
@@ -7,6 +7,7 @@
import functools
import json
import logging
import re
import threading
import traceback
import unittest.mock
@@ -123,6 +124,8 @@

_datasource_connection_injection_lock = threading.Lock()

NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")


@contextlib.contextmanager
def _inject_connection_into_datasource(conn: Connection) -> Iterator[None]:
@@ -165,11 +168,9 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
return convert_to_json_serializable(element_values.fetchone()[0])
elif self.engine.dialect.name.lower() == BIGQUERY:
element_values = self.engine.execute(
sa.select(
[
sa.func.coalesce(sa.text(f"APPROX_COUNT_DISTINCT(`{column}`)")),
]
).select_from(self._table)
sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
self._table
)
)
return convert_to_json_serializable(element_values.fetchone()[0])
elif self.engine.dialect.name.lower() == SNOWFLAKE:
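For context on the BigQuery branch above: the raw-text APPROX_COUNT_DISTINCT call is replaced with SQLAlchemy constructs. A rough sketch of what the new expression compiles to, assuming SQLAlchemy 1.4+ column-clause style as used in the new code; the table and column names are invented, and the exact rendering depends on the dialect:

    import sqlalchemy as sa

    # sa.func.<name> builds a generic SQL function call, so no BigQuery dialect is needed
    # just to inspect the shape of the statement.
    stmt = sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column("user_id"))).select_from(
        sa.table("events")
    )
    print(stmt)
    # Roughly: SELECT APPROX_COUNT_DISTINCT(user_id) AS "APPROX_COUNT_DISTINCT_1" FROM events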
@@ -378,6 +379,9 @@ def _get_columns_to_profile(self) -> List[str]:
f"{self.dataset_name}.{col}"
):
ignored_columns_by_pattern.append(col)
# We try to ignore nested columns as well
elif not self.config.profile_nested_fields and "." in col:
ignored_columns_by_pattern.append(col)
elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
ignored_columns_by_type.append(col)
else:
@@ -407,9 +411,18 @@ def _get_columns_to_profile(self) -> List[str]:
return columns_to_profile

def _should_ignore_column(self, sqlalchemy_type: sa.types.TypeEngine) -> bool:
return str(sqlalchemy_type) in _get_column_types_to_ignore(
self.dataset.engine.dialect.name
)
# We don't profile columns with None types
if str(sqlalchemy_type) == "NULL":
return True

sql_type = str(sqlalchemy_type)

match = re.match(NORMALIZE_TYPE_PATTERN, sql_type)

if match:
sql_type = match.group(1)

return sql_type in _get_column_types_to_ignore(self.dataset.engine.dialect.name)

@_run_with_query_combiner
def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
@@ -1397,6 +1410,8 @@ def _get_ge_dataset(
def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
if dialect_name.lower() == POSTGRESQL:
return ["JSON"]
elif dialect_name.lower() == BIGQUERY:
return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]

return []

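The NORMALIZE_TYPE_PATTERN introduced above strips any parameterization from a column type string before it is compared against the per-dialect ignore list, which is what lets parameterized BigQuery complex types match the new ARRAY/STRUCT/GEOGRAPHY/JSON entries. A quick standalone check of that normalization (the example type strings are illustrative):

    import re

    NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")

    def normalize(sql_type: str) -> str:
        # Keep only the base type name, dropping anything after '<', '[' or '('.
        match = NORMALIZE_TYPE_PATTERN.match(sql_type)
        return match.group(1) if match else sql_type

    for t in ["ARRAY<STRING>", "STRUCT<a INT64, b STRING>", "NUMERIC(10, 2)", "GEOGRAPHY", "STRING"]:
        print(t, "->", normalize(t))
    # ARRAY<STRING> -> ARRAY, STRUCT<a INT64, b STRING> -> STRUCT, NUMERIC(10, 2) -> NUMERIC, ...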
@@ -188,6 +188,11 @@ class GEProfilingConfig(GEProfilingBaseConfig):
),
)

profile_nested_fields: bool = Field(
default=False,
description="Whether to profile complex types like structs, arrays and maps. ",
)

@pydantic.root_validator(pre=True)
def deprecate_bigquery_temp_table_schema(cls, values):
# TODO: Update docs to remove mention of this field.
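The new profile_nested_fields option pairs with the `"." in col` check added to _get_columns_to_profile earlier in this commit: dotted column paths (nested struct fields) are skipped unless the flag is enabled. A simplified sketch of that selection logic, detached from the profiler class and using made-up column names:

    from typing import List

    def select_columns(columns: List[str], profile_nested_fields: bool = False) -> List[str]:
        """Keep top-level columns; keep dotted (nested) paths only when nested profiling is on."""
        selected: List[str] = []
        for col in columns:
            if not profile_nested_fields and "." in col:
                continue  # nested field such as address.city is skipped by default
            selected.append(col)
        return selected

    cols = ["id", "address.city", "address.zip", "name"]
    print(select_columns(cols))                              # ['id', 'name']
    print(select_columns(cols, profile_nested_fields=True))  # all four columns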
@@ -80,6 +80,7 @@ class Workbook(BaseModel):
path: str
latestVersion: int
workspaceId: Optional[str] = None
description: Optional[str] = None
pages: List[Page] = []
badge: Optional[str] = None
