Merge branch 'datahub-project:master' into master
hsheth2 authored Nov 26, 2024
2 parents 1e0a512 + 094433c commit 1c7106f
Showing 21 changed files with 1,336 additions and 1,409 deletions.
@@ -386,7 +386,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -409,7 +413,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -462,7 +462,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -483,7 +487,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
3 changes: 3 additions & 0 deletions docs/how/updating-datahub.md
@@ -88,6 +88,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Other Notable Changes

- Downgrade to previous version is not automatically supported.
- Data Product Properties Unset side effect introduced
  - Previously, a Dataset could end up linked to multiple Data Products if the links were made directly via the REST API rather than through the UI or GraphQL. This change aligns the REST API behavior with the GraphQL behavior by introducing a side effect that enforces the 1-to-1 constraint between a Dataset and its Data Product.
  - NOTE: A pathological write pattern for Data Products can cause write-processing issues with this side effect. Constantly moving all of the Datasets associated with a Data Product back and forth between multiple Data Products will generate a high volume of writes, because each move must unset the previous associations.

## 0.14.0.2

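To make the behavior described in the note above concrete, here is a minimal, self-contained sketch of the single-Data-Product-per-Dataset enforcement; it is not the DataHub implementation, and the URN strings and helper name are made up for illustration.

    from typing import Dict, List, Tuple

    # Hypothetical in-memory index: each Dataset URN maps to at most one Data Product URN.
    AssetIndex = Dict[str, str]

    def set_data_product_assets(
        index: AssetIndex, data_product: str, datasets: List[str]
    ) -> List[Tuple[str, str]]:
        """Assign datasets to a data product and return the (dataset, previous_data_product)
        associations that had to be unset to keep each dataset in a single data product."""
        unset_ops: List[Tuple[str, str]] = []
        for dataset in datasets:
            previous = index.get(dataset)
            if previous is not None and previous != data_product:
                # This is the extra write the note warns about: moving a dataset to a new
                # data product forces an unset against its previous data product.
                unset_ops.append((dataset, previous))
            index[dataset] = data_product
        return unset_ops

    index: AssetIndex = {}
    set_data_product_assets(index, "urn:li:dataProduct:p1", ["dataset-a", "dataset-b"])
    # Moving both datasets to another data product generates two unset operations.
    print(set_data_product_assets(index, "urn:li:dataProduct:p2", ["dataset-a", "dataset-b"]))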
3 changes: 3 additions & 0 deletions docs/managed-datahub/release-notes/v_0_3_7.md
@@ -120,3 +120,6 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- (system / internal) Exclude form-prompt tests in live Metadata Tests evaluation
- (system / internal) Exclude form-prompt tests in stored Metadata Test results
- Elasticsearch reindex time limit of 8h removed
- Data Product Properties Unset side effect introduced
  - Previously, a Dataset could end up linked to multiple Data Products if the links were made directly via the REST API rather than through the UI or GraphQL. This change aligns the REST API behavior with the GraphQL behavior by introducing a side effect that enforces the 1-to-1 constraint between a Dataset and its Data Product.
  - NOTE: A pathological write pattern for Data Products can cause write-processing issues with this side effect. Constantly moving all of the Datasets associated with a Data Product back and forth between multiple Data Products will generate a high volume of writes, because each move must unset the previous associations.
2 changes: 1 addition & 1 deletion metadata-ingestion/docs/sources/sigma/sigma_pre.md
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@ This source extracts the following:
| Sigma | Datahub | Notes |
|------------------------|---------------------------------------------------------------|----------------------------------|
| `Workspace` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workspace"` |
| `Workbook` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workbook"` |
| `Workbook` | [Dashboard](../../metamodel/entities/dashboard.md) | SubType `"Sigma Workbook"` |
| `Page` | [Dashboard](../../metamodel/entities/dashboard.md) | |
| `Element` | [Chart](../../metamodel/entities/chart.md) | |
| `Dataset` | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Sigma Dataset"` |
@@ -118,7 +118,6 @@ class BigqueryTable(BaseTable):
active_billable_bytes: Optional[int] = None
long_term_billable_bytes: Optional[int] = None
partition_info: Optional[PartitionInfo] = None
columns_ignore_from_profiling: List[str] = field(default_factory=list)
external: bool = False
constraints: List[BigqueryTableConstraint] = field(default_factory=list)
table_type: Optional[str] = None
@@ -598,18 +598,6 @@ def _process_schema(
dataset_name=dataset_name,
)

# This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here
# because the profiler doesn't have access to columns
def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]:
ignore_list: List[str] = []
for column in columns:
if not column.data_type or any(
word in column.data_type.lower()
for word in ["array", "struct", "geography", "json"]
):
ignore_list.append(column.field_path)
return ignore_list

def _process_table(
self,
table: BigqueryTable,
@@ -631,15 +619,6 @@ def _process_table(
)
table.column_count = len(columns)

# We only collect profile ignore list if profiling is enabled and profile_table_level_only is false
if (
self.config.is_profiling_enabled()
and not self.config.profiling.profile_table_level_only
):
table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
columns
)

if not table.column_count:
logger.warning(
f"Table doesn't have any column or unable to get columns for table: {table_identifier}"
@@ -166,12 +166,6 @@ def get_workunits(
normalized_table_name = BigqueryTableIdentifier(
project_id=project_id, dataset=dataset, table=table.name
).get_table_name()
for column in table.columns_ignore_from_profiling:
# Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
# We also filter columns without data type as it means that column is part of a complex type.
self.config.profile_pattern.deny.append(
f"^{normalized_table_name}.{column}$"
)

if table.external and not self.config.profiling.profile_external_tables:
self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1
@@ -7,6 +7,7 @@
import functools
import json
import logging
import re
import threading
import traceback
import unittest.mock
@@ -123,6 +124,8 @@

_datasource_connection_injection_lock = threading.Lock()

NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")


@contextlib.contextmanager
def _inject_connection_into_datasource(conn: Connection) -> Iterator[None]:
@@ -165,11 +168,9 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
return convert_to_json_serializable(element_values.fetchone()[0])
elif self.engine.dialect.name.lower() == BIGQUERY:
element_values = self.engine.execute(
sa.select(
[
sa.func.coalesce(sa.text(f"APPROX_COUNT_DISTINCT(`{column}`)")),
]
).select_from(self._table)
sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
self._table
)
)
return convert_to_json_serializable(element_values.fetchone()[0])
elif self.engine.dialect.name.lower() == SNOWFLAKE:
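For context on the BigQuery branch above: the raw-text APPROX_COUNT_DISTINCT call is replaced with SQLAlchemy constructs. A rough sketch of what the new expression compiles to, assuming SQLAlchemy 1.4+ column-clause style as used in the new code; the table and column names are invented, and the exact rendering depends on the dialect:

    import sqlalchemy as sa

    # sa.func.<name> builds a generic SQL function call, so no BigQuery dialect is needed
    # just to inspect the shape of the statement.
    stmt = sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column("user_id"))).select_from(
        sa.table("events")
    )
    print(stmt)
    # Roughly: SELECT APPROX_COUNT_DISTINCT(user_id) AS "APPROX_COUNT_DISTINCT_1" FROM events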
@@ -378,6 +379,9 @@ def _get_columns_to_profile(self) -> List[str]:
f"{self.dataset_name}.{col}"
):
ignored_columns_by_pattern.append(col)
# We try to ignore nested columns as well
elif not self.config.profile_nested_fields and "." in col:
ignored_columns_by_pattern.append(col)
elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
ignored_columns_by_type.append(col)
else:
@@ -407,9 +411,18 @@ def _get_columns_to_profile(self) -> List[str]:
return columns_to_profile

def _should_ignore_column(self, sqlalchemy_type: sa.types.TypeEngine) -> bool:
return str(sqlalchemy_type) in _get_column_types_to_ignore(
self.dataset.engine.dialect.name
)
# We don't profile columns with None types
if str(sqlalchemy_type) == "NULL":
return True

sql_type = str(sqlalchemy_type)

match = re.match(NORMALIZE_TYPE_PATTERN, sql_type)

if match:
sql_type = match.group(1)

return sql_type in _get_column_types_to_ignore(self.dataset.engine.dialect.name)

@_run_with_query_combiner
def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
@@ -1397,6 +1410,8 @@ def _get_ge_dataset(
def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
if dialect_name.lower() == POSTGRESQL:
return ["JSON"]
elif dialect_name.lower() == BIGQUERY:
return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]

return []

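The NORMALIZE_TYPE_PATTERN introduced above strips any parameterization from a column type string before it is compared against the per-dialect ignore list, which is what lets parameterized BigQuery complex types match the new ARRAY/STRUCT/GEOGRAPHY/JSON entries. A quick standalone check of that normalization (the example type strings are illustrative):

    import re

    NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")

    def normalize(sql_type: str) -> str:
        # Keep only the base type name, dropping anything after '<', '[' or '('.
        match = NORMALIZE_TYPE_PATTERN.match(sql_type)
        return match.group(1) if match else sql_type

    for t in ["ARRAY<STRING>", "STRUCT<a INT64, b STRING>", "NUMERIC(10, 2)", "GEOGRAPHY", "STRING"]:
        print(t, "->", normalize(t))
    # ARRAY<STRING> -> ARRAY, STRUCT<a INT64, b STRING> -> STRUCT, NUMERIC(10, 2) -> NUMERIC, ...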
@@ -188,6 +188,11 @@ class GEProfilingConfig(GEProfilingBaseConfig):
),
)

profile_nested_fields: bool = Field(
default=False,
description="Whether to profile complex types like structs, arrays and maps. ",
)

@pydantic.root_validator(pre=True)
def deprecate_bigquery_temp_table_schema(cls, values):
# TODO: Update docs to remove mention of this field.
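The new profile_nested_fields option pairs with the `"." in col` check added to _get_columns_to_profile earlier in this commit: dotted column paths (nested struct fields) are skipped unless the flag is enabled. A simplified sketch of that selection logic, detached from the profiler class and using made-up column names:

    from typing import List

    def select_columns(columns: List[str], profile_nested_fields: bool = False) -> List[str]:
        """Keep top-level columns; keep dotted (nested) paths only when nested profiling is on."""
        selected: List[str] = []
        for col in columns:
            if not profile_nested_fields and "." in col:
                continue  # nested field such as address.city is skipped by default
            selected.append(col)
        return selected

    cols = ["id", "address.city", "address.zip", "name"]
    print(select_columns(cols))                              # ['id', 'name']
    print(select_columns(cols, profile_nested_fields=True))  # all four columns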
@@ -80,6 +80,7 @@ class Workbook(BaseModel):
path: str
latestVersion: int
workspaceId: Optional[str] = None
description: Optional[str] = None
pages: List[Page] = []
badge: Optional[str] = None
