From a814cb389f364ab4a4782d3f4987fe047d07962a Mon Sep 17 00:00:00 2001
From: sagar-salvi-apptware
<159135491+sagar-salvi-apptware@users.noreply.github.com>
Date: Thu, 2 Jan 2025 17:59:22 +0530
Subject: [PATCH 1/8] fix(ingest/bigquery): All View generation when queries_v2
is turned off (#12181)
Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
---
docs/how/updating-datahub.md | 1 +
.../ingestion/source/bigquery_v2/bigquery.py | 28 +-
.../source/bigquery_v2/bigquery_config.py | 12 +-
.../source/bigquery_v2/bigquery_schema_gen.py | 28 +-
.../ingestion/source/bigquery_v2/lineage.py | 31 +-
.../bigquery_mcp_lineage_golden_1.json | 977 +++++++++++++++
.../bigquery_mcp_lineage_golden_2.json | 1064 +++++++++++++++++
.../integration/bigquery_v2/test_bigquery.py | 145 +++
8 files changed, 2220 insertions(+), 66 deletions(-)
create mode 100644 metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_1.json
create mode 100644 metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_2.json
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index d6620fde0bf79..19261da23bcf9 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -18,6 +18,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
## Next
- #12191 - Configs `include_view_lineage` and `include_view_column_lineage` are removed from snowflake ingestion source. View and External Table DDL lineage will always be ingested when definitions are available.
+- #12181 - Configs `include_view_lineage`, `include_view_column_lineage` and `lineage_parse_view_ddl` have been removed from the BigQuery ingestion source. View and Snapshot lineage will always be ingested when definitions are available.
- #11560 - The PowerBI ingestion source configuration option include_workspace_name_in_dataset_urn determines whether the workspace name is included in the PowerBI dataset's URN.
PowerBI allows to have identical name of semantic model and their tables across the workspace, It will overwrite the semantic model in-case of multi-workspace ingestion.
Entity urn with `include_workspace_name_in_dataset_urn: false`
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index 16a5268a2dea7..38eab3606b7e9 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -206,9 +206,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport:
def _init_schema_resolver(self) -> SchemaResolver:
schema_resolution_required = (
- self.config.use_queries_v2
- or self.config.lineage_parse_view_ddl
- or self.config.lineage_use_sql_parser
+ self.config.use_queries_v2 or self.config.lineage_use_sql_parser
)
schema_ingestion_enabled = (
self.config.include_schema_metadata
@@ -255,18 +253,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
for project in projects:
yield from self.bq_schema_extractor.get_project_workunits(project)
- if self.config.use_queries_v2:
- # Always ingest View and Snapshot lineage with schema ingestion
- self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
-
- yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
- [p.id for p in projects],
- self.bq_schema_extractor.view_refs_by_project,
- self.bq_schema_extractor.view_definitions,
- self.bq_schema_extractor.snapshot_refs_by_project,
- self.bq_schema_extractor.snapshots_by_ref,
- )
+ self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
+ yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+ [p.id for p in projects],
+ self.bq_schema_extractor.view_refs_by_project,
+ self.bq_schema_extractor.view_definitions,
+ self.bq_schema_extractor.snapshot_refs_by_project,
+ self.bq_schema_extractor.snapshots_by_ref,
+ )
+ if self.config.use_queries_v2:
# if both usage and lineage are disabled then skip queries extractor piece
if (
not self.config.include_usage_statistics
@@ -306,10 +302,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
if self.config.include_table_lineage:
yield from self.lineage_extractor.get_lineage_workunits(
[p.id for p in projects],
- self.bq_schema_extractor.view_refs_by_project,
- self.bq_schema_extractor.view_definitions,
- self.bq_schema_extractor.snapshot_refs_by_project,
- self.bq_schema_extractor.snapshots_by_ref,
self.bq_schema_extractor.table_refs,
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index 4af41921c9fa3..ef323260b014e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -463,10 +463,6 @@ def have_table_data_read_permission(self) -> bool:
default=True,
description="Use sql parser to resolve view/table lineage.",
)
- lineage_parse_view_ddl: bool = Field(
- default=True,
- description="Sql parse view ddl to get lineage.",
- )
lineage_sql_parser_use_raw_names: bool = Field(
default=False,
@@ -572,11 +568,9 @@ def have_table_data_read_permission(self) -> bool:
"See [this](https://cloud.google.com/bigquery/docs/information-schema-jobs#scope_and_syntax) for details.",
)
- # include_view_lineage and include_view_column_lineage are inherited from SQLCommonConfig
- # but not used in bigquery so we hide them from docs.
- include_view_lineage: bool = Field(default=True, hidden_from_docs=True)
-
- include_view_column_lineage: bool = Field(default=True, hidden_from_docs=True)
+ _include_view_lineage = pydantic_removed_field("include_view_lineage")
+ _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")
+ _lineage_parse_view_ddl = pydantic_removed_field("lineage_parse_view_ddl")
@root_validator(pre=True)
def set_include_schema_metadata(cls, values: Dict) -> Dict:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
index 4a3b47f6b543a..bc2688e6b481a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -653,14 +653,11 @@ def _process_view(
self.report.report_dropped(table_identifier.raw_table_name())
return
- if self.store_table_refs:
- table_ref = str(
- BigQueryTableRef(table_identifier).get_sanitized_table_ref()
- )
- self.table_refs.add(table_ref)
- if self.config.lineage_parse_view_ddl and view.view_definition:
- self.view_refs_by_project[project_id].add(table_ref)
- self.view_definitions[table_ref] = view.view_definition
+ table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+ self.table_refs.add(table_ref)
+ if view.view_definition:
+ self.view_refs_by_project[project_id].add(table_ref)
+ self.view_definitions[table_ref] = view.view_definition
view.column_count = len(columns)
if not view.column_count:
@@ -701,14 +698,11 @@ def _process_snapshot(
f"Snapshot doesn't have any column or unable to get columns for snapshot: {table_identifier}"
)
- if self.store_table_refs:
- table_ref = str(
- BigQueryTableRef(table_identifier).get_sanitized_table_ref()
- )
- self.table_refs.add(table_ref)
- if snapshot.base_table_identifier:
- self.snapshot_refs_by_project[project_id].add(table_ref)
- self.snapshots_by_ref[table_ref] = snapshot
+ table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+ self.table_refs.add(table_ref)
+ if snapshot.base_table_identifier:
+ self.snapshot_refs_by_project[project_id].add(table_ref)
+ self.snapshots_by_ref[table_ref] = snapshot
yield from self.gen_snapshot_dataset_workunits(
table=snapshot,
@@ -1148,7 +1142,7 @@ def gen_schema_metadata(
foreignKeys=foreign_keys if foreign_keys else None,
)
- if self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser:
+ if self.config.lineage_use_sql_parser:
self.sql_parser_schema_resolver.add_schema_metadata(
dataset_urn, schema_metadata
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
index 321b1b6207fab..ba3357aa8ca20 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
@@ -291,16 +291,15 @@ def get_lineage_workunits_for_views_and_snapshots(
snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
) -> Iterable[MetadataWorkUnit]:
for project in projects:
- if self.config.lineage_parse_view_ddl:
- for view in view_refs_by_project[project]:
- self.datasets_skip_audit_log_lineage.add(view)
- self.aggregator.add_view_definition(
- view_urn=self.identifiers.gen_dataset_urn_from_raw_ref(
- BigQueryTableRef.from_string_name(view)
- ),
- view_definition=view_definitions[view],
- default_db=project,
- )
+ for view in view_refs_by_project[project]:
+ self.datasets_skip_audit_log_lineage.add(view)
+ self.aggregator.add_view_definition(
+ view_urn=self.identifiers.gen_dataset_urn_from_raw_ref(
+ BigQueryTableRef.from_string_name(view)
+ ),
+ view_definition=view_definitions[view],
+ default_db=project,
+ )
for snapshot_ref in snapshot_refs_by_project[project]:
snapshot = snapshots_by_ref[snapshot_ref]
@@ -322,23 +321,11 @@ def get_lineage_workunits_for_views_and_snapshots(
def get_lineage_workunits(
self,
projects: List[str],
- view_refs_by_project: Dict[str, Set[str]],
- view_definitions: FileBackedDict[str],
- snapshot_refs_by_project: Dict[str, Set[str]],
- snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
table_refs: Set[str],
) -> Iterable[MetadataWorkUnit]:
if not self._should_ingest_lineage():
return
- yield from self.get_lineage_workunits_for_views_and_snapshots(
- projects,
- view_refs_by_project,
- view_definitions,
- snapshot_refs_by_project,
- snapshots_by_ref,
- )
-
if self.config.use_exported_bigquery_audit_metadata:
projects = ["*"] # project_id not used when using exported metadata
diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_1.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_1.json
new file mode 100644
index 0000000000000..8f411ca513771
--- /dev/null
+++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_1.json
@@ -0,0 +1,977 @@
+[
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "containerProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "platform": "bigquery",
+ "env": "PROD",
+ "project_id": "project-id-1"
+ },
+ "name": "project-id-1",
+ "env": "PROD"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Project"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "containerProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "platform": "bigquery",
+ "env": "PROD",
+ "project_id": "project-id-1",
+ "dataset_id": "bigquery-dataset-1",
+ "location": "US"
+ },
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m4!1m3!3m2!1sproject-id-1!2sbigquery-dataset-1",
+ "name": "bigquery-dataset-1",
+ "env": "PROD"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Dataset"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "project-id-1.bigquery-dataset-1.table-1",
+ "platform": "urn:li:dataPlatform:bigquery",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "age",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INT",
+ "recursive": false,
+ "globalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:Test Policy Tag"
+ }
+ ]
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ },
+ {
+ "fieldPath": "email",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "STRING",
+ "recursive": false,
+ "globalTags": {
+ "tags": []
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1",
+ "name": "table-1",
+ "qualifiedName": "project-id-1.bigquery-dataset-1.table-1",
+ "description": "",
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Table"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ },
+ {
+ "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "project-id-1.bigquery-dataset-1.view-1",
+ "platform": "urn:li:dataPlatform:bigquery",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "age",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INT",
+ "recursive": false,
+ "globalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:Test Policy Tag"
+ }
+ ]
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ },
+ {
+ "fieldPath": "email",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "STRING",
+ "recursive": false,
+ "globalTags": {
+ "tags": []
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3sview-1",
+ "name": "view-1",
+ "qualifiedName": "project-id-1.bigquery-dataset-1.view-1",
+ "description": "",
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "View"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "viewProperties",
+ "aspect": {
+ "json": {
+ "materialized": false,
+ "viewLogic": "create view `bigquery-dataset-1.view-1` as select email from `bigquery-dataset-1.table-1`",
+ "viewLanguage": "SQL"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ },
+ {
+ "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "project-id-1.bigquery-dataset-1.snapshot-table-1",
+ "platform": "urn:li:dataPlatform:bigquery",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "age",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INT",
+ "recursive": false,
+ "globalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:Test Policy Tag"
+ }
+ ]
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ },
+ {
+ "fieldPath": "email",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "STRING",
+ "recursive": false,
+ "globalTags": {
+ "tags": []
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3ssnapshot-table-1",
+ "name": "snapshot-table-1",
+ "qualifiedName": "project-id-1.bigquery-dataset-1.snapshot-table-1",
+ "description": "",
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Bigquery Table Snapshot"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": {
+ "upstreams": [
+ {
+ "auditStamp": {
+ "time": 1643871600000,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "type": "COPY"
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),age)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),age)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),email)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ },
+ {
+ "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": {
+ "upstreams": [
+ {
+ "auditStamp": {
+ "time": 1643871600000,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "type": "VIEW",
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29"
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "queryProperties",
+ "aspect": {
+ "json": {
+ "statement": {
+ "value": "CREATE VIEW `bigquery-dataset-1.view-1` AS\nSELECT\n email\nFROM `bigquery-dataset-1.table-1`",
+ "language": "SQL"
+ },
+ "source": "SYSTEM",
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "lastModified": {
+ "time": 1643871600000,
+ "actor": "urn:li:corpuser:_ingestion"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "querySubjects",
+ "aspect": {
+ "json": {
+ "subjects": [
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)"
+ },
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "tag",
+ "entityUrn": "urn:li:tag:Test Policy Tag",
+ "changeType": "UPSERT",
+ "aspectName": "tagKey",
+ "aspect": {
+ "json": {
+ "name": "Test Policy Tag"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-0mn4n3",
+ "lastRunId": "no-run-id-provided"
+ }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_2.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_2.json
new file mode 100644
index 0000000000000..26abc09569ccf
--- /dev/null
+++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_2.json
@@ -0,0 +1,1064 @@
+[
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "containerProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "platform": "bigquery",
+ "env": "PROD",
+ "project_id": "project-id-1"
+ },
+ "name": "project-id-1",
+ "env": "PROD"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Project"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "containerProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "platform": "bigquery",
+ "env": "PROD",
+ "project_id": "project-id-1",
+ "dataset_id": "bigquery-dataset-1",
+ "location": "US"
+ },
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m4!1m3!3m2!1sproject-id-1!2sbigquery-dataset-1",
+ "name": "bigquery-dataset-1",
+ "env": "PROD"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Dataset"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "project-id-1.bigquery-dataset-1.table-1",
+ "platform": "urn:li:dataPlatform:bigquery",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "age",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INT",
+ "recursive": false,
+ "globalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:Test Policy Tag"
+ }
+ ]
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ },
+ {
+ "fieldPath": "email",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "STRING",
+ "recursive": false,
+ "globalTags": {
+ "tags": []
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1",
+ "name": "table-1",
+ "qualifiedName": "project-id-1.bigquery-dataset-1.table-1",
+ "description": "",
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Table"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ },
+ {
+ "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "project-id-1.bigquery-dataset-1.view-1",
+ "platform": "urn:li:dataPlatform:bigquery",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "age",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INT",
+ "recursive": false,
+ "globalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:Test Policy Tag"
+ }
+ ]
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ },
+ {
+ "fieldPath": "email",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "STRING",
+ "recursive": false,
+ "globalTags": {
+ "tags": []
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3sview-1",
+ "name": "view-1",
+ "qualifiedName": "project-id-1.bigquery-dataset-1.view-1",
+ "description": "",
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "View"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "viewProperties",
+ "aspect": {
+ "json": {
+ "materialized": false,
+ "viewLogic": "create view `bigquery-dataset-1.view-1` as select email from `bigquery-dataset-1.table-1`",
+ "viewLanguage": "SQL"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ },
+ {
+ "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "schemaMetadata",
+ "aspect": {
+ "json": {
+ "schemaName": "project-id-1.bigquery-dataset-1.snapshot-table-1",
+ "platform": "urn:li:dataPlatform:bigquery",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "age",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INT",
+ "recursive": false,
+ "globalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:Test Policy Tag"
+ }
+ ]
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ },
+ {
+ "fieldPath": "email",
+ "nullable": false,
+ "description": "comment",
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "STRING",
+ "recursive": false,
+ "globalTags": {
+ "tags": []
+ },
+ "isPartOfKey": false,
+ "isPartitioningKey": false
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3ssnapshot-table-1",
+ "name": "snapshot-table-1",
+ "qualifiedName": "project-id-1.bigquery-dataset-1.snapshot-table-1",
+ "description": "",
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Bigquery Table Snapshot"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": {
+ "upstreams": [
+ {
+ "auditStamp": {
+ "time": 1643871600000,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "type": "COPY"
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),age)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),age)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),email)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3",
+ "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3"
+ },
+ {
+ "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0",
+ "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": {
+ "upstreams": [
+ {
+ "auditStamp": {
+ "time": 1643871600000,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "type": "VIEW",
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29"
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "queryProperties",
+ "aspect": {
+ "json": {
+ "statement": {
+ "value": "CREATE VIEW `bigquery-dataset-1.view-1` AS\nSELECT\n email\nFROM `bigquery-dataset-1.table-1`",
+ "language": "SQL"
+ },
+ "source": "SYSTEM",
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "lastModified": {
+ "time": 1643871600000,
+ "actor": "urn:li:corpuser:_ingestion"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "querySubjects",
+ "aspect": {
+ "json": {
+ "subjects": [
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)"
+ },
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:bigquery"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetUsageStatistics",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1643760000000,
+ "eventGranularity": {
+ "unit": "DAY",
+ "multiple": 1
+ },
+ "partitionSpec": {
+ "partition": "FULL_TABLE_SNAPSHOT",
+ "type": "FULL_TABLE"
+ },
+ "uniqueUserCount": 0,
+ "totalSqlQueries": 0,
+ "topSqlQueries": [],
+ "userCounts": [],
+ "fieldCounts": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetUsageStatistics",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1643760000000,
+ "eventGranularity": {
+ "unit": "DAY",
+ "multiple": 1
+ },
+ "partitionSpec": {
+ "partition": "FULL_TABLE_SNAPSHOT",
+ "type": "FULL_TABLE"
+ },
+ "uniqueUserCount": 0,
+ "totalSqlQueries": 0,
+ "topSqlQueries": [],
+ "userCounts": [],
+ "fieldCounts": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "datasetUsageStatistics",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1643760000000,
+ "eventGranularity": {
+ "unit": "DAY",
+ "multiple": 1
+ },
+ "partitionSpec": {
+ "partition": "FULL_TABLE_SNAPSHOT",
+ "type": "FULL_TABLE"
+ },
+ "uniqueUserCount": 0,
+ "totalSqlQueries": 0,
+ "topSqlQueries": [],
+ "userCounts": [],
+ "fieldCounts": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "tag",
+ "entityUrn": "urn:li:tag:Test Policy Tag",
+ "changeType": "UPSERT",
+ "aspectName": "tagKey",
+ "aspect": {
+ "json": {
+ "name": "Test Policy Tag"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1643871600000,
+ "runId": "bigquery-2022_02_03-07_00_00-k4o1z9",
+ "lastRunId": "no-run-id-provided"
+ }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
index 1f14688636161..2dd320041a113 100644
--- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
+++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
@@ -4,6 +4,7 @@
from typing import Any, Dict, Optional
from unittest.mock import MagicMock, patch
+import pytest
from freezegun import freeze_time
from google.cloud.bigquery.table import TableListItem
@@ -577,3 +578,147 @@ def test_bigquery_queries_v2_lineage_usage_ingest(
output_path=mcp_output_path,
golden_path=mcp_golden_path,
)
+
+
+@freeze_time(FROZEN_TIME)  # clock is pinned to FROZEN_TIME (defined outside this block)
+@patch.object(BigQuerySchemaApi, "get_snapshots_for_dataset")
+@patch.object(BigQuerySchemaApi, "get_views_for_dataset")
+@patch.object(BigQuerySchemaApi, "get_tables_for_dataset")
+@patch.object(BigQuerySchemaGenerator, "get_core_table_details")
+@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id")
+@patch.object(BigQuerySchemaApi, "get_columns_for_dataset")
+@patch.object(BigQueryDataReader, "get_sample_data_for_table")
+@patch("google.cloud.bigquery.Client")
+@patch("google.cloud.datacatalog_v1.PolicyTagManagerClient")
+@patch("google.cloud.resourcemanager_v3.ProjectsClient")  # stacked patch mocks are injected bottom-up: this one is the 1st positional argument
+@pytest.mark.parametrize(
+ "use_queries_v2, include_table_lineage, include_usage_statistics, golden_file",
+ [  # for a given use_queries_v2 value, both include_table_lineage settings are compared against the same golden file
+ (True, False, False, "bigquery_mcp_lineage_golden_1.json"),  # use_queries_v2 on, usage statistics off
+ (True, True, False, "bigquery_mcp_lineage_golden_1.json"),
+ (False, False, True, "bigquery_mcp_lineage_golden_2.json"),  # use_queries_v2 off, usage statistics on
+ (False, True, True, "bigquery_mcp_lineage_golden_2.json"),
+ ],
+)
+def test_bigquery_lineage_v2_ingest_view_snapshots(  # runs the pipeline over one mocked table, view and snapshot and compares the output with a golden file
+ client,  # NOTE(review): by decorator order this slot receives the ProjectsClient mock; unused in the body
+ policy_tag_manager_client,  # unused in the body
+ projects_client,  # NOTE(review): by decorator order this slot receives the google.cloud.bigquery.Client mock; unused in the body
+ get_sample_data_for_table,
+ get_columns_for_dataset,
+ get_datasets_for_project_id,
+ get_core_table_details,
+ get_tables_for_dataset,
+ get_views_for_dataset,
+ get_snapshots_for_dataset,
+ pytestconfig,
+ tmp_path,
+ use_queries_v2,
+ include_table_lineage,
+ include_usage_statistics,
+ golden_file,
+):
+ test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2"
+ mcp_golden_path = f"{test_resources_dir}/{golden_file}"
+ mcp_output_path = "{}/{}_output.json".format(tmp_path, golden_file)  # output file lives under pytest's tmp_path
+
+ dataset_name = "bigquery-dataset-1"
+ get_datasets_for_project_id.return_value = [
+ BigqueryDataset(name=dataset_name, location="US")
+ ]
+
+ table_list_item = TableListItem(  # placeholder item with empty project/dataset/table ids
+ {"tableReference": {"projectId": "", "datasetId": "", "tableId": ""}}
+ )
+ table_name = "table-1"
+ snapshot_table_name = "snapshot-table-1"
+ view_name = "view-1"
+ get_core_table_details.return_value = {table_name: table_list_item}
+ columns = [  # the same two columns are reused below for the table, the snapshot and the view
+ BigqueryColumn(
+ name="age",
+ ordinal_position=1,
+ is_nullable=False,
+ field_path="col_1",  # NOTE(review): the golden file shows fieldPath "age", so this value does not appear to surface there
+ data_type="INT",
+ comment="comment",
+ is_partition_column=False,
+ cluster_column_position=None,
+ policy_tags=["Test Policy Tag"],
+ ),
+ BigqueryColumn(
+ name="email",
+ ordinal_position=1,  # NOTE(review): same ordinal_position as "age" above; confirm this is intentional
+ is_nullable=False,
+ field_path="col_2",
+ data_type="STRING",
+ comment="comment",
+ is_partition_column=False,
+ cluster_column_position=None,
+ ),
+ ]
+
+ get_columns_for_dataset.return_value = {
+ table_name: columns,
+ snapshot_table_name: columns,
+ view_name: columns,
+ }
+ get_sample_data_for_table.return_value = {  # random values; presumably unused because classification is disabled below (TODO confirm)
+ "age": [random.randint(1, 80) for i in range(20)],
+ "email": [random_email() for i in range(20)],
+ }
+
+ bigquery_table = BigqueryTable(
+ name=table_name,
+ comment=None,
+ created=None,
+ last_altered=None,
+ size_in_bytes=None,
+ rows_count=None,
+ )
+ get_tables_for_dataset.return_value = iter([bigquery_table])  # one-shot iterator: exhausted after a single pass
+
+ bigquery_view = BigqueryView(
+ name=view_name,
+ comment=None,
+ created=None,
+ view_definition=f"create view `{dataset_name}.view-1` as select email from `{dataset_name}.table-1`",  # the view selects only "email" from table-1
+ last_altered=None,
+ size_in_bytes=None,
+ rows_count=None,
+ materialized=False,
+ )
+
+ get_views_for_dataset.return_value = iter([bigquery_view])  # one-shot iterator, like the tables mock
+ snapshot_table = BigqueryTableSnapshot(
+ name=snapshot_table_name,
+ comment=None,
+ created=None,
+ last_altered=None,
+ size_in_bytes=None,
+ rows_count=None,
+ base_table_identifier=BigqueryTableIdentifier(  # the snapshot's base table is project-id-1.bigquery-dataset-1.table-1
+ project_id="project-id-1",
+ dataset="bigquery-dataset-1",
+ table="table-1",
+ ),
+ )
+ get_snapshots_for_dataset.return_value = iter([snapshot_table])  # one-shot iterator, like the tables mock
+
+ pipeline_config_dict: Dict[str, Any] = recipe(  # recipe() is a helper defined outside this block
+ mcp_output_path=mcp_output_path,
+ source_config_override={
+ "use_queries_v2": use_queries_v2,
+ "include_table_lineage": include_table_lineage,
+ "include_usage_statistics": include_usage_statistics,
+ "classification": {"enabled": False},
+ },
+ )
+
+ run_and_get_pipeline(pipeline_config_dict)  # helper defined outside this block
+
+ mce_helpers.check_golden_file(  # compares the file at mcp_output_path with the selected golden file
+ pytestconfig,
+ output_path=mcp_output_path,
+ golden_path=mcp_golden_path,
+ )
From 7f64ffd2f7541900bbcd2b7b5b3f6dde237a8667 Mon Sep 17 00:00:00 2001
From: sagar-salvi-apptware
<159135491+sagar-salvi-apptware@users.noreply.github.com>
Date: Thu, 2 Jan 2025 18:44:45 +0530
Subject: [PATCH 2/8] test(ingest/athena): add connector integration tests
(#12256)
---
.../integration/athena/athena_mce_golden.json | 1362 +++++++++++++++++
.../integration/athena/test_athena_source.py | 163 ++
2 files changed, 1525 insertions(+)
create mode 100644 metadata-ingestion/tests/integration/athena/athena_mce_golden.json
create mode 100644 metadata-ingestion/tests/integration/athena/test_athena_source.py
diff --git a/metadata-ingestion/tests/integration/athena/athena_mce_golden.json b/metadata-ingestion/tests/integration/athena/athena_mce_golden.json
new file mode 100644
index 0000000000000..1b3fdb0bdb253
--- /dev/null
+++ b/metadata-ingestion/tests/integration/athena/athena_mce_golden.json
@@ -0,0 +1,1362 @@
+[
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "changeType": "UPSERT",
+ "aspectName": "containerProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "platform": "athena",
+ "env": "PROD",
+ "database": "test_schema"
+ },
+ "name": "test_schema",
+ "env": "PROD"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:athena"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Database"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "proposedSnapshot": {
+ "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
+ "urn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)",
+ "aspects": [
+ {
+ "com.linkedin.pegasus2avro.common.Status": {
+ "removed": false
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
+ "customProperties": {
+ "key": "value",
+ "table_type": "EXTERNAL_TABLE",
+ "is_view": "True",
+ "view_definition": "CREATE VIEW \"test_schema\".test_view_1 AS\nSELECT *\nFROM\n \"test_schema\".\"test_table\""
+ },
+ "name": "test_table",
+ "description": "Test table description",
+ "tags": []
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.schema.SchemaMetadata": {
+ "schemaName": "test_schema.test_table",
+ "platform": "urn:li:dataPlatform:athena",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.pegasus2avro.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "[version=2.0].[type=string].employee_id",
+ "nullable": false,
+ "description": "Unique identifier for the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=long].annual_salary",
+ "nullable": true,
+ "description": "Annual salary of the employee in USD",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "BIGINT",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"BIGINT\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=string].employee_name",
+ "nullable": false,
+ "description": "Full name of the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history",
+ "nullable": true,
+ "description": "Job history map: year to details (company, role)",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.MapType": {
+ "keyType": "string",
+ "valueType": "record"
+ }
+ }
+ },
+ "nativeDataType": "MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=int].year",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INTEGER",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"INTEGER\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].company",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].role",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=long].department_budgets",
+ "nullable": true,
+ "description": "Map of department names to their respective budgets",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.MapType": {
+ "keyType": "string",
+ "valueType": "long"
+ }
+ }
+ },
+ "nativeDataType": "MapType(String(), BIGINT())",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"MapType(String(), BIGINT())\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].skills",
+ "nullable": true,
+ "description": "List of skills possessed by the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.ArrayType": {
+ "nestedType": [
+ "string"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"array\"}"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Table"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "urn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "proposedSnapshot": {
+ "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
+ "urn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)",
+ "aspects": [
+ {
+ "com.linkedin.pegasus2avro.common.Status": {
+ "removed": false
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
+ "customProperties": {
+ "key": "value",
+ "table_type": "EXTERNAL_TABLE",
+ "is_view": "True",
+ "view_definition": "CREATE VIEW \"test_schema\".test_view_2 AS\nSELECT employee_id, employee_name, skills\nFROM\n \"test_schema\".\"test_view_1\""
+ },
+ "name": "test_view_1",
+ "description": "Test table description",
+ "tags": []
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.schema.SchemaMetadata": {
+ "schemaName": "test_schema.test_view_1",
+ "platform": "urn:li:dataPlatform:athena",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.pegasus2avro.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "[version=2.0].[type=string].employee_id",
+ "nullable": false,
+ "description": "Unique identifier for the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=long].annual_salary",
+ "nullable": true,
+ "description": "Annual salary of the employee in USD",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "BIGINT",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"BIGINT\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=string].employee_name",
+ "nullable": false,
+ "description": "Full name of the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history",
+ "nullable": true,
+ "description": "Job history map: year to details (company, role)",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.MapType": {
+ "keyType": "string",
+ "valueType": "record"
+ }
+ }
+ },
+ "nativeDataType": "MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=int].year",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INTEGER",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"INTEGER\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].company",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].role",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=long].department_budgets",
+ "nullable": true,
+ "description": "Map of department names to their respective budgets",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.MapType": {
+ "keyType": "string",
+ "valueType": "long"
+ }
+ }
+ },
+ "nativeDataType": "MapType(String(), BIGINT())",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"MapType(String(), BIGINT())\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].skills",
+ "nullable": true,
+ "description": "List of skills possessed by the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.ArrayType": {
+ "nestedType": [
+ "string"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"array\"}"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "View"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "viewProperties",
+ "aspect": {
+ "json": {
+ "materialized": false,
+ "viewLogic": "CREATE VIEW \"test_schema\".test_view_1 AS\nSELECT *\nFROM\n \"test_schema\".\"test_table\"",
+ "viewLanguage": "SQL"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "urn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "proposedSnapshot": {
+ "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
+ "urn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)",
+ "aspects": [
+ {
+ "com.linkedin.pegasus2avro.common.Status": {
+ "removed": false
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
+ "customProperties": {
+ "key": "value",
+ "table_type": "EXTERNAL_TABLE",
+ "is_view": "True",
+ "view_definition": "CREATE VIEW \"test_schema\".test_view_2 AS\nSELECT employee_id, employee_name, skills\nFROM\n \"test_schema\".\"test_view_1\""
+ },
+ "name": "test_view_2",
+ "description": "Test table description",
+ "tags": []
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.schema.SchemaMetadata": {
+ "schemaName": "test_schema.test_view_2",
+ "platform": "urn:li:dataPlatform:athena",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.pegasus2avro.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "[version=2.0].[type=string].employee_id",
+ "nullable": false,
+ "description": "Unique identifier for the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=long].annual_salary",
+ "nullable": true,
+ "description": "Annual salary of the employee in USD",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "BIGINT",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"BIGINT\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=string].employee_name",
+ "nullable": false,
+ "description": "Full name of the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history",
+ "nullable": true,
+ "description": "Job history map: year to details (company, role)",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.MapType": {
+ "keyType": "string",
+ "valueType": "record"
+ }
+ }
+ },
+ "nativeDataType": "MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=int].year",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "INTEGER",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"INTEGER\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].company",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].role",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "VARCHAR",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=map].[type=long].department_budgets",
+ "nullable": true,
+ "description": "Map of department names to their respective budgets",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.MapType": {
+ "keyType": "string",
+ "valueType": "long"
+ }
+ }
+ },
+ "nativeDataType": "MapType(String(), BIGINT())",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"MapType(String(), BIGINT())\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].skills",
+ "nullable": true,
+ "description": "List of skills possessed by the employee",
+ "type": {
+ "type": {
+ "com.linkedin.pegasus2avro.schema.ArrayType": {
+ "nestedType": [
+ "string"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array",
+ "recursive": false,
+ "isPartOfKey": false,
+ "isPartitioningKey": false,
+ "jsonProps": "{\"native_data_type\": \"array\"}"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "View"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "viewProperties",
+ "aspect": {
+ "json": {
+ "materialized": false,
+ "viewLogic": "CREATE VIEW \"test_schema\".test_view_2 AS\nSELECT employee_id, employee_name, skills\nFROM\n \"test_schema\".\"test_view_1\"",
+ "viewLanguage": "SQL"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67",
+ "urn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": {
+ "upstreams": [
+ {
+ "auditStamp": {
+ "time": 1671098400000,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD)",
+ "type": "COPY"
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),employee_id)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),annual_salary)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),annual_salary)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),employee_name)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_name)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),job_history)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),job_history)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),department_budgets)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),department_budgets)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),skills)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),skills)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": {
+ "upstreams": [
+ {
+ "auditStamp": {
+ "time": 1671098400000,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)",
+ "type": "VIEW",
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29"
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_id)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29"
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),annual_salary)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),annual_salary)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29"
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_name)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29"
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),job_history)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),job_history)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29"
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),department_budgets)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),department_budgets)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29"
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),skills)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "queryProperties",
+ "aspect": {
+ "json": {
+ "statement": {
+ "value": "CREATE VIEW \"test_schema\".test_view_1 AS\nSELECT\n *\nFROM \"test_schema\".\"test_table\"",
+ "language": "SQL"
+ },
+ "source": "SYSTEM",
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "lastModified": {
+ "time": 1671098400000,
+ "actor": "urn:li:corpuser:_ingestion"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "querySubjects",
+ "aspect": {
+ "json": {
+ "subjects": [
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),annual_salary)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),department_budgets)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_id)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_name)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),job_history)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),skills)"
+ },
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),annual_salary)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),job_history)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),department_budgets)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:athena"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": {
+ "upstreams": [
+ {
+ "auditStamp": {
+ "time": 1671098400000,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)",
+ "type": "VIEW",
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29"
+ }
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_id)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29"
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_name)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29"
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),skills)"
+ ],
+ "confidenceScore": 0.9,
+ "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "queryProperties",
+ "aspect": {
+ "json": {
+ "statement": {
+ "value": "CREATE VIEW \"test_schema\".test_view_2 AS\nSELECT\n employee_id,\n employee_name,\n skills\nFROM \"test_schema\".\"test_view_1\"",
+ "language": "SQL"
+ },
+ "source": "SYSTEM",
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:_ingestion"
+ },
+ "lastModified": {
+ "time": 1671098400000,
+ "actor": "urn:li:corpuser:_ingestion"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "querySubjects",
+ "aspect": {
+ "json": {
+ "subjects": [
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)"
+ },
+ {
+ "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_id)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_name)"
+ },
+ {
+ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),skills)"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:athena"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "query",
+ "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1671098400000,
+ "runId": "athena-test",
+ "lastRunId": "no-run-id-provided"
+ }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/athena/test_athena_source.py b/metadata-ingestion/tests/integration/athena/test_athena_source.py
new file mode 100644
index 0000000000000..56e7cbe6b3e2d
--- /dev/null
+++ b/metadata-ingestion/tests/integration/athena/test_athena_source.py
@@ -0,0 +1,163 @@
+from unittest.mock import MagicMock, patch
+
+from freezegun import freeze_time
+from sqlalchemy import ARRAY, BIGINT, INTEGER, String
+from sqlalchemy_bigquery import STRUCT
+
+from datahub.ingestion.run.pipeline import Pipeline
+from datahub.ingestion.source.aws.s3_util import make_s3_urn
+from datahub.ingestion.source.sql.athena import AthenaSource
+from datahub.utilities.sqlalchemy_type_converter import MapType
+from tests.test_helpers import ( # Ensure mce_helpers is available for validation.
+ mce_helpers,
+)
+
+FROZEN_TIME = "2022-12-15 10:00:00"
+
+
+@freeze_time(FROZEN_TIME)
+def test_athena_source_ingestion(pytestconfig, tmp_path):
+ """Run an Athena ingestion pipeline over mocked metadata and compare its output file with the golden file."""
+ output_file_name = "athena_mce_output.json"
+ golden_file_name = "athena_mce_golden.json"
+ test_resources_dir = pytestconfig.rootpath / "tests/integration/athena"
+
+ # Patch AthenaSource.get_inspectors and get_table_properties so the test supplies their results.
+ with patch.object(
+ AthenaSource, "get_inspectors"
+ ) as mock_get_inspectors, patch.object(
+ AthenaSource, "get_table_properties"
+ ) as mock_get_table_properties:
+ # A single mocked inspector whose engine reports an empty database name.
+ mock_inspector = MagicMock()
+ mock_get_inspectors.return_value = [mock_inspector]
+ mock_engine_instance = MagicMock()
+ mock_engine_instance.url.database = ""
+ mock_inspector.engine = mock_engine_instance
+
+ # One schema containing one table and two views.
+ mock_inspector.get_schema_names.return_value = ["test_schema"]
+ mock_inspector.get_table_names.return_value = ["test_table"]
+ mock_inspector.get_view_names.return_value = ["test_view_1", "test_view_2"]
+
+ # View DDL: test_view_1 selects * from test_table; test_view_2 selects three columns from test_view_1.
+ def mock_get_view_definition(view_name, schema):
+ if view_name == "test_view_1":
+ return (
+ 'CREATE VIEW "test_schema".test_view_1 AS\n'
+ "SELECT *\n"
+ "FROM\n"
+ ' "test_schema"."test_table"'
+ )
+ elif view_name == "test_view_2":
+ return (
+ 'CREATE VIEW "test_schema".test_view_2 AS\n'
+ "SELECT employee_id, employee_name, skills\n"
+ "FROM\n"
+ ' "test_schema"."test_view_1"'
+ )
+ return ""
+
+ mock_inspector.get_view_definition.side_effect = mock_get_view_definition
+ # get_columns returns the same six columns on every call, whichever table or view is requested.
+ mock_inspector.get_columns.return_value = [
+ {
+ "name": "employee_id",
+ "type": String(),
+ "nullable": False,
+ "default": None,
+ "autoincrement": False,
+ "comment": "Unique identifier for the employee",
+ "dialect_options": {"awsathena_partition": None},
+ },
+ {
+ "name": "annual_salary",
+ "type": BIGINT(),
+ "nullable": True,
+ "default": None,
+ "autoincrement": False,
+ "comment": "Annual salary of the employee in USD",
+ "dialect_options": {"awsathena_partition": None},
+ },
+ {
+ "name": "employee_name",
+ "type": String(),
+ "nullable": False,
+ "default": None,
+ "autoincrement": False,
+ "comment": "Full name of the employee",
+ "dialect_options": {"awsathena_partition": None},
+ },
+ {
+ "name": "job_history",
+ "type": MapType(
+ String(), STRUCT(year=INTEGER(), company=String(), role=String())
+ ),
+ "nullable": True,
+ "default": None,
+ "autoincrement": False,
+ "comment": "Job history map: year to details (company, role)",
+ "dialect_options": {"awsathena_partition": None},
+ },
+ {
+ "name": "department_budgets",
+ "type": MapType(String(), BIGINT()),
+ "nullable": True,
+ "default": None,
+ "autoincrement": False,
+ "comment": "Map of department names to their respective budgets",
+ "dialect_options": {"awsathena_partition": None},
+ },
+ {
+ "name": "skills",
+ "type": ARRAY(String()),
+ "nullable": True,
+ "default": None,
+ "autoincrement": False,
+ "comment": "List of skills possessed by the employee",
+ "dialect_options": {"awsathena_partition": None},
+ },
+ ]
+ # get_table_properties returns one fixed tuple: description, properties dict, make_s3_urn(...) result.
+ mock_get_table_properties.return_value = (
+ "Test table description",
+ {"key": "value", "table_type": "EXTERNAL_TABLE"},
+ make_s3_urn("s3://test-bucket/test_table", "PROD"),
+ )
+
+ # Athena source with profiling disabled, writing to a file sink under tmp_path.
+ config_dict = {
+ "run_id": "athena-test",
+ "source": {
+ "type": "athena",
+ "config": {
+ "aws_region": "us-east-1",
+ "work_group": "primary",
+ "query_result_location": "s3://athena-query-results/",
+ "catalog_name": "awsdatacatalog",
+ "include_views": True,
+ "include_tables": True,
+ "profiling": {
+ "enabled": False,
+ },
+ },
+ },
+ "sink": {
+ "type": "file",
+ "config": {
+ "filename": f"{tmp_path}/{output_file_name}",
+ },
+ },
+ }
+
+ # Create and run the pipeline
+ pipeline = Pipeline.create(config_dict)
+ pipeline.run()
+ pipeline.raise_from_status()
+
+ # Validate the output with the golden file
+ mce_helpers.check_golden_file(
+ pytestconfig=pytestconfig,
+ output_path=f"{tmp_path}/{output_file_name}",
+ golden_path=f"{test_resources_dir}/{golden_file_name}",
+ )
From ccf5fc708f918de84019d280bd8dc795c19f09e1 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 2 Jan 2025 11:53:31 -0500
Subject: [PATCH 3/8] chore(ingest): refactor common pytest args (#12240)
---
metadata-ingestion/build.gradle | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle
index fc1409fbed74e..ac8658bd86927 100644
--- a/metadata-ingestion/build.gradle
+++ b/metadata-ingestion/build.gradle
@@ -127,6 +127,9 @@ task lintFix(type: Exec, dependsOn: installDev) {
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
}
+def pytest_default_env = "PYTHONDEVMODE=1"
+def pytest_default_args = "--durations=30 -vv --continue-on-collection-errors"
+
task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJsonSchema']) {
// We can't enforce the coverage requirements if we run a subset of the tests.
inputs.files(project.fileTree(dir: "src/", include: "**/*.py"))
@@ -135,7 +138,7 @@ task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJso
def cvg_arg = get_coverage_args("quick")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
- "pytest ${cvg_arg} tests/unit --random-order --durations=20 -m 'not integration' -vv --continue-on-collection-errors --junit-xml=junit.quick.xml"
+ "${pytest_default_env} pytest ${cvg_arg} tests/unit ${pytest_default_args} --random-order -m 'not integration' --junit-xml=junit.quick.xml"
}
task installDevTest(type: Exec, dependsOn: [install]) {
@@ -155,7 +158,7 @@ task testSingle(dependsOn: [installDevTest]) {
if (testFile != 'unknown') {
exec {
commandLine 'bash', '-c',
- "source ${venv_name}/bin/activate && pytest ${testFile}"
+ "source ${venv_name}/bin/activate && ${pytest_default_env} pytest ${testFile} ${pytest_default_args}"
}
} else {
throw new GradleException("No file provided. Use -PtestFile=")
@@ -167,25 +170,25 @@ task testIntegrationBatch0(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_args("intBatch0")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
- "pytest ${cvg_arg} --durations=50 -m 'integration_batch_0' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch0.xml"
+ "${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_0' --junit-xml=junit.integrationbatch0.xml"
}
task testIntegrationBatch1(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_args("intBatch1")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
- "pytest ${cvg_arg} --durations=50 -m 'integration_batch_1' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch1.xml"
+ "${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_1' --junit-xml=junit.integrationbatch1.xml"
}
task testIntegrationBatch2(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_args("intBatch2")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
- "pytest ${cvg_arg} --durations=20 -m 'integration_batch_2' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch2.xml"
+ "${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_2' --junit-xml=junit.integrationbatch2.xml"
}
task testFull(type: Exec, dependsOn: [installDevTest]) {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
- "pytest --durations=50 -vv --continue-on-collection-errors --junit-xml=junit.full.xml"
+ "${pytest_default_env} pytest ${pytest_default_args} --junit-xml=junit.full.xml"
}
task specGen(type: Exec, dependsOn: [codegen, installDevTest]) {
From bdc34b7b35aa5b707a9b4d57d2842c8c3727b712 Mon Sep 17 00:00:00 2001
From: Pedro Silva
Date: Thu, 2 Jan 2025 17:28:10 +0000
Subject: [PATCH 4/8] fix(sample data): Update timestamps in bootstrap_mce.json
to more recent (#12257)
---
metadata-ingestion/examples/mce_files/bootstrap_mce.json | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/metadata-ingestion/examples/mce_files/bootstrap_mce.json b/metadata-ingestion/examples/mce_files/bootstrap_mce.json
index bc218e5e8c2d5..d4e3d3aa5d8c4 100644
--- a/metadata-ingestion/examples/mce_files/bootstrap_mce.json
+++ b/metadata-ingestion/examples/mce_files/bootstrap_mce.json
@@ -3394,7 +3394,7 @@
"changeType":"UPSERT",
"aspectName":"datasetProfile",
"aspect":{
- "value":"{\"timestampMillis\": 1723488954865, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}",
+ "value":"{\"timestampMillis\": 1735823280000, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}",
"contentType":"application/json"
},
"systemMetadata":null
@@ -3418,7 +3418,7 @@
"changeType":"UPSERT",
"aspectName":"operation",
"aspect":{
- "value":"{\"timestampMillis\": 1679515693000, \"operationType\": \"INSERT\", \"lastUpdatedTimestamp\": 1629097200001 }",
+ "value":"{\"timestampMillis\": 1711138093000, \"operationType\": \"INSERT\", \"lastUpdatedTimestamp\": 1629097200001 }",
"contentType":"application/json"
},
"systemMetadata":null
@@ -3584,7 +3584,7 @@
"changeType": "UPSERT",
"aspectName": "assertionRunEvent",
"aspect": {
- "value": "{\"timestampMillis\": 1675155843000, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"{\\\"category\\\": \\\"catA\\\"}\"}, \"runId\": \"2021-12-28T12:00:00Z\", \"assertionUrn\": \"urn:li:assertion:358c683782c93c2fc2bd4bdd4fdb0153\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)\", \"batchSpec\": {\"customProperties\": {\"data_asset_name\": \"data__foo1__asset\", \"datasource_name\": \"my_hive_datasource\"}, \"nativeBatchId\": \"c8f12129f2e57412eee5fb8656154d05\", \"limit\": 10}, \"status\": \"COMPLETE\", \"result\": {\"type\": \"SUCCESS\", \"nativeResults\": {}}}",
+ "value": "{\"timestampMillis\": 1730554659000, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"{\\\"category\\\": \\\"catA\\\"}\"}, \"runId\": \"2021-12-28T12:00:00Z\", \"assertionUrn\": \"urn:li:assertion:358c683782c93c2fc2bd4bdd4fdb0153\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)\", \"batchSpec\": {\"customProperties\": {\"data_asset_name\": \"data__foo1__asset\", \"datasource_name\": \"my_hive_datasource\"}, \"nativeBatchId\": \"c8f12129f2e57412eee5fb8656154d05\", \"limit\": 10}, \"status\": \"COMPLETE\", \"result\": {\"type\": \"SUCCESS\", \"nativeResults\": {}}}",
"contentType": "application/json"
},
"systemMetadata": null
From f396d8d87a6b6567874340d530bbda966fda684e Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 2 Jan 2025 15:36:07 -0500
Subject: [PATCH 5/8] refactor(sdk/patch): improve patch implementation
internals (#12253)
---
.github/workflows/airflow-plugin.yml | 4 +-
.github/workflows/metadata-ingestion.yml | 9 +-
.github/workflows/prefect-plugin.yml | 4 +-
.../src/datahub/emitter/mce_builder.py | 6 +-
.../src/datahub/emitter/mcp_patch_builder.py | 48 +++-
.../specific/aspect_helpers/__init__.py | 0
.../aspect_helpers/custom_properties.py | 79 ++++++
.../specific/aspect_helpers/ownership.py | 67 ++++++
.../aspect_helpers/structured_properties.py | 72 ++++++
.../datahub/specific/aspect_helpers/tags.py | 42 ++++
.../datahub/specific/aspect_helpers/terms.py | 43 ++++
.../src/datahub/specific/chart.py | 212 +++-------------
.../src/datahub/specific/custom_properties.py | 37 ---
.../src/datahub/specific/dashboard.py | 227 +++---------------
.../src/datahub/specific/datajob.py | 223 +++--------------
.../src/datahub/specific/dataproduct.py | 110 ++-------
.../src/datahub/specific/dataset.py | 181 ++++----------
.../src/datahub/specific/form.py | 44 +---
.../src/datahub/specific/ownership.py | 48 ----
.../datahub/specific/structured_properties.py | 53 ----
.../datahub/specific/structured_property.py | 18 +-
21 files changed, 535 insertions(+), 992 deletions(-)
create mode 100644 metadata-ingestion/src/datahub/specific/aspect_helpers/__init__.py
create mode 100644 metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py
create mode 100644 metadata-ingestion/src/datahub/specific/aspect_helpers/ownership.py
create mode 100644 metadata-ingestion/src/datahub/specific/aspect_helpers/structured_properties.py
create mode 100644 metadata-ingestion/src/datahub/specific/aspect_helpers/tags.py
create mode 100644 metadata-ingestion/src/datahub/specific/aspect_helpers/terms.py
delete mode 100644 metadata-ingestion/src/datahub/specific/custom_properties.py
delete mode 100644 metadata-ingestion/src/datahub/specific/ownership.py
delete mode 100644 metadata-ingestion/src/datahub/specific/structured_properties.py
diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index 26fcceb8aeab7..b824a21be63f8 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -84,8 +84,8 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./build/coverage-reports/
fail_ci_if_error: false
- flags: airflow,airflow-${{ matrix.extra_pip_extras }}
- name: pytest-airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_requirements }}
+ flags: airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_extras }}
+ name: pytest-airflow
verbose: true
event-file:
diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml
index 106cba1473982..f4d87b361b5ed 100644
--- a/.github/workflows/metadata-ingestion.yml
+++ b/.github/workflows/metadata-ingestion.yml
@@ -41,9 +41,6 @@ jobs:
"testIntegrationBatch1",
"testIntegrationBatch2",
]
- include:
- - python-version: "3.8"
- - python-version: "3.11"
fail-fast: false
steps:
- name: Free up disk space
@@ -92,14 +89,14 @@ jobs:
**/junit.*.xml
!**/binary/**
- name: Upload coverage to Codecov
- if: ${{ always() && matrix.python-version == '3.10' }}
+ if: ${{ always() }}
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./build/coverage-reports/
fail_ci_if_error: false
- flags: pytest-${{ matrix.command }}
- name: pytest-${{ matrix.python-version }}-${{ matrix.command }}
+ flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }}
+ name: pytest-ingestion
verbose: true
event-file:
diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml
index d77142a1f00de..879df032409f2 100644
--- a/.github/workflows/prefect-plugin.yml
+++ b/.github/workflows/prefect-plugin.yml
@@ -67,8 +67,8 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./build/coverage-reports/
fail_ci_if_error: false
- flags: prefect,prefect-${{ matrix.python-version }}
- name: pytest-prefect-${{ matrix.python-version }}
+ flags: prefect-${{ matrix.python-version }}
+ name: pytest-prefect
verbose: true
event-file:
diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py
index 110624aa61cb8..f095fffbaea6b 100644
--- a/metadata-ingestion/src/datahub/emitter/mce_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py
@@ -24,6 +24,7 @@
import typing_inspect
from avrogen.dict_wrapper import DictWrapper
+from typing_extensions import assert_never
from datahub.emitter.enum_helpers import get_enum_options
from datahub.metadata.schema_classes import (
@@ -269,9 +270,8 @@ def make_owner_urn(owner: str, owner_type: OwnerType) -> str:
return make_user_urn(owner)
elif owner_type == OwnerType.GROUP:
return make_group_urn(owner)
- # This should pretty much never happen.
- # TODO: With Python 3.11, we can use typing.assert_never() here.
- return f"urn:li:{owner_type.value}:{owner}"
+ else:
+ assert_never(owner_type)
def make_ownership_type_urn(type: str) -> str:
diff --git a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py
index 1ed8ce1d5a615..17026a4114c12 100644
--- a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py
@@ -2,7 +2,19 @@
import time
from collections import defaultdict
from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Sequence, Union
+from typing import (
+ Any,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Protocol,
+ Tuple,
+ Union,
+ runtime_checkable,
+)
+
+from typing_extensions import LiteralString
from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
from datahub.emitter.serialization_helper import pre_json_transform
@@ -19,25 +31,36 @@
from datahub.utilities.urns.urn import guess_entity_type
+@runtime_checkable
+class SupportsToObj(Protocol):
+ def to_obj(self) -> Any:
+ ...
+
+
def _recursive_to_obj(obj: Any) -> Any:
if isinstance(obj, list):
return [_recursive_to_obj(v) for v in obj]
- elif hasattr(obj, "to_obj"):
+ elif isinstance(obj, SupportsToObj):
return obj.to_obj()
else:
return obj
+PatchPath = Tuple[Union[LiteralString, Urn], ...]
+PatchOp = Literal["add", "remove", "replace"]
+
+
@dataclass
-class _Patch:
- op: str # one of ['add', 'remove', 'replace']; we don't support move, copy or test
- path: str
+class _Patch(SupportsToObj):
+ op: PatchOp
+ path: PatchPath
value: Any
def to_obj(self) -> Dict:
+ quoted_path = "/" + "/".join(MetadataPatchProposal.quote(p) for p in self.path)
return {
"op": self.op,
- "path": self.path,
+ "path": quoted_path,
"value": _recursive_to_obj(self.value),
}
@@ -63,15 +86,16 @@ def __init__(
# Json Patch quoting based on https://jsonpatch.com/#json-pointer
@classmethod
- def quote(cls, value: str) -> str:
- return value.replace("~", "~0").replace("/", "~1")
+ def quote(cls, value: Union[str, Urn]) -> str:
+ return str(value).replace("~", "~0").replace("/", "~1")
def _add_patch(
- self, aspect_name: str, op: str, path: Union[str, Sequence[str]], value: Any
+ self,
+ aspect_name: str,
+ op: PatchOp,
+ path: PatchPath,
+ value: Any,
) -> None:
- if not isinstance(path, str):
- path = "/" + "/".join(self.quote(p) for p in path)
-
# TODO: Validate that aspectName is a valid aspect for this entityType
self.patches[aspect_name].append(_Patch(op, path, value))
diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/__init__.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py
new file mode 100644
index 0000000000000..1fd1585a91358
--- /dev/null
+++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py
@@ -0,0 +1,79 @@
+from abc import abstractmethod
+from typing import Dict, Optional, Tuple
+
+from typing_extensions import Self
+
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
+
+
+class HasCustomPropertiesPatch(MetadataPatchProposal):
+ @classmethod
+ @abstractmethod
+ def _custom_properties_location(self) -> Tuple[str, PatchPath]:
+ ...
+
+ def add_custom_property(self, key: str, value: str) -> Self:
+ """Add a custom property to the entity.
+
+ Args:
+ key: The key of the custom property.
+ value: The value of the custom property.
+
+ Returns:
+ The patch builder instance.
+ """
+ aspect_name, path = self._custom_properties_location()
+ self._add_patch(
+ aspect_name,
+ "add",
+ path=(*path, key),
+ value=value,
+ )
+ return self
+
+ def add_custom_properties(
+ self, custom_properties: Optional[Dict[str, str]] = None
+ ) -> Self:
+ if custom_properties is not None:
+ for key, value in custom_properties.items():
+ self.add_custom_property(key, value)
+ return self
+
+ def remove_custom_property(self, key: str) -> Self:
+ """Remove a custom property from the entity.
+
+ Args:
+ key: The key of the custom property to remove.
+
+ Returns:
+ The patch builder instance.
+ """
+ aspect_name, path = self._custom_properties_location()
+ self._add_patch(
+ aspect_name,
+ "remove",
+ path=(*path, key),
+ value={},
+ )
+ return self
+
+ def set_custom_properties(self, custom_properties: Dict[str, str]) -> Self:
+ """Sets the custom properties of the entity.
+
+ This method replaces all existing custom properties with the given dictionary.
+
+ Args:
+ custom_properties: A dictionary containing the custom properties to be set.
+
+ Returns:
+ The patch builder instance.
+ """
+
+ aspect_name, path = self._custom_properties_location()
+ self._add_patch(
+ aspect_name,
+ "add",
+ path=path,
+ value=custom_properties,
+ )
+ return self
diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/ownership.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/ownership.py
new file mode 100644
index 0000000000000..1e2c789c7def3
--- /dev/null
+++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/ownership.py
@@ -0,0 +1,67 @@
+from typing import List, Optional
+
+from typing_extensions import Self
+
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.metadata.schema_classes import (
+ OwnerClass,
+ OwnershipClass,
+ OwnershipTypeClass,
+)
+
+
+class HasOwnershipPatch(MetadataPatchProposal):
+ def add_owner(self, owner: OwnerClass) -> Self:
+ """Add an owner to the entity.
+
+ Args:
+ owner: The Owner object to add; its owner and type fields form the patch path.
+
+ Returns:
+ The patch builder instance.
+ """
+ self._add_patch(
+ OwnershipClass.ASPECT_NAME,
+ "add",
+ path=("owners", owner.owner, str(owner.type)),
+ value=owner,
+ )
+ return self
+
+ def remove_owner(
+ self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
+ ) -> Self:
+ """Remove an owner from the entity.
+
+ If owner_type is not provided, the path stops at the owner, so the owner is removed regardless of ownership type.
+
+ Args:
+ owner: The owner to remove; used as the path segment after "owners".
+ owner_type: The ownership type of the owner (optional); appended to the path when given.
+
+ Returns:
+ The patch builder instance.
+ """
+ self._add_patch(
+ OwnershipClass.ASPECT_NAME,
+ "remove",
+ path=("owners", owner) + ((str(owner_type),) if owner_type else ()),
+ value=owner,
+ )
+ return self
+
+ def set_owners(self, owners: List[OwnerClass]) -> Self:
+ """Set the owners of the entity.
+
+ This will effectively replace all existing owners with the new list - it doesn't really patch things.
+
+ Args:
+ owners: The list of owners to set; written as the value of the whole "owners" path.
+
+ Returns:
+ The patch builder instance.
+ """
+ self._add_patch(
+ OwnershipClass.ASPECT_NAME, "add", path=("owners",), value=owners
+ )
+ return self
diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/structured_properties.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/structured_properties.py
new file mode 100644
index 0000000000000..48050bbad8e50
--- /dev/null
+++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/structured_properties.py
@@ -0,0 +1,72 @@
+from typing import List, Union
+
+from typing_extensions import Self
+
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.metadata.schema_classes import (
+ StructuredPropertiesClass,
+ StructuredPropertyValueAssignmentClass,
+)
+from datahub.utilities.urns.structured_properties_urn import (
+ make_structured_property_urn,
+)
+
+
+class HasStructuredPropertiesPatch(MetadataPatchProposal):
+ def set_structured_property(
+ self, key: str, value: Union[str, float, List[Union[str, float]]]
+ ) -> Self:
+ """Set a structured property by adding a remove patch followed by an add patch.
+
+ Args:
+ key: the name of the property (either bare or urn form)
+ value: the value of the property (for multi-valued properties, this can be a list)
+
+ Returns:
+ The patch builder instance.
+ """
+ self.remove_structured_property(key)
+ self.add_structured_property(key, value)
+ return self
+
+ def remove_structured_property(self, key: str) -> Self:
+ """Remove a structured property.
+
+ Args:
+ key: the name of the property (either bare or urn form)
+
+ Returns:
+ The patch builder instance.
+ """
+
+ self._add_patch(
+ StructuredPropertiesClass.ASPECT_NAME,
+ "remove",
+ path=("properties", make_structured_property_urn(key)),
+ value={},
+ )
+ return self
+
+ def add_structured_property(
+ self, key: str, value: Union[str, float, List[Union[str, float]]]
+ ) -> Self:
+ """Add a structured property.
+
+ Args:
+ key: the name of the property (either bare or urn form)
+ value: the value of the property (for multi-valued properties, this value will be appended to the list)
+
+ Returns:
+ The patch builder instance.
+ """
+ # A scalar value is wrapped in a one-element list; a list is passed through unchanged.
+ self._add_patch(
+ StructuredPropertiesClass.ASPECT_NAME,
+ "add",
+ path=("properties", make_structured_property_urn(key)),
+ value=StructuredPropertyValueAssignmentClass(
+ propertyUrn=make_structured_property_urn(key),
+ values=value if isinstance(value, list) else [value],
+ ),
+ )
+ return self
diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/tags.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/tags.py
new file mode 100644
index 0000000000000..afbc9115ca6e2
--- /dev/null
+++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/tags.py
@@ -0,0 +1,42 @@
+from typing import Union
+
+from typing_extensions import Self
+
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.metadata.schema_classes import (
+ GlobalTagsClass as GlobalTags,
+ TagAssociationClass as Tag,
+)
+from datahub.metadata.urns import TagUrn, Urn
+
+
+class HasTagsPatch(MetadataPatchProposal):
+ def add_tag(self, tag: Tag) -> Self:
+ """Adds a tag to the entity.
+
+ Args:
+ tag: The Tag association to add; its `tag` field keys the patch path.
+
+ Returns:
+ The patch builder instance.
+ """
+
+ # TODO: Make this support raw strings, in addition to Tag objects.
+ self._add_patch(
+ GlobalTags.ASPECT_NAME, "add", path=("tags", tag.tag), value=tag
+ )
+ return self
+
+ def remove_tag(self, tag: Union[str, Urn]) -> Self:
+ """Removes a tag from the entity.
+
+ Args:
+ tag: The tag to remove: a Urn, a tag urn string, or a bare tag id.
+
+ Returns:
+ The patch builder instance.
+ """
+ if isinstance(tag, str) and not tag.startswith("urn:li:tag:"):
+ tag = TagUrn.create_from_id(tag)
+ self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=("tags", tag), value={})
+ return self
diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/terms.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/terms.py
new file mode 100644
index 0000000000000..ae199124372b4
--- /dev/null
+++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/terms.py
@@ -0,0 +1,43 @@
+from typing import Union
+
+from typing_extensions import Self
+
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.metadata.schema_classes import (
+ GlossaryTermAssociationClass as Term,
+ GlossaryTermsClass,
+)
+from datahub.metadata.urns import GlossaryTermUrn, Urn
+
+
+class HasTermsPatch(MetadataPatchProposal):
+ def add_term(self, term: Term) -> Self:
+ """Adds a glossary term to the entity.
+
+ Args:
+ term: The Term association to add; its `urn` field keys the patch path.
+
+ Returns:
+ The patch builder instance.
+ """
+ # TODO: Make this support raw strings, in addition to Term objects.
+ self._add_patch(
+ GlossaryTermsClass.ASPECT_NAME, "add", path=("terms", term.urn), value=term
+ )
+ return self
+
+ def remove_term(self, term: Union[str, Urn]) -> Self:
+ """Removes a glossary term from the entity.
+
+ Args:
+ term: The term to remove: a Urn, a glossary term urn string, or a bare term id.
+
+ Returns:
+ The patch builder instance.
+ """
+ if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"):
+ term = GlossaryTermUrn(term)
+ self._add_patch(
+ GlossaryTermsClass.ASPECT_NAME, "remove", path=("terms", term), value={}
+ )
+ return self
diff --git a/metadata-ingestion/src/datahub/specific/chart.py b/metadata-ingestion/src/datahub/specific/chart.py
index 104a7c21a07e2..f44a2ffc0d68a 100644
--- a/metadata-ingestion/src/datahub/specific/chart.py
+++ b/metadata-ingestion/src/datahub/specific/chart.py
@@ -1,28 +1,29 @@
-from typing import Dict, List, Optional, Union
+from typing import List, Optional, Tuple, Union
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
from datahub.metadata.schema_classes import (
AccessLevelClass,
ChangeAuditStampsClass,
ChartInfoClass as ChartInfo,
ChartTypeClass,
EdgeClass as Edge,
- GlobalTagsClass as GlobalTags,
- GlossaryTermAssociationClass as Term,
- GlossaryTermsClass as GlossaryTerms,
KafkaAuditHeaderClass,
- OwnerClass as Owner,
- OwnershipTypeClass,
SystemMetadataClass,
- TagAssociationClass as Tag,
)
-from datahub.specific.custom_properties import CustomPropertiesPatchHelper
-from datahub.specific.ownership import OwnershipPatchHelper
-from datahub.utilities.urns.tag_urn import TagUrn
+from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
+from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.tags import HasTagsPatch
+from datahub.specific.aspect_helpers.terms import HasTermsPatch
from datahub.utilities.urns.urn import Urn
-class ChartPatchBuilder(MetadataPatchProposal):
+class ChartPatchBuilder(
+ HasOwnershipPatch,
+ HasCustomPropertiesPatch,
+ HasTagsPatch,
+ HasTermsPatch,
+ MetadataPatchProposal,
+):
def __init__(
self,
urn: str,
@@ -40,55 +41,10 @@ def __init__(
super().__init__(
urn, system_metadata=system_metadata, audit_header=audit_header
)
- self.custom_properties_patch_helper = CustomPropertiesPatchHelper(
- self, ChartInfo.ASPECT_NAME
- )
- self.ownership_patch_helper = OwnershipPatchHelper(self)
-
- def add_owner(self, owner: Owner) -> "ChartPatchBuilder":
- """
- Adds an owner to the ChartPatchBuilder.
-
- Args:
- owner: The Owner object to add.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- self.ownership_patch_helper.add_owner(owner)
- return self
- def remove_owner(
- self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
- ) -> "ChartPatchBuilder":
- """
- Removes an owner from the ChartPatchBuilder.
-
- Args:
- owner: The owner to remove.
- owner_type: The ownership type of the owner (optional).
-
- Returns:
- The ChartPatchBuilder instance.
-
- Notes:
- `owner_type` is optional.
- """
- self.ownership_patch_helper.remove_owner(owner, owner_type)
- return self
-
- def set_owners(self, owners: List[Owner]) -> "ChartPatchBuilder":
- """
- Sets the owners of the ChartPatchBuilder.
-
- Args:
- owners: A list of Owner objects.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- self.ownership_patch_helper.set_owners(owners)
- return self
+ @classmethod
+ def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
+ return ChartInfo.ASPECT_NAME, ("customProperties",)
def add_input_edge(self, input: Union[Edge, Urn, str]) -> "ChartPatchBuilder":
"""
@@ -120,7 +76,7 @@ def add_input_edge(self, input: Union[Edge, Urn, str]) -> "ChartPatchBuilder":
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path=f"/inputEdges/{self.quote(input_urn)}",
+ path=("inputEdges", input_urn),
value=input_urn,
)
return self
@@ -138,7 +94,7 @@ def remove_input_edge(self, input: Union[str, Urn]) -> "ChartPatchBuilder":
self._add_patch(
ChartInfo.ASPECT_NAME,
"remove",
- path=f"/inputEdges/{self.quote(str(input))}",
+ path=("inputEdges", str(input)),
value={},
)
return self
@@ -159,129 +115,17 @@ def set_input_edges(self, inputs: List[Edge]) -> "ChartPatchBuilder":
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/inputEdges",
+ path=("inputEdges",),
value=inputs,
)
return self
- def add_tag(self, tag: Tag) -> "ChartPatchBuilder":
- """
- Adds a tag to the ChartPatchBuilder.
-
- Args:
- tag: The Tag object representing the tag to be added.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- self._add_patch(
- GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag
- )
- return self
-
- def remove_tag(self, tag: Union[str, Urn]) -> "ChartPatchBuilder":
- """
- Removes a tag from the ChartPatchBuilder.
-
- Args:
- tag: The tag to remove, specified as a string or Urn object.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- if isinstance(tag, str) and not tag.startswith("urn:li:tag:"):
- tag = TagUrn.create_from_id(tag)
- self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={})
- return self
-
- def add_term(self, term: Term) -> "ChartPatchBuilder":
- """
- Adds a glossary term to the ChartPatchBuilder.
-
- Args:
- term: The Term object representing the glossary term to be added.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term
- )
- return self
-
- def remove_term(self, term: Union[str, Urn]) -> "ChartPatchBuilder":
- """
- Removes a glossary term from the ChartPatchBuilder.
-
- Args:
- term: The term to remove, specified as a string or Urn object.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"):
- term = "urn:li:glossaryTerm:" + term
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={}
- )
- return self
-
- def set_custom_properties(
- self, custom_properties: Dict[str, str]
- ) -> "ChartPatchBuilder":
- """
- Sets the custom properties for the ChartPatchBuilder.
-
- Args:
- custom_properties: A dictionary containing the custom properties to be set.
-
- Returns:
- The ChartPatchBuilder instance.
-
- Notes:
- This method replaces all existing custom properties with the given dictionary.
- """
- self._add_patch(
- ChartInfo.ASPECT_NAME,
- "add",
- path="/customProperties",
- value=custom_properties,
- )
- return self
-
- def add_custom_property(self, key: str, value: str) -> "ChartPatchBuilder":
- """
- Adds a custom property to the ChartPatchBuilder.
-
- Args:
- key: The key of the custom property.
- value: The value of the custom property.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- self.custom_properties_patch_helper.add_property(key, value)
- return self
-
- def remove_custom_property(self, key: str) -> "ChartPatchBuilder":
- """
- Removes a custom property from the ChartPatchBuilder.
-
- Args:
- key: The key of the custom property to remove.
-
- Returns:
- The ChartPatchBuilder instance.
- """
- self.custom_properties_patch_helper.remove_property(key)
- return self
-
def set_title(self, title: str) -> "ChartPatchBuilder":
assert title, "ChartInfo title should not be None"
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/title",
+ path=("title",),
value=title,
)
@@ -292,7 +136,7 @@ def set_description(self, description: str) -> "ChartPatchBuilder":
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/description",
+ path=("description",),
value=description,
)
@@ -303,7 +147,7 @@ def set_last_refreshed(self, last_refreshed: Optional[int]) -> "ChartPatchBuilde
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/lastRefreshed",
+ path=("lastRefreshed",),
value=last_refreshed,
)
@@ -316,7 +160,7 @@ def set_last_modified(
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/lastModified",
+ path=("lastModified",),
value=last_modified,
)
@@ -327,7 +171,7 @@ def set_external_url(self, external_url: Optional[str]) -> "ChartPatchBuilder":
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/externalUrl",
+ path=("externalUrl",),
value=external_url,
)
return self
@@ -337,7 +181,7 @@ def set_chart_url(self, dashboard_url: Optional[str]) -> "ChartPatchBuilder":
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/chartUrl",
+ path=("chartUrl",),
value=dashboard_url,
)
@@ -350,7 +194,7 @@ def set_type(
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/type",
+ path=("type",),
value=type,
)
@@ -363,7 +207,7 @@ def set_access(
self._add_patch(
ChartInfo.ASPECT_NAME,
"add",
- path="/access",
+ path=("access",),
value=access,
)
@@ -375,7 +219,7 @@ def add_inputs(self, input_urns: Optional[List[str]]) -> "ChartPatchBuilder":
self._add_patch(
aspect_name=ChartInfo.ASPECT_NAME,
op="add",
- path=f"/inputs/{urn}",
+ path=("inputs", urn),
value=urn,
)
diff --git a/metadata-ingestion/src/datahub/specific/custom_properties.py b/metadata-ingestion/src/datahub/specific/custom_properties.py
deleted file mode 100644
index d399a448cc0c2..0000000000000
--- a/metadata-ingestion/src/datahub/specific/custom_properties.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Generic, TypeVar
-
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
-
-_Parent = TypeVar("_Parent", bound=MetadataPatchProposal)
-
-
-class CustomPropertiesPatchHelper(Generic[_Parent]):
- def __init__(
- self,
- parent: _Parent,
- aspect_name: str,
- ) -> None:
- self.aspect_name = aspect_name
- self._parent = parent
- self.aspect_field = "customProperties"
-
- def parent(self) -> _Parent:
- return self._parent
-
- def add_property(self, key: str, value: str) -> "CustomPropertiesPatchHelper":
- self._parent._add_patch(
- self.aspect_name,
- "add",
- path=f"/{self.aspect_field}/{key}",
- value=value,
- )
- return self
-
- def remove_property(self, key: str) -> "CustomPropertiesPatchHelper":
- self._parent._add_patch(
- self.aspect_name,
- "remove",
- path=f"/{self.aspect_field}/{key}",
- value={},
- )
- return self
diff --git a/metadata-ingestion/src/datahub/specific/dashboard.py b/metadata-ingestion/src/datahub/specific/dashboard.py
index da5abbfd1dc12..515fcf0c6da95 100644
--- a/metadata-ingestion/src/datahub/specific/dashboard.py
+++ b/metadata-ingestion/src/datahub/specific/dashboard.py
@@ -1,27 +1,28 @@
-from typing import Dict, List, Optional, Union
+from typing import List, Optional, Tuple, Union
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
from datahub.metadata.schema_classes import (
AccessLevelClass,
ChangeAuditStampsClass,
DashboardInfoClass as DashboardInfo,
EdgeClass as Edge,
- GlobalTagsClass as GlobalTags,
- GlossaryTermAssociationClass as Term,
- GlossaryTermsClass as GlossaryTerms,
KafkaAuditHeaderClass,
- OwnerClass as Owner,
- OwnershipTypeClass,
SystemMetadataClass,
- TagAssociationClass as Tag,
)
-from datahub.specific.custom_properties import CustomPropertiesPatchHelper
-from datahub.specific.ownership import OwnershipPatchHelper
-from datahub.utilities.urns.tag_urn import TagUrn
+from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
+from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.tags import HasTagsPatch
+from datahub.specific.aspect_helpers.terms import HasTermsPatch
from datahub.utilities.urns.urn import Urn
-class DashboardPatchBuilder(MetadataPatchProposal):
+class DashboardPatchBuilder(
+ HasOwnershipPatch,
+ HasCustomPropertiesPatch,
+ HasTagsPatch,
+ HasTermsPatch,
+ MetadataPatchProposal,
+):
def __init__(
self,
urn: str,
@@ -39,55 +40,10 @@ def __init__(
super().__init__(
urn, system_metadata=system_metadata, audit_header=audit_header
)
- self.custom_properties_patch_helper = CustomPropertiesPatchHelper(
- self, DashboardInfo.ASPECT_NAME
- )
- self.ownership_patch_helper = OwnershipPatchHelper(self)
-
- def add_owner(self, owner: Owner) -> "DashboardPatchBuilder":
- """
- Adds an owner to the DashboardPatchBuilder.
-
- Args:
- owner: The Owner object to add.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- self.ownership_patch_helper.add_owner(owner)
- return self
-
- def remove_owner(
- self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
- ) -> "DashboardPatchBuilder":
- """
- Removes an owner from the DashboardPatchBuilder.
-
- Args:
- owner: The owner to remove.
- owner_type: The ownership type of the owner (optional).
-
- Returns:
- The DashboardPatchBuilder instance.
-
- Notes:
- `owner_type` is optional.
- """
- self.ownership_patch_helper.remove_owner(owner, owner_type)
- return self
-
- def set_owners(self, owners: List[Owner]) -> "DashboardPatchBuilder":
- """
- Sets the owners of the DashboardPatchBuilder.
- Args:
- owners: A list of Owner objects.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- self.ownership_patch_helper.set_owners(owners)
- return self
+ @classmethod
+ def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
+ return DashboardInfo.ASPECT_NAME, ("customProperties",)
def add_dataset_edge(
self, dataset: Union[Edge, Urn, str]
@@ -126,7 +82,7 @@ def add_dataset_edge(
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path=f"/datasetEdges/{self.quote(dataset_urn)}",
+ path=("datasetEdges", dataset_urn),
value=dataset_edge,
)
return self
@@ -144,7 +100,7 @@ def remove_dataset_edge(self, dataset: Union[str, Urn]) -> "DashboardPatchBuilde
self._add_patch(
DashboardInfo.ASPECT_NAME,
"remove",
- path=f"/datasetEdges/{dataset}",
+ path=("datasetEdges", dataset),
value={},
)
return self
@@ -169,7 +125,7 @@ def set_dataset_edges(self, datasets: List[Edge]) -> "DashboardPatchBuilder":
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/datasetEdges",
+ path=("datasetEdges",),
value=datasets,
)
return self
@@ -209,7 +165,7 @@ def add_chart_edge(self, chart: Union[Edge, Urn, str]) -> "DashboardPatchBuilder
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path=f"/chartEdges/{self.quote(chart_urn)}",
+ path=("chartEdges", chart_urn),
value=chart_edge,
)
return self
@@ -227,7 +183,7 @@ def remove_chart_edge(self, chart: Union[str, Urn]) -> "DashboardPatchBuilder":
self._add_patch(
DashboardInfo.ASPECT_NAME,
"remove",
- path=f"/chartEdges/{chart}",
+ path=("chartEdges", chart),
value={},
)
return self
@@ -252,129 +208,17 @@ def set_chart_edges(self, charts: List[Edge]) -> "DashboardPatchBuilder":
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/chartEdges",
+ path=("chartEdges",),
value=charts,
)
return self
- def add_tag(self, tag: Tag) -> "DashboardPatchBuilder":
- """
- Adds a tag to the DashboardPatchBuilder.
-
- Args:
- tag: The Tag object representing the tag to be added.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- self._add_patch(
- GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag
- )
- return self
-
- def remove_tag(self, tag: Union[str, Urn]) -> "DashboardPatchBuilder":
- """
- Removes a tag from the DashboardPatchBuilder.
-
- Args:
- tag: The tag to remove, specified as a string or Urn object.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- if isinstance(tag, str) and not tag.startswith("urn:li:tag:"):
- tag = TagUrn.create_from_id(tag)
- self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={})
- return self
-
- def add_term(self, term: Term) -> "DashboardPatchBuilder":
- """
- Adds a glossary term to the DashboardPatchBuilder.
-
- Args:
- term: The Term object representing the glossary term to be added.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term
- )
- return self
-
- def remove_term(self, term: Union[str, Urn]) -> "DashboardPatchBuilder":
- """
- Removes a glossary term from the DashboardPatchBuilder.
-
- Args:
- term: The term to remove, specified as a string or Urn object.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"):
- term = "urn:li:glossaryTerm:" + term
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={}
- )
- return self
-
- def set_custom_properties(
- self, custom_properties: Dict[str, str]
- ) -> "DashboardPatchBuilder":
- """
- Sets the custom properties for the DashboardPatchBuilder.
-
- Args:
- custom_properties: A dictionary containing the custom properties to be set.
-
- Returns:
- The DashboardPatchBuilder instance.
-
- Notes:
- This method replaces all existing custom properties with the given dictionary.
- """
- self._add_patch(
- DashboardInfo.ASPECT_NAME,
- "add",
- path="/customProperties",
- value=custom_properties,
- )
- return self
-
- def add_custom_property(self, key: str, value: str) -> "DashboardPatchBuilder":
- """
- Adds a custom property to the DashboardPatchBuilder.
-
- Args:
- key: The key of the custom property.
- value: The value of the custom property.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- self.custom_properties_patch_helper.add_property(key, value)
- return self
-
- def remove_custom_property(self, key: str) -> "DashboardPatchBuilder":
- """
- Removes a custom property from the DashboardPatchBuilder.
-
- Args:
- key: The key of the custom property to remove.
-
- Returns:
- The DashboardPatchBuilder instance.
- """
- self.custom_properties_patch_helper.remove_property(key)
- return self
-
def set_title(self, title: str) -> "DashboardPatchBuilder":
assert title, "DashboardInfo title should not be None"
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/title",
+ path=("title",),
value=title,
)
@@ -385,27 +229,18 @@ def set_description(self, description: str) -> "DashboardPatchBuilder":
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/description",
+ path=("description",),
value=description,
)
return self
- def add_custom_properties(
- self, custom_properties: Optional[Dict[str, str]] = None
- ) -> "DashboardPatchBuilder":
- if custom_properties:
- for key, value in custom_properties.items():
- self.custom_properties_patch_helper.add_property(key, value)
-
- return self
-
def set_external_url(self, external_url: Optional[str]) -> "DashboardPatchBuilder":
if external_url:
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/externalUrl",
+ path=("externalUrl",),
value=external_url,
)
return self
@@ -416,7 +251,7 @@ def add_charts(self, chart_urns: Optional[List[str]]) -> "DashboardPatchBuilder"
self._add_patch(
aspect_name=DashboardInfo.ASPECT_NAME,
op="add",
- path=f"/charts/{urn}",
+ path=("charts", urn),
value=urn,
)
@@ -430,7 +265,7 @@ def add_datasets(
self._add_patch(
aspect_name=DashboardInfo.ASPECT_NAME,
op="add",
- path=f"/datasets/{urn}",
+ path=("datasets", urn),
value=urn,
)
@@ -443,7 +278,7 @@ def set_dashboard_url(
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/dashboardUrl",
+ path=("dashboardUrl",),
value=dashboard_url,
)
@@ -456,7 +291,7 @@ def set_access(
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/access",
+ path=("access",),
value=access,
)
@@ -469,7 +304,7 @@ def set_last_refreshed(
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/lastRefreshed",
+ path=("lastRefreshed",),
value=last_refreshed,
)
@@ -482,7 +317,7 @@ def set_last_modified(
self._add_patch(
DashboardInfo.ASPECT_NAME,
"add",
- path="/lastModified",
+ path=("lastModified",),
value=last_modified,
)
diff --git a/metadata-ingestion/src/datahub/specific/datajob.py b/metadata-ingestion/src/datahub/specific/datajob.py
index 6ff4741b09c26..fd826c6dd59ca 100644
--- a/metadata-ingestion/src/datahub/specific/datajob.py
+++ b/metadata-ingestion/src/datahub/specific/datajob.py
@@ -1,25 +1,27 @@
-from typing import Dict, List, Optional, Union
+from typing import List, Optional, Tuple, Union
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
from datahub.metadata.schema_classes import (
DataJobInfoClass as DataJobInfo,
DataJobInputOutputClass as DataJobInputOutput,
EdgeClass as Edge,
- GlobalTagsClass as GlobalTags,
- GlossaryTermAssociationClass as Term,
- GlossaryTermsClass as GlossaryTerms,
KafkaAuditHeaderClass,
- OwnerClass as Owner,
- OwnershipTypeClass,
SystemMetadataClass,
- TagAssociationClass as Tag,
)
-from datahub.metadata.urns import SchemaFieldUrn, TagUrn, Urn
-from datahub.specific.custom_properties import CustomPropertiesPatchHelper
-from datahub.specific.ownership import OwnershipPatchHelper
-
-
-class DataJobPatchBuilder(MetadataPatchProposal):
+from datahub.metadata.urns import SchemaFieldUrn, Urn
+from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
+from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.tags import HasTagsPatch
+from datahub.specific.aspect_helpers.terms import HasTermsPatch
+
+
+class DataJobPatchBuilder(
+ HasOwnershipPatch,
+ HasCustomPropertiesPatch,
+ HasTagsPatch,
+ HasTermsPatch,
+ MetadataPatchProposal,
+):
def __init__(
self,
urn: str,
@@ -37,55 +39,10 @@ def __init__(
super().__init__(
urn, system_metadata=system_metadata, audit_header=audit_header
)
- self.custom_properties_patch_helper = CustomPropertiesPatchHelper(
- self, DataJobInfo.ASPECT_NAME
- )
- self.ownership_patch_helper = OwnershipPatchHelper(self)
-
- def add_owner(self, owner: Owner) -> "DataJobPatchBuilder":
- """
- Adds an owner to the DataJobPatchBuilder.
-
- Args:
- owner: The Owner object to add.
-
- Returns:
- The DataJobPatchBuilder instance.
- """
- self.ownership_patch_helper.add_owner(owner)
- return self
-
- def remove_owner(
- self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
- ) -> "DataJobPatchBuilder":
- """
- Removes an owner from the DataJobPatchBuilder.
-
- Args:
- owner: The owner to remove.
- owner_type: The ownership type of the owner (optional).
-
- Returns:
- The DataJobPatchBuilder instance.
-
- Notes:
- `owner_type` is optional.
- """
- self.ownership_patch_helper.remove_owner(owner, owner_type)
- return self
-
- def set_owners(self, owners: List[Owner]) -> "DataJobPatchBuilder":
- """
- Sets the owners of the DataJobPatchBuilder.
-
- Args:
- owners: A list of Owner objects.
- Returns:
- The DataJobPatchBuilder instance.
- """
- self.ownership_patch_helper.set_owners(owners)
- return self
+ @classmethod
+ def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
+ return DataJobInfo.ASPECT_NAME, ("customProperties",)
def add_input_datajob(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilder":
"""
@@ -120,7 +77,7 @@ def add_input_datajob(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilde
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path=f"/inputDatajobEdges/{self.quote(input_urn)}",
+ path=("inputDatajobEdges", input_urn),
value=input_edge,
)
return self
@@ -138,7 +95,7 @@ def remove_input_datajob(self, input: Union[str, Urn]) -> "DataJobPatchBuilder":
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"remove",
- path=f"/inputDatajobEdges/{input}",
+ path=("inputDatajobEdges", input),
value={},
)
return self
@@ -163,7 +120,7 @@ def set_input_datajobs(self, inputs: List[Edge]) -> "DataJobPatchBuilder":
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path="/inputDatajobEdges",
+ path=("inputDatajobEdges",),
value=inputs,
)
return self
@@ -201,7 +158,7 @@ def add_input_dataset(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilde
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path=f"/inputDatasetEdges/{self.quote(input_urn)}",
+ path=("inputDatasetEdges", input_urn),
value=input_edge,
)
return self
@@ -219,7 +176,7 @@ def remove_input_dataset(self, input: Union[str, Urn]) -> "DataJobPatchBuilder":
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"remove",
- path=f"/inputDatasetEdges/{self.quote(str(input))}",
+ path=("inputDatasetEdges", input),
value={},
)
return self
@@ -244,7 +201,7 @@ def set_input_datasets(self, inputs: List[Edge]) -> "DataJobPatchBuilder":
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path="/inputDatasetEdges",
+ path=("inputDatasetEdges",),
value=inputs,
)
return self
@@ -284,7 +241,7 @@ def add_output_dataset(
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path=f"/outputDatasetEdges/{self.quote(output_urn)}",
+ path=("outputDatasetEdges", output_urn),
value=output_edge,
)
return self
@@ -302,7 +259,7 @@ def remove_output_dataset(self, output: Union[str, Urn]) -> "DataJobPatchBuilder
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"remove",
- path=f"/outputDatasetEdges/{self.quote(str(output))}",
+ path=("outputDatasetEdges", output),
value={},
)
return self
@@ -327,7 +284,7 @@ def set_output_datasets(self, outputs: List[Edge]) -> "DataJobPatchBuilder":
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path="/outputDatasetEdges",
+ path=("outputDatasetEdges",),
value=outputs,
)
return self
@@ -351,7 +308,7 @@ def add_input_dataset_field(self, input: Union[Urn, str]) -> "DataJobPatchBuilde
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path=f"/inputDatasetFields/{self.quote(input_urn)}",
+ path=("inputDatasetFields", input_urn),
value={},
)
return self
@@ -372,7 +329,7 @@ def remove_input_dataset_field(
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"remove",
- path=f"/inputDatasetFields/{self.quote(input_urn)}",
+ path=("inputDatasetFields", input_urn),
value={},
)
return self
@@ -397,7 +354,7 @@ def set_input_dataset_fields(self, inputs: List[Edge]) -> "DataJobPatchBuilder":
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path="/inputDatasetFields",
+ path=("inputDatasetFields",),
value=inputs,
)
return self
@@ -423,7 +380,7 @@ def add_output_dataset_field(
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path=f"/outputDatasetFields/{self.quote(output_urn)}",
+ path=("outputDatasetFields", output_urn),
value={},
)
return self
@@ -444,7 +401,7 @@ def remove_output_dataset_field(
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"remove",
- path=f"/outputDatasetFields/{self.quote(output_urn)}",
+ path=("outputDatasetFields", output_urn),
value={},
)
return self
@@ -469,119 +426,7 @@ def set_output_dataset_fields(self, outputs: List[Edge]) -> "DataJobPatchBuilder
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
- path="/outputDatasetFields",
+ path=("outputDatasetFields",),
value=outputs,
)
return self
-
- def add_tag(self, tag: Tag) -> "DataJobPatchBuilder":
- """
- Adds a tag to the DataJobPatchBuilder.
-
- Args:
- tag: The Tag object representing the tag to be added.
-
- Returns:
- The DataJobPatchBuilder instance.
- """
- self._add_patch(
- GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag
- )
- return self
-
- def remove_tag(self, tag: Union[str, Urn]) -> "DataJobPatchBuilder":
- """
- Removes a tag from the DataJobPatchBuilder.
-
- Args:
- tag: The tag to remove, specified as a string or Urn object.
-
- Returns:
- The DataJobPatchBuilder instance.
- """
- if isinstance(tag, str) and not tag.startswith("urn:li:tag:"):
- tag = TagUrn.create_from_id(tag)
- self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={})
- return self
-
- def add_term(self, term: Term) -> "DataJobPatchBuilder":
- """
- Adds a glossary term to the DataJobPatchBuilder.
-
- Args:
- term: The Term object representing the glossary term to be added.
-
- Returns:
- The DataJobPatchBuilder instance.
- """
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term
- )
- return self
-
- def remove_term(self, term: Union[str, Urn]) -> "DataJobPatchBuilder":
- """
- Removes a glossary term from the DataJobPatchBuilder.
-
- Args:
- term: The term to remove, specified as a string or Urn object.
-
- Returns:
- The DataJobPatchBuilder instance.
- """
- if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"):
- term = "urn:li:glossaryTerm:" + term
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={}
- )
- return self
-
- def set_custom_properties(
- self, custom_properties: Dict[str, str]
- ) -> "DataJobPatchBuilder":
- """
- Sets the custom properties for the DataJobPatchBuilder.
-
- Args:
- custom_properties: A dictionary containing the custom properties to be set.
-
- Returns:
- The DataJobPatchBuilder instance.
-
- Notes:
- This method replaces all existing custom properties with the given dictionary.
- """
- self._add_patch(
- DataJobInfo.ASPECT_NAME,
- "add",
- path="/customProperties",
- value=custom_properties,
- )
- return self
-
- def add_custom_property(self, key: str, value: str) -> "DataJobPatchBuilder":
- """
- Adds a custom property to the DataJobPatchBuilder.
-
- Args:
- key: The key of the custom property.
- value: The value of the custom property.
-
- Returns:
- The DataJobPatchBuilder instance.
- """
- self.custom_properties_patch_helper.add_property(key, value)
- return self
-
- def remove_custom_property(self, key: str) -> "DataJobPatchBuilder":
- """
- Removes a custom property from the DataJobPatchBuilder.
-
- Args:
- key: The key of the custom property to remove.
-
- Returns:
- The DataJobPatchBuilder instance.
- """
- self.custom_properties_patch_helper.remove_property(key)
- return self
diff --git a/metadata-ingestion/src/datahub/specific/dataproduct.py b/metadata-ingestion/src/datahub/specific/dataproduct.py
index f9830a4b23df0..d38d2d4156315 100644
--- a/metadata-ingestion/src/datahub/specific/dataproduct.py
+++ b/metadata-ingestion/src/datahub/specific/dataproduct.py
@@ -1,25 +1,25 @@
-from typing import Dict, List, Optional, Union
+from typing import List, Optional, Tuple
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
from datahub.metadata.schema_classes import (
DataProductAssociationClass as DataProductAssociation,
DataProductPropertiesClass as DataProductProperties,
- GlobalTagsClass as GlobalTags,
- GlossaryTermAssociationClass as Term,
- GlossaryTermsClass as GlossaryTerms,
KafkaAuditHeaderClass,
- OwnerClass as Owner,
- OwnershipTypeClass,
SystemMetadataClass,
- TagAssociationClass as Tag,
)
-from datahub.specific.custom_properties import CustomPropertiesPatchHelper
-from datahub.specific.ownership import OwnershipPatchHelper
-from datahub.utilities.urns.tag_urn import TagUrn
-from datahub.utilities.urns.urn import Urn
-
-
-class DataProductPatchBuilder(MetadataPatchProposal):
+from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
+from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.tags import HasTagsPatch
+from datahub.specific.aspect_helpers.terms import HasTermsPatch
+
+
+class DataProductPatchBuilder(
+ HasOwnershipPatch,
+ HasCustomPropertiesPatch,
+ HasTagsPatch,
+ HasTermsPatch,
+ MetadataPatchProposal,
+):
def __init__(
self,
urn: str,
@@ -31,59 +31,16 @@ def __init__(
system_metadata=system_metadata,
audit_header=audit_header,
)
- self.custom_properties_patch_helper = CustomPropertiesPatchHelper(
- self, DataProductProperties.ASPECT_NAME
- )
- self.ownership_patch_helper = OwnershipPatchHelper(self)
-
- def add_owner(self, owner: Owner) -> "DataProductPatchBuilder":
- self.ownership_patch_helper.add_owner(owner)
- return self
-
- def remove_owner(
- self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
- ) -> "DataProductPatchBuilder":
- """
- param: owner_type is optional
- """
- self.ownership_patch_helper.remove_owner(owner, owner_type)
- return self
-
- def set_owners(self, owners: List[Owner]) -> "DataProductPatchBuilder":
- self.ownership_patch_helper.set_owners(owners)
- return self
-
- def add_tag(self, tag: Tag) -> "DataProductPatchBuilder":
- self._add_patch(
- GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag
- )
- return self
-
- def remove_tag(self, tag: Union[str, Urn]) -> "DataProductPatchBuilder":
- if isinstance(tag, str) and not tag.startswith("urn:li:tag:"):
- tag = TagUrn.create_from_id(tag)
- self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={})
- return self
-
- def add_term(self, term: Term) -> "DataProductPatchBuilder":
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term
- )
- return self
- def remove_term(self, term: Union[str, Urn]) -> "DataProductPatchBuilder":
- if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"):
- term = "urn:li:glossaryTerm:" + term
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={}
- )
- return self
+ @classmethod
+ def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
+ return DataProductProperties.ASPECT_NAME, ("customProperties",)
def set_name(self, name: str) -> "DataProductPatchBuilder":
self._add_patch(
DataProductProperties.ASPECT_NAME,
"add",
- path="/name",
+ path=("name",),
value=name,
)
return self
@@ -92,37 +49,18 @@ def set_description(self, description: str) -> "DataProductPatchBuilder":
self._add_patch(
DataProductProperties.ASPECT_NAME,
"add",
- path="/description",
+ path=("description",),
value=description,
)
return self
- def set_custom_properties(
- self, custom_properties: Dict[str, str]
- ) -> "DataProductPatchBuilder":
- self._add_patch(
- DataProductProperties.ASPECT_NAME,
- "add",
- path="/customProperties",
- value=custom_properties,
- )
- return self
-
- def add_custom_property(self, key: str, value: str) -> "DataProductPatchBuilder":
- self.custom_properties_patch_helper.add_property(key, value)
- return self
-
- def remove_custom_property(self, key: str) -> "DataProductPatchBuilder":
- self.custom_properties_patch_helper.remove_property(key)
- return self
-
def set_assets(
self, assets: List[DataProductAssociation]
) -> "DataProductPatchBuilder":
self._add_patch(
DataProductProperties.ASPECT_NAME,
"add",
- path="/assets",
+ path=("assets",),
value=assets,
)
return self
@@ -131,7 +69,7 @@ def add_asset(self, asset_urn: str) -> "DataProductPatchBuilder":
self._add_patch(
DataProductProperties.ASPECT_NAME,
"add",
- path=f"/assets/{self.quote(asset_urn)}",
+ path=("assets", asset_urn),
value=DataProductAssociation(destinationUrn=asset_urn),
)
return self
@@ -140,7 +78,7 @@ def remove_asset(self, asset_urn: str) -> "DataProductPatchBuilder":
self._add_patch(
DataProductProperties.ASPECT_NAME,
"remove",
- path=f"/assets/{self.quote(asset_urn)}",
+ path=("assets", asset_urn),
value={},
)
return self
@@ -149,7 +87,7 @@ def set_external_url(self, external_url: str) -> "DataProductPatchBuilder":
self._add_patch(
DataProductProperties.ASPECT_NAME,
"add",
- path="/externalUrl",
+ path=("externalUrl",),
value=external_url,
)
return self
diff --git a/metadata-ingestion/src/datahub/specific/dataset.py b/metadata-ingestion/src/datahub/specific/dataset.py
index b171dc4cc2939..6332386684bbf 100644
--- a/metadata-ingestion/src/datahub/specific/dataset.py
+++ b/metadata-ingestion/src/datahub/specific/dataset.py
@@ -1,27 +1,27 @@
-from typing import Dict, Generic, List, Optional, Tuple, TypeVar, Union
+from typing import Generic, List, Optional, Tuple, TypeVar, Union
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
from datahub.metadata.schema_classes import (
DatasetPropertiesClass as DatasetProperties,
EditableDatasetPropertiesClass as EditableDatasetProperties,
EditableSchemaMetadataClass as EditableSchemaMetadata,
FineGrainedLineageClass as FineGrainedLineage,
- GlobalTagsClass as GlobalTags,
GlossaryTermAssociationClass as Term,
- GlossaryTermsClass as GlossaryTerms,
KafkaAuditHeaderClass,
- OwnerClass as Owner,
- OwnershipTypeClass,
SchemaMetadataClass,
SystemMetadataClass,
TagAssociationClass as Tag,
UpstreamClass as Upstream,
UpstreamLineageClass as UpstreamLineage,
)
-from datahub.specific.custom_properties import CustomPropertiesPatchHelper
-from datahub.specific.ownership import OwnershipPatchHelper
-from datahub.specific.structured_properties import StructuredPropertiesPatchHelper
+from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
+from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.structured_properties import (
+ HasStructuredPropertiesPatch,
+)
+from datahub.specific.aspect_helpers.tags import HasTagsPatch
+from datahub.specific.aspect_helpers.terms import HasTermsPatch
from datahub.utilities.urns.tag_urn import TagUrn
from datahub.utilities.urns.urn import Urn
@@ -48,7 +48,7 @@ def add_tag(self, tag: Tag) -> "FieldPatchHelper":
self._parent._add_patch(
self.aspect_name,
"add",
- path=f"/{self.aspect_field}/{self.field_path}/globalTags/tags/{tag.tag}",
+ path=(self.aspect_field, self.field_path, "globalTags", "tags", tag.tag),
value=tag,
)
return self
@@ -59,7 +59,7 @@ def remove_tag(self, tag: Union[str, Urn]) -> "FieldPatchHelper":
self._parent._add_patch(
self.aspect_name,
"remove",
- path=f"/{self.aspect_field}/{self.field_path}/globalTags/tags/{tag}",
+ path=(self.aspect_field, self.field_path, "globalTags", "tags", tag),
value={},
)
return self
@@ -68,7 +68,13 @@ def add_term(self, term: Term) -> "FieldPatchHelper":
self._parent._add_patch(
self.aspect_name,
"add",
- path=f"/{self.aspect_field}/{self.field_path}/glossaryTerms/terms/{term.urn}",
+ path=(
+ self.aspect_field,
+ self.field_path,
+ "glossaryTerms",
+ "terms",
+ term.urn,
+ ),
value=term,
)
return self
@@ -79,7 +85,7 @@ def remove_term(self, term: Union[str, Urn]) -> "FieldPatchHelper":
self._parent._add_patch(
self.aspect_name,
"remove",
- path=f"/{self.aspect_field}/{self.field_path}/glossaryTerms/terms/{term}",
+ path=(self.aspect_field, self.field_path, "glossaryTerms", "terms", term),
value={},
)
return self
@@ -88,7 +94,14 @@ def parent(self) -> _Parent:
return self._parent
-class DatasetPatchBuilder(MetadataPatchProposal):
+class DatasetPatchBuilder(
+ HasOwnershipPatch,
+ HasCustomPropertiesPatch,
+ HasStructuredPropertiesPatch,
+ HasTagsPatch,
+ HasTermsPatch,
+ MetadataPatchProposal,
+):
def __init__(
self,
urn: str,
@@ -98,34 +111,16 @@ def __init__(
super().__init__(
urn, system_metadata=system_metadata, audit_header=audit_header
)
- self.custom_properties_patch_helper = CustomPropertiesPatchHelper(
- self, DatasetProperties.ASPECT_NAME
- )
- self.ownership_patch_helper = OwnershipPatchHelper(self)
- self.structured_properties_patch_helper = StructuredPropertiesPatchHelper(self)
-
- def add_owner(self, owner: Owner) -> "DatasetPatchBuilder":
- self.ownership_patch_helper.add_owner(owner)
- return self
- def remove_owner(
- self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
- ) -> "DatasetPatchBuilder":
- """
- param: owner_type is optional
- """
- self.ownership_patch_helper.remove_owner(owner, owner_type)
- return self
-
- def set_owners(self, owners: List[Owner]) -> "DatasetPatchBuilder":
- self.ownership_patch_helper.set_owners(owners)
- return self
+ @classmethod
+ def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
+ return DatasetProperties.ASPECT_NAME, ("customProperties",)
def add_upstream_lineage(self, upstream: Upstream) -> "DatasetPatchBuilder":
self._add_patch(
UpstreamLineage.ASPECT_NAME,
"add",
- path=f"/upstreams/{self.quote(upstream.dataset)}",
+ path=("upstreams", upstream.dataset),
value=upstream,
)
return self
@@ -136,14 +131,14 @@ def remove_upstream_lineage(
self._add_patch(
UpstreamLineage.ASPECT_NAME,
"remove",
- path=f"/upstreams/{dataset}",
+ path=("upstreams", dataset),
value={},
)
return self
def set_upstream_lineages(self, upstreams: List[Upstream]) -> "DatasetPatchBuilder":
self._add_patch(
- UpstreamLineage.ASPECT_NAME, "add", path="/upstreams", value=upstreams
+ UpstreamLineage.ASPECT_NAME, "add", path=("upstreams",), value=upstreams
)
return self
@@ -159,7 +154,7 @@ def add_fine_grained_upstream_lineage(
self._add_patch(
UpstreamLineage.ASPECT_NAME,
"add",
- path=DatasetPatchBuilder.quote_fine_grained_path(
+ path=self._build_fine_grained_path(
transform_op, downstream_urn, query_id, upstream_urn
),
value={"confidenceScore": fine_grained_lineage.confidenceScore},
@@ -179,12 +174,15 @@ def get_fine_grained_key(
return transform_op, downstream_urn, query_id
@classmethod
- def quote_fine_grained_path(
+ def _build_fine_grained_path(
cls, transform_op: str, downstream_urn: str, query_id: str, upstream_urn: str
- ) -> str:
+ ) -> PatchPath:
return (
- f"/fineGrainedLineages/{cls.quote(transform_op)}/"
- f"{cls.quote(downstream_urn)}/{cls.quote(query_id)}/{cls.quote(upstream_urn)}"
+ "fineGrainedLineages",
+ transform_op,
+ downstream_urn,
+ query_id,
+ upstream_urn,
)
def remove_fine_grained_upstream_lineage(
@@ -199,7 +197,7 @@ def remove_fine_grained_upstream_lineage(
self._add_patch(
UpstreamLineage.ASPECT_NAME,
"remove",
- path=DatasetPatchBuilder.quote_fine_grained_path(
+ path=self._build_fine_grained_path(
transform_op, downstream_urn, query_id, upstream_urn
),
value={},
@@ -212,37 +210,11 @@ def set_fine_grained_upstream_lineages(
self._add_patch(
UpstreamLineage.ASPECT_NAME,
"add",
- path="/fineGrainedLineages",
+ path=("fineGrainedLineages",),
value=fine_grained_lineages,
)
return self
- def add_tag(self, tag: Tag) -> "DatasetPatchBuilder":
- self._add_patch(
- GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag
- )
- return self
-
- def remove_tag(self, tag: Union[str, Urn]) -> "DatasetPatchBuilder":
- if isinstance(tag, str) and not tag.startswith("urn:li:tag:"):
- tag = TagUrn.create_from_id(tag)
- self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={})
- return self
-
- def add_term(self, term: Term) -> "DatasetPatchBuilder":
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term
- )
- return self
-
- def remove_term(self, term: Union[str, Urn]) -> "DatasetPatchBuilder":
- if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"):
- term = "urn:li:glossaryTerm:" + term
- self._add_patch(
- GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={}
- )
- return self
-
def for_field(
self, field_path: str, editable: bool = True
) -> FieldPatchHelper["DatasetPatchBuilder"]:
@@ -269,38 +241,11 @@ def set_description(
else EditableDatasetProperties.ASPECT_NAME
),
"add",
- path="/description",
+ path=("description",),
value=description,
)
return self
- def set_custom_properties(
- self, custom_properties: Dict[str, str]
- ) -> "DatasetPatchBuilder":
- self._add_patch(
- DatasetProperties.ASPECT_NAME,
- "add",
- path="/customProperties",
- value=custom_properties,
- )
- return self
-
- def add_custom_property(self, key: str, value: str) -> "DatasetPatchBuilder":
- self.custom_properties_patch_helper.add_property(key, value)
- return self
-
- def add_custom_properties(
- self, custom_properties: Optional[Dict[str, str]] = None
- ) -> "DatasetPatchBuilder":
- if custom_properties is not None:
- for key, value in custom_properties.items():
- self.custom_properties_patch_helper.add_property(key, value)
- return self
-
- def remove_custom_property(self, key: str) -> "DatasetPatchBuilder":
- self.custom_properties_patch_helper.remove_property(key)
- return self
-
def set_display_name(
self, display_name: Optional[str] = None
) -> "DatasetPatchBuilder":
@@ -308,7 +253,7 @@ def set_display_name(
self._add_patch(
DatasetProperties.ASPECT_NAME,
"add",
- path="/name",
+ path=("name",),
value=display_name,
)
return self
@@ -320,7 +265,7 @@ def set_qualified_name(
self._add_patch(
DatasetProperties.ASPECT_NAME,
"add",
- path="/qualifiedName",
+ path=("qualifiedName",),
value=qualified_name,
)
return self
@@ -332,7 +277,7 @@ def set_created(
self._add_patch(
DatasetProperties.ASPECT_NAME,
"add",
- path="/created",
+ path=("created",),
value=timestamp,
)
return self
@@ -344,37 +289,7 @@ def set_last_modified(
self._add_patch(
DatasetProperties.ASPECT_NAME,
"add",
- path="/lastModified",
+ path=("lastModified",),
value=timestamp,
)
return self
-
- def set_structured_property(
- self, property_name: str, value: Union[str, float, List[Union[str, float]]]
- ) -> "DatasetPatchBuilder":
- """
- This is a helper method to set a structured property.
- @param property_name: the name of the property (either bare or urn form)
- @param value: the value of the property (for multi-valued properties, this can be a list)
- """
- self.structured_properties_patch_helper.set_property(property_name, value)
- return self
-
- def add_structured_property(
- self, property_name: str, value: Union[str, float]
- ) -> "DatasetPatchBuilder":
- """
- This is a helper method to add a structured property.
- @param property_name: the name of the property (either bare or urn form)
- @param value: the value of the property (for multi-valued properties, this value will be appended to the list)
- """
- self.structured_properties_patch_helper.add_property(property_name, value)
- return self
-
- def remove_structured_property(self, property_name: str) -> "DatasetPatchBuilder":
- """
- This is a helper method to remove a structured property.
- @param property_name: the name of the property (either bare or urn form)
- """
- self.structured_properties_patch_helper.remove_property(property_name)
- return self
diff --git a/metadata-ingestion/src/datahub/specific/form.py b/metadata-ingestion/src/datahub/specific/form.py
index 78182c202f716..281b3cac99b2c 100644
--- a/metadata-ingestion/src/datahub/specific/form.py
+++ b/metadata-ingestion/src/datahub/specific/form.py
@@ -5,15 +5,13 @@
FormInfoClass as FormInfo,
FormPromptClass,
KafkaAuditHeaderClass,
- OwnerClass as Owner,
- OwnershipTypeClass,
SystemMetadataClass,
)
-from datahub.specific.ownership import OwnershipPatchHelper
+from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
from datahub.utilities.urns.urn import Urn
-class FormPatchBuilder(MetadataPatchProposal):
+class FormPatchBuilder(HasOwnershipPatch, MetadataPatchProposal):
def __init__(
self,
urn: str,
@@ -23,31 +21,13 @@ def __init__(
super().__init__(
urn, system_metadata=system_metadata, audit_header=audit_header
)
- self.ownership_patch_helper = OwnershipPatchHelper(self)
-
- def add_owner(self, owner: Owner) -> "FormPatchBuilder":
- self.ownership_patch_helper.add_owner(owner)
- return self
-
- def remove_owner(
- self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
- ) -> "FormPatchBuilder":
- """
- param: owner_type is optional
- """
- self.ownership_patch_helper.remove_owner(owner, owner_type)
- return self
-
- def set_owners(self, owners: List[Owner]) -> "FormPatchBuilder":
- self.ownership_patch_helper.set_owners(owners)
- return self
def set_name(self, name: Optional[str] = None) -> "FormPatchBuilder":
if name is not None:
self._add_patch(
FormInfo.ASPECT_NAME,
"add",
- path="/name",
+ path=("name",),
value=name,
)
return self
@@ -57,7 +37,7 @@ def set_description(self, description: Optional[str] = None) -> "FormPatchBuilde
self._add_patch(
FormInfo.ASPECT_NAME,
"add",
- path="/description",
+ path=("description",),
value=description,
)
return self
@@ -67,7 +47,7 @@ def set_type(self, type: Optional[str] = None) -> "FormPatchBuilder":
self._add_patch(
FormInfo.ASPECT_NAME,
"add",
- path="/type",
+ path=("type",),
value=type,
)
return self
@@ -76,7 +56,7 @@ def add_prompt(self, prompt: FormPromptClass) -> "FormPatchBuilder":
self._add_patch(
FormInfo.ASPECT_NAME,
"add",
- path=f"/prompts/{self.quote(prompt.id)}",
+ path=("prompts", prompt.id),
value=prompt,
)
return self
@@ -90,7 +70,7 @@ def remove_prompt(self, prompt_id: str) -> "FormPatchBuilder":
self._add_patch(
FormInfo.ASPECT_NAME,
"remove",
- path=f"/prompts/{self.quote(prompt_id)}",
+ path=("prompts", prompt_id),
value=prompt_id,
)
return self
@@ -104,7 +84,7 @@ def set_ownership_form(self, is_ownership: bool) -> "FormPatchBuilder":
self._add_patch(
FormInfo.ASPECT_NAME,
"add",
- path="/actors/owners",
+ path=("actors", "owners"),
value=is_ownership,
)
return self
@@ -113,7 +93,7 @@ def add_assigned_user(self, user_urn: Union[str, Urn]) -> "FormPatchBuilder":
self._add_patch(
FormInfo.ASPECT_NAME,
"add",
- path=f"/actors/users/{self.quote(str(user_urn))}",
+ path=("actors", "users", user_urn),
value=user_urn,
)
return self
@@ -122,7 +102,7 @@ def remove_assigned_user(self, user_urn: Union[str, Urn]) -> "FormPatchBuilder":
self._add_patch(
FormInfo.ASPECT_NAME,
"remove",
- path=f"/actors/users/{self.quote(str(user_urn))}",
+ path=("actors", "users", user_urn),
value=user_urn,
)
return self
@@ -131,7 +111,7 @@ def add_assigned_group(self, group_urn: Union[str, Urn]) -> "FormPatchBuilder":
self._add_patch(
FormInfo.ASPECT_NAME,
"add",
- path=f"/actors/groups/{self.quote(str(group_urn))}",
+ path=("actors", "groups", group_urn),
value=group_urn,
)
return self
@@ -140,7 +120,7 @@ def remove_assigned_group(self, group_urn: Union[str, Urn]) -> "FormPatchBuilder
self._add_patch(
FormInfo.ASPECT_NAME,
"remove",
- path=f"/actors/groups/{self.quote(str(group_urn))}",
+ path=("actors", "groups", group_urn),
value=group_urn,
)
return self
diff --git a/metadata-ingestion/src/datahub/specific/ownership.py b/metadata-ingestion/src/datahub/specific/ownership.py
deleted file mode 100644
index b377a8814f38a..0000000000000
--- a/metadata-ingestion/src/datahub/specific/ownership.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import Generic, List, Optional, TypeVar
-
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
-from datahub.metadata.schema_classes import (
- OwnerClass,
- OwnershipClass,
- OwnershipTypeClass,
-)
-
-_Parent = TypeVar("_Parent", bound=MetadataPatchProposal)
-
-
-class OwnershipPatchHelper(Generic[_Parent]):
- def __init__(self, parent: _Parent) -> None:
- self._parent = parent
- self.aspect_field = OwnershipClass.ASPECT_NAME
-
- def parent(self) -> _Parent:
- return self._parent
-
- def add_owner(self, owner: OwnerClass) -> "OwnershipPatchHelper":
- self._parent._add_patch(
- OwnershipClass.ASPECT_NAME,
- "add",
- path=f"/owners/{owner.owner}/{owner.type}",
- value=owner,
- )
- return self
-
- def remove_owner(
- self, owner: str, owner_type: Optional[OwnershipTypeClass] = None
- ) -> "OwnershipPatchHelper":
- """
- param: owner_type is optional
- """
- self._parent._add_patch(
- OwnershipClass.ASPECT_NAME,
- "remove",
- path=f"/owners/{owner}" + (f"/{owner_type}" if owner_type else ""),
- value=owner,
- )
- return self
-
- def set_owners(self, owners: List[OwnerClass]) -> "OwnershipPatchHelper":
- self._parent._add_patch(
- OwnershipClass.ASPECT_NAME, "add", path="/owners", value=owners
- )
- return self
diff --git a/metadata-ingestion/src/datahub/specific/structured_properties.py b/metadata-ingestion/src/datahub/specific/structured_properties.py
deleted file mode 100644
index 17d896249c474..0000000000000
--- a/metadata-ingestion/src/datahub/specific/structured_properties.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from typing import Generic, List, TypeVar, Union
-
-from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
-from datahub.metadata.schema_classes import StructuredPropertyValueAssignmentClass
-from datahub.utilities.urns.structured_properties_urn import (
- make_structured_property_urn,
-)
-
-_Parent = TypeVar("_Parent", bound=MetadataPatchProposal)
-
-
-class StructuredPropertiesPatchHelper(Generic[_Parent]):
- def __init__(
- self,
- parent: _Parent,
- aspect_name: str = "structuredProperties",
- ) -> None:
- self.aspect_name = aspect_name
- self._parent = parent
- self.aspect_field = "properties"
-
- def parent(self) -> _Parent:
- return self._parent
-
- def set_property(
- self, key: str, value: Union[str, float, List[Union[str, float]]]
- ) -> "StructuredPropertiesPatchHelper":
- self.remove_property(key)
- self.add_property(key, value)
- return self
-
- def remove_property(self, key: str) -> "StructuredPropertiesPatchHelper":
- self._parent._add_patch(
- self.aspect_name,
- "remove",
- path=(self.aspect_field, make_structured_property_urn(key)),
- value={},
- )
- return self
-
- def add_property(
- self, key: str, value: Union[str, float, List[Union[str, float]]]
- ) -> "StructuredPropertiesPatchHelper":
- self._parent._add_patch(
- self.aspect_name,
- "add",
- path=(self.aspect_field, make_structured_property_urn(key)),
- value=StructuredPropertyValueAssignmentClass(
- propertyUrn=make_structured_property_urn(key),
- values=value if isinstance(value, list) else [value],
- ),
- )
- return self
diff --git a/metadata-ingestion/src/datahub/specific/structured_property.py b/metadata-ingestion/src/datahub/specific/structured_property.py
index 50f1f079c2aa7..bcae174ed3c4f 100644
--- a/metadata-ingestion/src/datahub/specific/structured_property.py
+++ b/metadata-ingestion/src/datahub/specific/structured_property.py
@@ -29,7 +29,7 @@ def set_qualified_name(
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path="/qualifiedName",
+ path=("qualifiedName",),
value=qualified_name,
)
return self
@@ -41,7 +41,7 @@ def set_display_name(
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path="/displayName",
+ path=("displayName",),
value=display_name,
)
return self
@@ -53,7 +53,7 @@ def set_value_type(
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path="/valueType",
+ path=("valueType",),
value=value_type,
)
return self
@@ -66,7 +66,7 @@ def set_type_qualifier(
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path="/typeQualifier",
+ path=("typeQualifier",),
value=type_qualifier,
)
return self
@@ -78,7 +78,7 @@ def add_allowed_value(
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path=f"/allowedValues/{str(allowed_value.get('value'))}",
+ path=("allowedValues", str(allowed_value.get("value"))),
value=allowed_value,
)
return self
@@ -87,7 +87,7 @@ def set_cardinality(self, cardinality: str) -> "StructuredPropertyPatchBuilder":
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path="/cardinality",
+ path=("cardinality",),
value=cardinality,
)
return self
@@ -98,7 +98,7 @@ def add_entity_type(
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path=f"/entityTypes/{self.quote(str(entity_type))}",
+ path=("entityTypes", str(entity_type)),
value=entity_type,
)
return self
@@ -110,7 +110,7 @@ def set_description(
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path="/description",
+ path=("description",),
value=description,
)
return self
@@ -119,7 +119,7 @@ def set_immutable(self, immutable: bool) -> "StructuredPropertyPatchBuilder":
self._add_patch(
StructuredPropertyDefinition.ASPECT_NAME,
"add",
- path="/immutable",
+ path=("immutable",),
value=immutable,
)
return self
From 4a898e15945eff826e5e4cf3cce86bb237c8e5ea Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Thu, 2 Jan 2025 17:25:23 -0600
Subject: [PATCH 6/8] feat(auth): user.props authentication (#12259)
---
datahub-frontend/app/auth/AuthModule.java | 11 +++++--
.../app/config/ConfigurationProvider.java | 4 +++
.../upgrade/config/SystemUpdateConfig.java | 3 +-
docs/authentication/guides/add-users.md | 30 +++++++++++++++++++
docs/how/updating-datahub.md | 1 +
.../SampleDataFixtureConfiguration.java | 4 +--
.../SearchLineageFixtureConfiguration.java | 2 +-
.../MCLSpringCommonTestConfiguration.java | 3 +-
.../metadata/context/ActorContext.java | 16 +++++++---
.../metadata/context/OperationContext.java | 25 +++++++++++-----
.../context/TestOperationContexts.java | 3 +-
.../metadata/context/ActorContextTest.java | 25 ++++++++--------
.../context/OperationContextTest.java | 5 ++--
.../AuthenticationConfiguration.java | 3 ++
.../authorization/DataHubAuthorizerTest.java | 3 +-
.../src/main/resources/application.yaml | 3 ++
.../SystemOperationContextFactory.java | 6 ++--
.../IngestDataPlatformInstancesStepTest.java | 2 +-
18 files changed, 112 insertions(+), 37 deletions(-)
diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java
index b95515684f01f..3de0170fc7038 100644
--- a/datahub-frontend/app/auth/AuthModule.java
+++ b/datahub-frontend/app/auth/AuthModule.java
@@ -181,7 +181,12 @@ protected OperationContext provideOperationContext(
final Authentication systemAuthentication,
final ConfigurationProvider configurationProvider) {
ActorContext systemActorContext =
- ActorContext.builder().systemAuth(true).authentication(systemAuthentication).build();
+ ActorContext.builder()
+ .systemAuth(true)
+ .authentication(systemAuthentication)
+ .enforceExistenceEnabled(
+ configurationProvider.getAuthentication().isEnforceExistenceEnabled())
+ .build();
OperationContextConfig systemConfig =
OperationContextConfig.builder()
.viewAuthorizationConfiguration(configurationProvider.getAuthorization().getView())
@@ -197,7 +202,9 @@ protected OperationContext provideOperationContext(
.entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY))
.validationContext(ValidationContext.builder().alternateValidation(false).build())
.retrieverContext(RetrieverContext.EMPTY)
- .build(systemAuthentication);
+ .build(
+ systemAuthentication,
+ configurationProvider.getAuthentication().isEnforceExistenceEnabled());
}
@Provides
diff --git a/datahub-frontend/app/config/ConfigurationProvider.java b/datahub-frontend/app/config/ConfigurationProvider.java
index 97e916769a6c4..9bc28be1bfc89 100644
--- a/datahub-frontend/app/config/ConfigurationProvider.java
+++ b/datahub-frontend/app/config/ConfigurationProvider.java
@@ -1,5 +1,6 @@
package config;
+import com.datahub.authentication.AuthenticationConfiguration;
import com.datahub.authorization.AuthorizationConfiguration;
import com.linkedin.metadata.config.VisualConfiguration;
import com.linkedin.metadata.config.cache.CacheConfiguration;
@@ -30,4 +31,7 @@ public class ConfigurationProvider {
/** Configuration for authorization */
private AuthorizationConfiguration authorization;
+
+ /** Configuration for authentication */
+ private AuthenticationConfiguration authentication;
}
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
index fdd84da6044f7..d0493019a40af 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
@@ -194,7 +194,8 @@ protected OperationContext javaSystemOperationContext(
ValidationContext.builder()
.alternateValidation(
configurationProvider.getFeatureFlags().isAlternateMCPValidation())
- .build());
+ .build(),
+ true);
entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext);
systemGraphRetriever.setSystemOperationContext(systemOperationContext);
diff --git a/docs/authentication/guides/add-users.md b/docs/authentication/guides/add-users.md
index 30da5c9f229f9..dbd44b6308678 100644
--- a/docs/authentication/guides/add-users.md
+++ b/docs/authentication/guides/add-users.md
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
# Onboarding Users to DataHub
New user accounts can be provisioned on DataHub in 3 ways:
@@ -94,6 +97,11 @@ using this mechanism. It is highly recommended that admins change or remove the
## Adding new users using a user.props file
+:::note
+Adding users via the `user.props` file requires either disabling existence checks on GMS with the `METADATA_SERVICE_AUTH_ENFORCE_EXISTENCE_ENABLED=false` environment variable, or using the API to enable the user prior to login.
+The directions below demonstrate using the API to enable the user.
+:::
+
To define a set of username / password combinations that should be allowed to log in to DataHub (in addition to the root 'datahub' user),
create a new file called `user.props` at the file path `${HOME}/.datahub/plugins/frontend/auth/user.props` within the `datahub-frontend-react` container
or pod.
@@ -107,6 +115,28 @@ janesmith:janespassword
johndoe:johnspassword
```
+To enable access for a user whose credentials are defined in `user.props`, set the `status` aspect on that user while authenticated as an Admin user. This can be done using an API call or via the [OpenAPI UI interface](/docs/api/openapi/openapi-usage-guide.md).
+
+<Tabs>
+<TabItem value="openapi" label="OpenAPI" default>
+
+Example of enabling login for the `janesmith` user from the example above. Make sure to replace `<access-token>` with your access token.
+
+```shell
+curl -X 'POST' \
+ 'http://localhost:9002/openapi/v3/entity/corpuser/urn%3Ali%3Acorpuser%3Ajanesmith/status?async=false&systemMetadata=false&createIfEntityNotExists=false&createIfNotExists=true' \
+ -H 'accept: application/json' \
+ -H 'Content-Type: application/json' \
+ -H 'Authorization: Bearer <access-token>' \
+ -d '{
+ "value": {
+ "removed": false
+ }
+}'
+```
+</TabItem>
+</Tabs>
+
Once you've saved the file, simply start the DataHub containers & navigate to `http://localhost:9002/login`
to verify that your new credentials work.
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 19261da23bcf9..07577079d66d1 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -66,6 +66,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
changed to NOT fill out `created` and `lastModified` auditstamps by default
for input and output dataset edges. This should not have any user-observable
impact (time-based lineage viz will still continue working based on observed time), but could break assumptions previously being made by clients.
+- #12158 - Users provisioned with `user.props` will need to be enabled before login in order to be granted access to DataHub.
### Potential Downtime
diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java
index 5e387d7d88292..968f0dd4dd61e 100644
--- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java
+++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java
@@ -137,7 +137,7 @@ protected OperationContext sampleDataOperationContext(
return testOpContext.toBuilder()
.searchContext(SearchContext.builder().indexConvention(indexConvention).build())
- .build(testOpContext.getSessionAuthentication());
+ .build(testOpContext.getSessionAuthentication(), true);
}
@Bean(name = "longTailOperationContext")
@@ -148,7 +148,7 @@ protected OperationContext longTailOperationContext(
return testOpContext.toBuilder()
.searchContext(SearchContext.builder().indexConvention(indexConvention).build())
- .build(testOpContext.getSessionAuthentication());
+ .build(testOpContext.getSessionAuthentication(), true);
}
protected EntityIndexBuilders entityIndexBuildersHelper(OperationContext opContext) {
diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java
index b7b698c73ddac..26443e019829b 100644
--- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java
+++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java
@@ -162,7 +162,7 @@ protected OperationContext searchLineageOperationContext(
return testOpContext.toBuilder()
.searchContext(SearchContext.builder().indexConvention(indexConvention).build())
- .build(testOpContext.getSessionAuthentication());
+ .build(testOpContext.getSessionAuthentication(), true);
}
@Bean(name = "searchLineageESIndexBuilder")
diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java
index f16c9dbd82e74..c92749385145d 100644
--- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java
+++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java
@@ -95,7 +95,8 @@ public OperationContext operationContext(
mock(ServicesRegistryContext.class),
indexConvention,
TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry),
- mock(ValidationContext.class));
+ mock(ValidationContext.class),
+ true);
}
@MockBean SpringStandardPluginConfiguration springStandardPluginConfiguration;
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java
index c08b7fad4dee3..11e38dfb179e0 100644
--- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java
+++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java
@@ -29,23 +29,31 @@
@EqualsAndHashCode
public class ActorContext implements ContextInterface {
- public static ActorContext asSystem(Authentication systemAuthentication) {
- return ActorContext.builder().systemAuth(true).authentication(systemAuthentication).build();
+ public static ActorContext asSystem(
+ Authentication systemAuthentication, boolean enforceExistenceEnabled) {
+ return ActorContext.builder()
+ .systemAuth(true)
+ .authentication(systemAuthentication)
+ .enforceExistenceEnabled(enforceExistenceEnabled)
+ .build();
}
public static ActorContext asSessionRestricted(
Authentication authentication,
Set dataHubPolicySet,
- Collection groupMembership) {
+ Collection groupMembership,
+ boolean enforceExistenceEnabled) {
return ActorContext.builder()
.systemAuth(false)
.authentication(authentication)
.policyInfoSet(dataHubPolicySet)
.groupMembership(groupMembership)
+ .enforceExistenceEnabled(enforceExistenceEnabled)
.build();
}
private final Authentication authentication;
+ private final boolean enforceExistenceEnabled;
@EqualsAndHashCode.Exclude @Builder.Default
private final Set policyInfoSet = Collections.emptySet();
@@ -79,7 +87,7 @@ public boolean isActive(AspectRetriever aspectRetriever) {
Map aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of());
- if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) {
+ if (enforceExistenceEnabled && !aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) {
// user is hard deleted
return false;
}
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java
index 9158129235b39..30255f7ebcac3 100644
--- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java
+++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java
@@ -152,7 +152,8 @@ public static OperationContext asSystem(
@Nullable ServicesRegistryContext servicesRegistryContext,
@Nullable IndexConvention indexConvention,
@Nullable RetrieverContext retrieverContext,
- @Nonnull ValidationContext validationContext) {
+ @Nonnull ValidationContext validationContext,
+ boolean enforceExistenceEnabled) {
return asSystem(
config,
systemAuthentication,
@@ -161,7 +162,8 @@ public static OperationContext asSystem(
indexConvention,
retrieverContext,
validationContext,
- ObjectMapperContext.DEFAULT);
+ ObjectMapperContext.DEFAULT,
+ enforceExistenceEnabled);
}
public static OperationContext asSystem(
@@ -172,10 +174,15 @@ public static OperationContext asSystem(
@Nullable IndexConvention indexConvention,
@Nullable RetrieverContext retrieverContext,
@Nonnull ValidationContext validationContext,
- @Nonnull ObjectMapperContext objectMapperContext) {
+ @Nonnull ObjectMapperContext objectMapperContext,
+ boolean enforceExistenceEnabled) {
ActorContext systemActorContext =
- ActorContext.builder().systemAuth(true).authentication(systemAuthentication).build();
+ ActorContext.builder()
+ .systemAuth(true)
+ .authentication(systemAuthentication)
+ .enforceExistenceEnabled(enforceExistenceEnabled)
+ .build();
OperationContextConfig systemConfig =
config.toBuilder().allowSystemAuthentication(true).build();
SearchContext systemSearchContext =
@@ -457,13 +464,16 @@ public int hashCode() {
public static class OperationContextBuilder {
@Nonnull
- public OperationContext build(@Nonnull Authentication sessionAuthentication) {
- return build(sessionAuthentication, false);
+ public OperationContext build(
+ @Nonnull Authentication sessionAuthentication, boolean enforceExistenceEnabled) {
+ return build(sessionAuthentication, false, enforceExistenceEnabled);
}
@Nonnull
public OperationContext build(
- @Nonnull Authentication sessionAuthentication, boolean skipCache) {
+ @Nonnull Authentication sessionAuthentication,
+ boolean skipCache,
+ boolean enforceExistenceEnabled) {
final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr());
final ActorContext sessionActor =
ActorContext.builder()
@@ -476,6 +486,7 @@ public OperationContext build(
.equals(sessionAuthentication.getActor()))
.policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn))
.groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn))
+ .enforceExistenceEnabled(enforceExistenceEnabled)
.build();
return build(sessionActor, skipCache);
}
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java
index 4abfbb196f067..92d62d42295b9 100644
--- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java
+++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java
@@ -260,7 +260,8 @@ public static OperationContext systemContext(
servicesRegistryContext,
indexConvention,
retrieverContext,
- validationContext);
+ validationContext,
+ true);
if (postConstruct != null) {
postConstruct.accept(operationContext);
diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java
index 15fe2bc277b9b..de6f71408e258 100644
--- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java
+++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java
@@ -87,42 +87,43 @@ public void actorContextId() {
Authentication userAuth = new Authentication(new Actor(ActorType.USER, "USER"), "");
assertEquals(
- ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of()).getCacheKeyComponent(),
- ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of()).getCacheKeyComponent(),
+ ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of(), true).getCacheKeyComponent(),
+ ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of(), true).getCacheKeyComponent(),
"Expected equality across instances");
assertEquals(
- ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of()).getCacheKeyComponent(),
+ ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of(), true).getCacheKeyComponent(),
ActorContext.asSessionRestricted(
- userAuth, Set.of(), Set.of(UrnUtils.getUrn("urn:li:corpGroup:group1")))
+ userAuth, Set.of(), Set.of(UrnUtils.getUrn("urn:li:corpGroup:group1")), true)
.getCacheKeyComponent(),
"Expected no impact to cache context from group membership");
assertEquals(
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of())
+ ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of(), true)
.getCacheKeyComponent(),
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of())
+ ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of(), true)
.getCacheKeyComponent(),
"Expected equality when non-ownership policies are identical");
assertNotEquals(
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC_RESOURCE, POLICY_D), Set.of())
+ ActorContext.asSessionRestricted(
+ userAuth, Set.of(POLICY_ABC_RESOURCE, POLICY_D), Set.of(), true)
.getCacheKeyComponent(),
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of())
+ ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of(), true)
.getCacheKeyComponent(),
"Expected differences with non-identical resource policy");
assertNotEquals(
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER), Set.of())
+ ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER), Set.of(), true)
.getCacheKeyComponent(),
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of())
+ ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of(), true)
.getCacheKeyComponent(),
"Expected differences with ownership policy");
assertNotEquals(
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER_TYPE), Set.of())
+ ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER_TYPE), Set.of(), true)
.getCacheKeyComponent(),
- ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of())
+ ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of(), true)
.getCacheKeyComponent(),
"Expected differences with ownership type policy");
}
diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java
index f77b244d8f2d8..a2575c1c56220 100644
--- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java
+++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java
@@ -27,7 +27,8 @@ public void testSystemPrivilegeEscalation() {
mock(ServicesRegistryContext.class),
null,
TestOperationContexts.emptyActiveUsersRetrieverContext(null),
- mock(ValidationContext.class));
+ mock(ValidationContext.class),
+ true);
OperationContext opContext =
systemOpContext.asSession(RequestContext.TEST, Authorizer.EMPTY, userAuth);
@@ -51,7 +52,7 @@ public void testSystemPrivilegeEscalation() {
systemOpContext.getOperationContextConfig().toBuilder()
.allowSystemAuthentication(false)
.build())
- .build(userAuth);
+ .build(userAuth, true);
assertEquals(
opContextNoSystem.getAuthentication(),
diff --git a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java
index 442263bbd6b43..81cc5e60552a7 100644
--- a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java
+++ b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java
@@ -9,6 +9,9 @@ public class AuthenticationConfiguration {
/** Whether authentication is enabled */
private boolean enabled;
+ /** Whether user existence is enforced */
+ private boolean enforceExistenceEnabled;
+
/**
* List of configurations for {@link com.datahub.plugins.auth.authentication.Authenticator}s to be
* registered
diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
index 4437682bfeb0a..ce9c636be16ac 100644
--- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
+++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
@@ -320,7 +320,8 @@ public void setupTest() throws Exception {
mock(ServicesRegistryContext.class),
mock(IndexConvention.class),
mock(RetrieverContext.class),
- mock(ValidationContext.class));
+ mock(ValidationContext.class),
+ true);
_dataHubAuthorizer =
new DataHubAuthorizer(
diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml
index f6fa4a37fdadb..c029cb4648d01 100644
--- a/metadata-service/configuration/src/main/resources/application.yaml
+++ b/metadata-service/configuration/src/main/resources/application.yaml
@@ -6,6 +6,9 @@ authentication:
# Enable if you want all requests to the Metadata Service to be authenticated.
enabled: ${METADATA_SERVICE_AUTH_ENABLED:true}
+ # Disable to skip the check that the authenticated user still exists (e.g. tokens of hard-deleted users, or users defined only in user.props)
+ enforceExistenceEnabled: ${METADATA_SERVICE_AUTH_ENFORCE_EXISTENCE_ENABLED:true}
+
# Required if enabled is true! A configurable chain of Authenticators
authenticators:
# Required for authenticating requests with DataHub-issued Access Tokens - best not to remove.
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java
index 3e2823591e168..78107cc0ecc90 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java
@@ -79,7 +79,8 @@ protected OperationContext javaSystemOperationContext(
ValidationContext.builder()
.alternateValidation(
configurationProvider.getFeatureFlags().isAlternateMCPValidation())
- .build());
+ .build(),
+ configurationProvider.getAuthentication().isEnforceExistenceEnabled());
entityClientAspectRetriever.setSystemOperationContext(systemOperationContext);
entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext);
@@ -134,7 +135,8 @@ protected OperationContext restliSystemOperationContext(
ValidationContext.builder()
.alternateValidation(
configurationProvider.getFeatureFlags().isAlternateMCPValidation())
- .build());
+ .build(),
+ configurationProvider.getAuthentication().isEnforceExistenceEnabled());
entityClientAspectRetriever.setSystemOperationContext(systemOperationContext);
systemGraphRetriever.setSystemOperationContext(systemOperationContext);
diff --git a/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java b/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java
index cc21819cf4ab5..b47c779f768a9 100644
--- a/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java
+++ b/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java
@@ -87,7 +87,7 @@ public void testExecuteChecksKeySpecForAllUrns() throws Exception {
mockOpContext =
mockOpContext.toBuilder()
.entityRegistryContext(spyEntityRegistryContext)
- .build(mockOpContext.getSessionAuthentication());
+ .build(mockOpContext.getSessionAuthentication(), true);
mockDBWithWorkToDo(migrationsDao, countOfCorpUserEntities, countOfChartEntities);
From 539f521388a9ad86ce9565a819b31d0da8f8d5b2 Mon Sep 17 00:00:00 2001
From: Gabe Lyons
Date: Thu, 2 Jan 2025 15:56:54 -0800
Subject: [PATCH 7/8] docs(delete): Document un-soft-delete commands in
delete-metadata.md (#12251)
---
docs/how/delete-metadata.md | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md
index e36940bf39835..1b1a9952f7898 100644
--- a/docs/how/delete-metadata.md
+++ b/docs/how/delete-metadata.md
@@ -97,6 +97,21 @@ The start and end time fields filter on the `timestampMillis` field of the times
- `ddddddddd` (e.g. `1684384045`): a unix timestamp
- `min`, `max`, `now`: special keywords
+#### Undo-ing soft deletion of entities
+
+You can restore soft-deleted entities using the `undo-by-filter` command. This reverts the effect of a soft delete.
+
+```shell
+# Restore (un-soft-delete) a single soft-deleted entity
+datahub delete undo-by-filter --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)"
+
+# Restore all soft-deleted entities from a specific platform
+datahub delete undo-by-filter --platform snowflake
+
+# You can adjust the batch size (default 3000, max 10000) for better performance
+datahub delete undo-by-filter --platform snowflake --batch-size 5000
+```
+
## Delete CLI Examples
:::note
From 1190dd95b2cedadbd2a5e8295d47497333b2288b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?=
Date: Fri, 3 Jan 2025 09:15:53 +0100
Subject: [PATCH 8/8] fix(tableau): fixes some aspects being emitted multiple
times (#12258)
---
.../ingestion/source/tableau/tableau.py | 3 ++
.../source/tableau/tableau_common.py | 18 ++++++++
.../tests/unit/test_tableau_source.py | 46 ++++++++++++++++++-
3 files changed, 66 insertions(+), 1 deletion(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
index d47e10c9eb5c6..008216fea8950 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
@@ -109,6 +109,7 @@
make_filter,
make_fine_grained_lineage_class,
make_upstream_class,
+ optimize_query_filter,
published_datasource_graphql_query,
query_metadata_cursor_based_pagination,
sheet_graphql_query,
@@ -1363,6 +1364,8 @@ def get_connection_objects(
query_filter: dict = {},
page_size_override: Optional[int] = None,
) -> Iterable[dict]:
+ query_filter = optimize_query_filter(query_filter)
+
# Calls the get_connection_object_page function to get the objects,
# and automatically handles pagination.
page_size = page_size_override or self.config.page_size
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py
index 61b56c4bee5bd..8f9d81eb9a18c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py
@@ -1,3 +1,4 @@
+import copy
import html
import json
import logging
@@ -35,6 +36,7 @@
UpstreamClass,
)
from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
+from datahub.utilities.ordered_set import OrderedSet
logger = logging.getLogger(__name__)
@@ -1000,3 +1002,19 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
]
return filter_pages
+
+
+def optimize_query_filter(query_filter: dict) -> dict:
+ """
+ Duplicates in the filter cause duplicates in the result,
+ leading to entities/aspects being emitted multiple times unnecessarily
+ """
+ optimized_query = copy.deepcopy(query_filter)
+
+ if query_filter.get(c.ID_WITH_IN):
+ optimized_query[c.ID_WITH_IN] = list(OrderedSet(query_filter[c.ID_WITH_IN]))
+ if query_filter.get(c.PROJECT_NAME_WITH_IN):
+ optimized_query[c.PROJECT_NAME_WITH_IN] = list(
+ OrderedSet(query_filter[c.PROJECT_NAME_WITH_IN])
+ )
+ return optimized_query
diff --git a/metadata-ingestion/tests/unit/test_tableau_source.py b/metadata-ingestion/tests/unit/test_tableau_source.py
index 44e59decaecbd..227519fdb464a 100644
--- a/metadata-ingestion/tests/unit/test_tableau_source.py
+++ b/metadata-ingestion/tests/unit/test_tableau_source.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict
+from typing import Any, Dict, List
import pytest
@@ -7,6 +7,7 @@
from datahub.ingestion.source.tableau.tableau_common import (
get_filter_pages,
make_filter,
+ optimize_query_filter,
tableau_field_to_schema_field,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
@@ -203,3 +204,46 @@ def test_get_filter_pages_id_filter_splits_into_multiple_filters():
{c.ID_WITH_IN: filter_dict[c.ID_WITH_IN][i : i + page_size]}
for i in range(0, num_ids, page_size)
]
+
+
+def test_optimize_query_filter_removes_duplicates():
+ query_filter = {
+ c.ID_WITH_IN: ["id1", "id2", "id1"],
+ c.PROJECT_NAME_WITH_IN: ["project1", "project2", "project1"],
+ }
+ result = optimize_query_filter(query_filter)
+ assert len(result) == 2
+ assert result[c.ID_WITH_IN] == ["id1", "id2"]
+ assert result[c.PROJECT_NAME_WITH_IN] == ["project1", "project2"]
+
+
+def test_optimize_query_filter_handles_empty_lists():
+ query_filter: Dict[str, List[str]] = {c.ID_WITH_IN: [], c.PROJECT_NAME_WITH_IN: []}
+ result = optimize_query_filter(query_filter)
+ assert len(result) == 2
+ assert result[c.ID_WITH_IN] == []
+ assert result[c.PROJECT_NAME_WITH_IN] == []
+
+
+def test_optimize_query_filter_handles_missing_keys():
+ query_filter: Dict[str, List[str]] = {}
+ result = optimize_query_filter(query_filter)
+ assert result == {}
+
+
+def test_optimize_query_filter_handles_other_keys():
+ query_filter = {"any_other_key": ["id1", "id2", "id1"]}
+ result = optimize_query_filter(query_filter)
+ assert len(result) == 1
+ assert result["any_other_key"] == ["id1", "id2", "id1"]
+
+
+def test_optimize_query_filter_handles_no_duplicates():
+ query_filter = {
+ c.ID_WITH_IN: ["id1", "id2"],
+ c.PROJECT_NAME_WITH_IN: ["project1", "project2"],
+ }
+ result = optimize_query_filter(query_filter)
+ assert len(result) == 2
+ assert result[c.ID_WITH_IN] == ["id1", "id2"]
+ assert result[c.PROJECT_NAME_WITH_IN] == ["project1", "project2"]