From 20eed21e27526e752765963fc6e03d5244f331d0 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Tue, 29 Oct 2024 10:50:12 +0530 Subject: [PATCH 01/11] doc: add missed breaking change note (#11725) --- docs/how/updating-datahub.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index db97f7aa81d7bb..989ebc6be4e734 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -77,6 +77,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #11313 - `datahub get` will no longer return a key aspect for entities that don't exist. - #11369 - The default datahub-rest sink mode has been changed to `ASYNC_BATCH`. This requires a server with version 0.14.0+. - #11214 Container properties aspect will produce an additional field that will require a corresponding upgrade of server. Otherwise server can reject the aspects. +- #10190 - `extractor_config.set_system_metadata` of `datahub` source has been moved to be a top level config in the recipe under `flags.set_system_metadata` ### Potential Downtime From a11ac8d104649c953037d4b85afac67d6828ad18 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 29 Oct 2024 01:13:34 -0700 Subject: [PATCH 02/11] feat(ingest/fivetran): avoid duplicate table lineage entries (#11712) --- .../ingestion/source/fivetran/config.py | 4 +- .../ingestion/source/fivetran/fivetran.py | 49 ++++-- .../source/fivetran/fivetran_log_api.py | 9 +- .../source/fivetran/fivetran_query.py | 85 ++++++---- ...nowflake_empty_connection_user_golden.json | 140 ++++++++-------- .../fivetran/fivetran_snowflake_golden.json | 156 +++++++++--------- 6 files changed, 249 insertions(+), 194 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 1e15f6b395ca58..e40e284d6e0a42 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -144,8 +144,8 @@ class FivetranSourceReport(StaleEntityRemovalSourceReport): def report_connectors_scanned(self, count: int = 1) -> None: self.connectors_scanned += count - def report_connectors_dropped(self, model: str) -> None: - self.filtered_connectors.append(model) + def report_connectors_dropped(self, connector: str) -> None: + self.filtered_connectors.append(connector) class PlatformDetail(ConfigModel): diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 907bfa3a167aa4..21c967e162891c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -76,7 +76,7 @@ def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) - def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: + def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]: input_dataset_urn_list: List[DatasetUrn] = [] output_dataset_urn_list: List[DatasetUrn] = [] fine_grained_lineage: List[FineGrainedLineage] = [] @@ -93,8 +93,11 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: connector.connector_type ] else: - logger.info( - f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." + self.report.info( + title="Guessing source platform for lineage", + message="We encountered a connector type that we don't fully support yet. 
" + "We will attempt to guess the platform based on the connector type.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})", ) source_details.platform = connector.connector_type @@ -170,7 +173,19 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: datajob.inlets.extend(input_dataset_urn_list) datajob.outlets.extend(output_dataset_urn_list) datajob.fine_grained_lineages.extend(fine_grained_lineage) - return None + + return dict( + **{ + f"source.{k}": str(v) + for k, v in source_details.dict().items() + if v is not None + }, + **{ + f"destination.{k}": str(v) + for k, v in destination_details.dict().items() + if v is not None + }, + ) def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow: return DataFlow( @@ -196,23 +211,23 @@ def _generate_datajob_from_connector(self, connector: Connector) -> DataJob: owners={owner_email} if owner_email else set(), ) - job_property_bag: Dict[str, str] = {} - allowed_connection_keys = [ - Constant.PAUSED, - Constant.SYNC_FREQUENCY, - Constant.DESTINATION_ID, - ] - for key in allowed_connection_keys: - if hasattr(connector, key) and getattr(connector, key) is not None: - job_property_bag[key] = repr(getattr(connector, key)) - datajob.properties = job_property_bag - # Map connector source and destination table with dataset entity # Also extend the fine grained lineage of column if include_column_lineage is True - self._extend_lineage(connector=connector, datajob=datajob) - + lineage_properties = self._extend_lineage(connector=connector, datajob=datajob) # TODO: Add fine grained lineages of dataset after FineGrainedLineageDownstreamType.DATASET enabled + connector_properties: Dict[str, str] = { + "connector_id": connector.connector_id, + "connector_type": connector.connector_type, + "paused": str(connector.paused), + "sync_frequency": str(connector.sync_frequency), + "destination_id": connector.destination_id, + 
} + datajob.properties = { + **connector_properties, + **lineage_properties, + } + return datajob def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance: diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 79f9d513bfb7c4..529002270cdd9c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -259,20 +259,23 @@ def get_allowed_connectors_list( logger.info("Fetching connector list") connector_list = self._query(self.fivetran_log_query.get_connectors_query()) for connector in connector_list: + connector_id = connector[Constant.CONNECTOR_ID] connector_name = connector[Constant.CONNECTOR_NAME] if not connector_patterns.allowed(connector_name): - report.report_connectors_dropped(connector_name) + report.report_connectors_dropped( + f"{connector_name} (connector_id: {connector_id}, dropped due to filter pattern)" + ) continue if not destination_patterns.allowed( destination_id := connector[Constant.DESTINATION_ID] ): report.report_connectors_dropped( - f"{connector_name} (destination_id: {destination_id})" + f"{connector_name} (connector_id: {connector_id}, destination_id: {destination_id})" ) continue connectors.append( Connector( - connector_id=connector[Constant.CONNECTOR_ID], + connector_id=connector_id, connector_name=connector_name, connector_type=connector[Constant.CONNECTOR_TYPE_ID], paused=connector[Constant.PAUSED], diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index 39c4d7712b4fcc..65378928b244dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -1,8 +1,8 @@ 
from typing import List # Safeguards to prevent fetching massive amounts of data. -MAX_TABLE_LINEAGE_PER_CONNECTOR = 50 -MAX_COLUMN_LINEAGE_PER_CONNECTOR = 500 +MAX_TABLE_LINEAGE_PER_CONNECTOR = 120 +MAX_COLUMN_LINEAGE_PER_CONNECTOR = 1000 MAX_JOBS_PER_CONNECTOR = 500 @@ -33,6 +33,7 @@ def get_connectors_query(self) -> str: FROM {self.db_clause}connector WHERE _fivetran_deleted = FALSE +QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY _fivetran_synced DESC) = 1 """ def get_users_query(self) -> str: @@ -86,21 +87,29 @@ def get_table_lineage_query(self, connector_ids: List[str]) -> str: return f"""\ SELECT - stm.connector_id as connector_id, - stm.id as source_table_id, - stm.name as source_table_name, - ssm.name as source_schema_name, - dtm.id as destination_table_id, - dtm.name as destination_table_name, - dsm.name as destination_schema_name -FROM {self.db_clause}table_lineage as tl -JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id -JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id -JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id -JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id -WHERE stm.connector_id IN ({formatted_connector_ids}) -QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR} -ORDER BY stm.connector_id, tl.created_at DESC + * +FROM ( + SELECT + stm.connector_id as connector_id, + stm.id as source_table_id, + stm.name as source_table_name, + ssm.name as source_schema_name, + dtm.id as destination_table_id, + dtm.name as destination_table_name, + dsm.name as destination_schema_name, + tl.created_at as created_at, + ROW_NUMBER() OVER (PARTITION BY stm.connector_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn + FROM {self.db_clause}table_lineage as tl + JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = 
stm.id + JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id + JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id + JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id + WHERE stm.connector_id IN ({formatted_connector_ids}) +) +-- Ensure that we only get back one entry per source and destination pair. +WHERE table_combo_rn = 1 +QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR} +ORDER BY connector_id, created_at DESC """ def get_column_lineage_query(self, connector_ids: List[str]) -> str: @@ -109,19 +118,31 @@ def get_column_lineage_query(self, connector_ids: List[str]) -> str: return f"""\ SELECT - scm.table_id as source_table_id, - dcm.table_id as destination_table_id, - scm.name as source_column_name, - dcm.name as destination_column_name -FROM {self.db_clause}column_lineage as cl -JOIN {self.db_clause}source_column_metadata as scm - ON cl.source_column_id = scm.id -JOIN {self.db_clause}destination_column_metadata as dcm - ON cl.destination_column_id = dcm.id --- Only joining source_table_metadata to get the connector_id. 
-JOIN {self.db_clause}source_table_metadata as stm - ON scm.table_id = stm.id -WHERE stm.connector_id IN ({formatted_connector_ids}) -QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR} -ORDER BY stm.connector_id, cl.created_at DESC + source_table_id, + destination_table_id, + source_column_name, + destination_column_name +FROM ( + SELECT + stm.connector_id as connector_id, + scm.table_id as source_table_id, + dcm.table_id as destination_table_id, + scm.name as source_column_name, + dcm.name as destination_column_name, + cl.created_at as created_at, + ROW_NUMBER() OVER (PARTITION BY stm.connector_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn + FROM {self.db_clause}column_lineage as cl + JOIN {self.db_clause}source_column_metadata as scm + ON cl.source_column_id = scm.id + JOIN {self.db_clause}destination_column_metadata as dcm + ON cl.destination_column_id = dcm.id + -- Only joining source_table_metadata to get the connector_id. + JOIN {self.db_clause}source_table_metadata as stm + ON scm.table_id = stm.id + WHERE stm.connector_id IN ({formatted_connector_ids}) +) +-- Ensure that we only get back one entry per (connector, source column, destination column) pair. 
+WHERE column_combo_rn = 1 +QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR} +ORDER BY connector_id, created_at DESC """ diff --git a/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_empty_connection_user_golden.json b/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_empty_connection_user_golden.json index 29b186978a76a5..0f8f4cc64e7ca4 100644 --- a/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_empty_connection_user_golden.json +++ b/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_empty_connection_user_golden.json @@ -17,6 +17,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", @@ -62,9 +78,17 @@ "aspect": { "json": { "customProperties": { + "connector_id": "calendar_elected", + "connector_type": "postgres", "paused": "False", "sync_frequency": "1440", - "destination_id": "'interval_unconstitutional'" + "destination_id": "interval_unconstitutional", + "source.platform": "postgres", + "source.env": "DEV", + "source.database": "postgres_db", + "destination.platform": "snowflake", + "destination.env": "PROD", + "destination.database": "test_database" }, "name": "postgres", "type": { @@ -79,6 +103,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, 
+ "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", @@ -150,13 +190,18 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "ownership", "aspect": { "json": { - "removed": false + "owners": [], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } } }, "systemMetadata": { @@ -166,13 +211,13 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "globalTags", "aspect": { "json": { - "removed": false + "tags": [] } }, "systemMetadata": { @@ -182,18 +227,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", "changeType": "UPSERT", - "aspectName": "ownership", + "aspectName": "status", "aspect": { "json": { - "owners": [], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:fivetran" - } + "removed": false } }, "systemMetadata": { @@ -203,13 +243,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", "changeType": 
"UPSERT", - "aspectName": "globalTags", + "aspectName": "status", "aspect": { "json": { - "tags": [] + "removed": false } }, "systemMetadata": { @@ -304,8 +344,8 @@ "json": { "timestampMillis": 1695191853000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -325,8 +365,8 @@ "json": { "timestampMillis": 1695191885000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -427,8 +467,8 @@ "json": { "timestampMillis": 1696343730000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -448,8 +488,8 @@ "json": { "timestampMillis": 1696343732000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -550,8 +590,8 @@ "json": { "timestampMillis": 1696343755000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -571,8 +611,8 @@ "json": { "timestampMillis": 1696343790000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -587,38 +627,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "powerbi-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "powerbi-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", diff --git a/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_golden.json b/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_golden.json index 0cd3bb83f90f52..22933f3483e76d 100644 --- a/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_golden.json +++ b/metadata-ingestion/tests/integration/fivetran/fivetran_snowflake_golden.json @@ -17,6 +17,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", @@ -62,9 +78,17 @@ "aspect": { "json": { "customProperties": { + "connector_id": "calendar_elected", + "connector_type": "postgres", "paused": "False", "sync_frequency": "1440", - "destination_id": "'interval_unconstitutional'" + "destination_id": "interval_unconstitutional", + "source.platform": "postgres", + "source.env": "DEV", + "source.database": "postgres_db", + "destination.platform": "snowflake", + "destination.env": "PROD", + "destination.database": "test_database" }, "name": "postgres", "type": { @@ -79,6 +103,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + 
"changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", @@ -150,13 +190,26 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "ownership", "aspect": { "json": { - "removed": false + "owners": [ + { + "owner": "urn:li:corpuser:abc.xyz@email.com", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } } }, "systemMetadata": { @@ -166,13 +219,13 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "globalTags", "aspect": { "json": { - "removed": false + "tags": [] } }, "systemMetadata": { @@ -182,26 +235,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", "changeType": "UPSERT", - "aspectName": "ownership", + "aspectName": "status", "aspect": { "json": { - "owners": [ - { - "owner": "urn:li:corpuser:abc.xyz@email.com", - "type": "DEVELOPER", - "source": { - "type": "SERVICE" - } - } - ], - "ownerTypes": {}, - 
"lastModified": { - "time": 0, - "actor": "urn:li:corpuser:fivetran" - } + "removed": false } }, "systemMetadata": { @@ -211,13 +251,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", "changeType": "UPSERT", - "aspectName": "globalTags", + "aspectName": "status", "aspect": { "json": { - "tags": [] + "removed": false } }, "systemMetadata": { @@ -312,8 +352,8 @@ "json": { "timestampMillis": 1695191853000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -333,8 +373,8 @@ "json": { "timestampMillis": 1695191885000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -435,8 +475,8 @@ "json": { "timestampMillis": 1696343730000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -456,8 +496,8 @@ "json": { "timestampMillis": 1696343732000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -558,8 +598,8 @@ "json": { "timestampMillis": 1696343755000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -579,8 +619,8 @@ "json": { "timestampMillis": 1696343790000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -595,38 +635,6 @@ 
"lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "powerbi-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "powerbi-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", From bb63cbd9db1b791c220ef57ce3d5c839c5e051d3 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Tue, 29 Oct 2024 13:48:08 +0530 Subject: [PATCH 03/11] fix(ingestion/bigquery): Add lineage extraction for BigQuery with GCS source (#11442) --- .../recipes/bigquery_to_datahub.dhub.yaml | 17 ++ .../ingestion/source/bigquery_v2/bigquery.py | 10 + .../source/bigquery_v2/bigquery_config.py | 40 +++ .../source/bigquery_v2/bigquery_report.py | 3 + .../source/bigquery_v2/bigquery_schema.py | 2 + .../source/bigquery_v2/bigquery_schema_gen.py | 14 ++ .../ingestion/source/bigquery_v2/lineage.py | 196 +++++++++++++++ .../unit/bigquery/test_bigquery_lineage.py | 227 +++++++++++++++++- 8 files changed, 507 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/examples/recipes/bigquery_to_datahub.dhub.yaml b/metadata-ingestion/examples/recipes/bigquery_to_datahub.dhub.yaml index 86f4898d9d5026..4210d0599a2157 100644 --- a/metadata-ingestion/examples/recipes/bigquery_to_datahub.dhub.yaml +++ b/metadata-ingestion/examples/recipes/bigquery_to_datahub.dhub.yaml @@ -41,6 +41,23 @@ 
source: # deny: # - "*.*.*" #storage_project_id: project-id-1234567 + ## Lineage with GCS Source + # include_column_lineage_with_gcs: true/false + # gcs_lineage_config: + # path_specs: + # - include: "gs://my-bucket/foo/tests/bar.avro" + # - include: "gs://my-bucket/foo/tests/*.*" + # - include: "gs://my-bucket/foo/tests/{table}/*.avro" + # - include: "gs://my-bucket/foo/tests/{table}/*/*.avro" + # - include: "gs://my-bucket/foo/tests/{table}/*.*" + # - include: "gs://my-bucket/{dept}/tests/{table}/*.avro" + # - include: "gs://my-bucket/{dept}/tests/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.avro" + # - include: "gs://my-bucket/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.avro" + # - include: "gs://my-bucket/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.*" + # - include: "gs://my-bucket/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.*" + # - include: "gs://my-bucket/*/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.*" + # strip_urls: false + ## see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation sink: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index a1bbb9dd6b0b90..76c2fbf48ccaba 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -309,6 +309,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.bq_schema_extractor.table_refs, ) + # Lineage BQ to GCS + if ( + self.config.include_table_lineage + and self.bq_schema_extractor.external_tables + ): + for dataset_urn, table in self.bq_schema_extractor.external_tables.items(): + yield from self.lineage_extractor.gen_lineage_workunits_for_external_table( + dataset_urn, table.ddl, graph=self.ctx.graph + ) + def get_report(self) -> 
BigQueryV2Report: return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 331b583423093b..ad293c702a5205 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -21,6 +21,7 @@ from datahub.ingestion.glossary.classification_mixin import ( ClassificationSourceConfigMixin, ) +from datahub.ingestion.source.data_lake_common.path_spec import PathSpec from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulLineageConfigMixin, @@ -206,6 +207,39 @@ def get_sql_alchemy_url(self) -> str: return "bigquery://" +class GcsLineageProviderConfig(ConfigModel): + """ + Any source that produces gcs lineage from/to Datasets should inherit this class. + """ + + path_specs: List[PathSpec] = Field( + default=[], + description="List of PathSpec. See below the details about PathSpec", + ) + + strip_urls: bool = Field( + default=True, + description="Strip filename from gcs url. It only applies if path_specs are not specified.", + ) + + ignore_non_path_spec_path: bool = Field( + default=False, + description="Ignore paths that are not match in path_specs. It only applies if path_specs are specified.", + ) + + +class GcsDatasetLineageProviderConfigBase(ConfigModel): + """ + Any source that produces gcs lineage from/to Datasets should inherit this class. + This is needeed to group all lineage related configs under `gcs_lineage_config` config property. 
+ """ + + gcs_lineage_config: GcsLineageProviderConfig = Field( + default=GcsLineageProviderConfig(), + description="Common config for gcs lineage generation", + ) + + class BigQueryFilterConfig(SQLFilterConfig): project_ids: List[str] = Field( default_factory=list, @@ -328,6 +362,7 @@ class BigQueryIdentifierConfig( class BigQueryV2Config( + GcsDatasetLineageProviderConfigBase, BigQueryConnectionConfig, BigQueryBaseConfig, BigQueryFilterConfig, @@ -473,6 +508,11 @@ def have_table_data_read_permission(self) -> bool: description="Option to enable/disable lineage generation. Is enabled by default.", ) + include_column_lineage_with_gcs: bool = Field( + default=True, + description="When enabled, column-level lineage will be extracted from the gcs.", + ) + max_query_duration: timedelta = Field( default=timedelta(minutes=15), description="Correction to pad start_time and end_time with. For handling the case where the read happens within our time range but the query completion event is delayed and happens after the configured end time.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 88c6eb1885f3bd..7e8b2931282fff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -157,6 +157,8 @@ class BigQueryV2Report( num_filtered_query_events: int = 0 num_usage_query_hash_collisions: int = 0 num_operational_stats_workunits_emitted: int = 0 + num_lineage_dropped_gcs_path: int = 0 + snapshots_scanned: int = 0 # view lineage @@ -185,6 +187,7 @@ class BigQueryV2Report( usage_start_time: Optional[datetime] = None usage_end_time: Optional[datetime] = None stateful_usage_ingestion_enabled: bool = False + num_skipped_external_table_lineage: int = 0 queries_extractor: Optional[BigQueryQueriesExtractorReport] = None diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 6361d5f266cb74..4f18c22c108a6a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -121,6 +121,7 @@ class BigqueryTable(BaseTable): columns_ignore_from_profiling: List[str] = field(default_factory=list) external: bool = False constraints: List[BigqueryTableConstraint] = field(default_factory=list) + table_type: Optional[str] = None @dataclass @@ -377,6 +378,7 @@ def _make_bigquery_table( return BigqueryTable( name=table.table_name, created=table.created, + table_type=table.table_type, last_altered=( datetime.fromtimestamp( table.get("last_altered") / 1000, tz=timezone.utc diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index dc53e2f74959e2..907e5c12e99a1c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -204,6 +204,11 @@ def __init__( self.view_definitions: FileBackedDict[str] = FileBackedDict() # Maps snapshot ref -> Snapshot self.snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot] = FileBackedDict() + # Add External BQ table + self.external_tables: Dict[str, BigqueryTable] = defaultdict() + self.bq_external_table_pattern = ( + r".*create\s+external\s+table\s+`?(?:project_id\.)?.*`?" 
+ ) bq_project = ( self.config.project_on_behalf @@ -957,6 +962,15 @@ def gen_dataset_workunits( project_id, dataset_name, table.name ) + # Added for bigquery to gcs lineage extraction + if ( + isinstance(table, BigqueryTable) + and table.table_type == "EXTERNAL" + and table.ddl is not None + and re.search(self.bq_external_table_pattern, table.ddl, re.IGNORECASE) + ): + self.external_tables[dataset_urn] = table + status = Status(removed=False) yield MetadataChangeProposalWrapper( entityUrn=dataset_urn, aspect=status diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index c9d0738bea7dca..b542992a7924a0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -1,6 +1,8 @@ import collections import itertools +import json import logging +import re from dataclasses import dataclass from datetime import datetime, timezone from typing import ( @@ -15,17 +17,20 @@ Tuple, Union, ) +from urllib.parse import urlparse import humanfriendly import sqlglot from google.cloud.datacatalog import lineage_v1 from google.cloud.logging_v2.client import Client as GCPLoggingClient +from datahub.api.entities.dataset.dataset import Dataset from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( AuditLogEntry, BigQueryAuditMetadata, @@ -51,16 +56,19 @@ BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, bigquery_audit_metadata_query_template_lineage, ) +from datahub.ingestion.source.gcs import gcs_utils from 
datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, ) from datahub.ingestion.source_report.ingestion_stage import LINEAGE_EXTRACTION +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata from datahub.metadata.schema_classes import ( AuditStampClass, DatasetLineageTypeClass, FineGrainedLineageClass, FineGrainedLineageDownstreamTypeClass, FineGrainedLineageUpstreamTypeClass, + SchemaMetadataClass, UpstreamClass, UpstreamLineageClass, ) @@ -247,6 +255,7 @@ def __init__( format_queries=True, ) self.report.sql_aggregator = self.aggregator.report + self.gcs_uris_regex = re.compile(r"uris=\[([^\]]+)\]") def get_time_window(self) -> Tuple[datetime, datetime]: if self.redundant_run_skip_handler: @@ -918,3 +927,190 @@ def test_capability(self, project_id: str) -> None: def report_status(self, step: str, status: bool) -> None: if self.redundant_run_skip_handler: self.redundant_run_skip_handler.report_current_run_status(step, status) + + def gen_lineage_workunits_for_external_table( + self, + dataset_urn: str, + ddl: Optional[str], + graph: Optional[DataHubGraph] = None, + ) -> Iterable[MetadataWorkUnit]: + + if not ddl: + return + + # Expect URIs in `uris=[""]` format + uris_match = self.gcs_uris_regex.search(ddl) + if not uris_match: + self.report.num_skipped_external_table_lineage += 1 + logger.warning(f"Unable to parse GCS URI from the provided DDL {ddl}.") + return + + uris_str = uris_match.group(1) + try: + source_uris = json.loads(f"[{uris_str}]") + except json.JSONDecodeError as e: + self.report.num_skipped_external_table_lineage += 1 + logger.warning( + f"Json load failed on loading source uri with error: {e}. 
The field value was: {uris_str}" + ) + return + + lineage_info = self.get_lineage_for_external_table( + dataset_urn=dataset_urn, + source_uris=source_uris, + graph=graph, + ) + + if lineage_info: + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=lineage_info + ).as_workunit() + + def get_lineage_for_external_table( + self, + dataset_urn: str, + source_uris: List[str], + graph: Optional[DataHubGraph] = None, + ) -> Optional[UpstreamLineageClass]: + + upstreams_list: List[UpstreamClass] = [] + fine_grained_lineages: List[FineGrainedLineageClass] = [] + gcs_urns: Set[str] = set() + + for source_uri in source_uris: + # Check that storage_location have the gs:// prefix. + # Right now we are only supporting GCS lineage + if not gcs_utils.is_gcs_uri(source_uri): + continue + gcs_path = self._get_gcs_path(source_uri) + + if gcs_path is None: + continue + + path = gcs_utils.strip_gcs_prefix(gcs_path) + urn = mce_builder.make_dataset_urn_with_platform_instance( + platform="gcs", + name=path, + env=self.config.env, + platform_instance=( + self.config.platform_instance + if self.config.platform_instance is not None + else None + ), + ) + gcs_urns.add(urn) + + upstreams_list.extend( + [ + UpstreamClass( + dataset=source_dataset_urn, + type=DatasetLineageTypeClass.COPY, + ) + for source_dataset_urn in gcs_urns + ] + ) + + if not upstreams_list: + return None + + if self.config.include_column_lineage_with_gcs: + assert graph + schema_metadata: Optional[SchemaMetadataClass] = graph.get_schema_metadata( + dataset_urn + ) + for gcs_dataset_urn in gcs_urns: + schema_metadata_for_gcs: Optional[ + SchemaMetadataClass + ] = graph.get_schema_metadata(gcs_dataset_urn) + if schema_metadata and schema_metadata_for_gcs: + fine_grained_lineage = self.get_fine_grained_lineages_with_gcs( + dataset_urn, + gcs_dataset_urn, + schema_metadata, + schema_metadata_for_gcs, + ) + if not fine_grained_lineage: + logger.warning( + f"Failed to retrieve fine-grained lineage for dataset 
{dataset_urn} and GCS {gcs_dataset_urn}. " + f"Check schema metadata: {schema_metadata} and GCS metadata: {schema_metadata_for_gcs}." + ) + continue + + fine_grained_lineages.extend(fine_grained_lineage) + + upstream_lineage = UpstreamLineageClass( + upstreams=upstreams_list, fineGrainedLineages=fine_grained_lineages or None + ) + return upstream_lineage + + def _get_gcs_path(self, path: str) -> Optional[str]: + if self.config.gcs_lineage_config: + for path_spec in self.config.gcs_lineage_config.path_specs: + if not path_spec.allowed(path): + logger.debug( + f"Skipping gcs path {path} as it does not match any path spec." + ) + self.report.num_lineage_dropped_gcs_path += 1 + continue + + _, table_path = path_spec.extract_table_name_and_path(path) + return table_path + + if ( + self.config.gcs_lineage_config.ignore_non_path_spec_path + and len(self.config.gcs_lineage_config.path_specs) > 0 + ): + self.report.num_lineage_dropped_gcs_path += 1 + logger.debug( + f"Skipping gcs path {path} as it does not match any path spec." 
+ ) + return None + + if self.config.gcs_lineage_config.strip_urls: + if "/" in urlparse(path).path: + return str(path.rsplit("/", 1)[0]) + + return path + + def get_fine_grained_lineages_with_gcs( + self, + dataset_urn: str, + gcs_dataset_urn: str, + schema_metadata: SchemaMetadata, + schema_metadata_for_gcs: SchemaMetadata, + ) -> Optional[List[FineGrainedLineageClass]]: + def simplify_field_path(field_path): + return Dataset._simplify_field_path(field_path) + + if schema_metadata and schema_metadata_for_gcs: + fine_grained_lineages: List[FineGrainedLineageClass] = [] + for field in schema_metadata.fields: + field_path_v1 = simplify_field_path(field.fieldPath) + matching_gcs_field = next( + ( + f + for f in schema_metadata_for_gcs.fields + if simplify_field_path(f.fieldPath) == field_path_v1 + ), + None, + ) + if matching_gcs_field: + fine_grained_lineages.append( + FineGrainedLineageClass( + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + mce_builder.make_schema_field_urn( + dataset_urn, field_path_v1 + ) + ], + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + mce_builder.make_schema_field_urn( + gcs_dataset_urn, + simplify_field_path(matching_gcs_field.fieldPath), + ) + ], + ) + ) + return fine_grained_lineages + return None diff --git a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py index 7456f2fd1d91c2..415977b0f8467b 100644 --- a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py @@ -1,19 +1,25 @@ import datetime -from typing import Dict, List, Set +from typing import Dict, List, Optional, Set import pytest +import datahub.metadata.schema_classes as models +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigQueryTableRef, QueryEvent, ) -from 
datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_config import ( + BigQueryV2Config, + GcsLineageProviderConfig, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.common import BigQueryIdentifierBuilder from datahub.ingestion.source.bigquery_v2.lineage import ( BigqueryLineageExtractor, LineageEdge, ) +from datahub.ingestion.source.data_lake_common.path_spec import PathSpec from datahub.sql_parsing.schema_resolver import SchemaResolver @@ -135,3 +141,220 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: upstream_lineage.fineGrainedLineages and len(upstream_lineage.fineGrainedLineages) == 2 ) + + +def test_lineage_for_external_bq_table(mock_datahub_graph_instance): + + pipeline_context = PipelineContext(run_id="bq_gcs_lineage") + pipeline_context.graph = mock_datahub_graph_instance + + def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: + return models.SchemaMetadataClass( + schemaName="sample_schema", + platform="urn:li:dataPlatform:gcs", # important <- platform must be an urn + version=0, + hash="", + platformSchema=models.OtherSchemaClass( + rawSchema="__insert raw schema here__" + ), + fields=[ + models.SchemaFieldClass( + fieldPath="age", + type=models.SchemaFieldDataTypeClass(type=models.NumberTypeClass()), + nativeDataType="int", + ), + models.SchemaFieldClass( + fieldPath="firstname", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + ), + models.SchemaFieldClass( + fieldPath="lastname", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + ), + ], + ) + + pipeline_context.graph.get_schema_metadata = fake_schema_metadata # type: ignore + path_specs: List[PathSpec] = [ + PathSpec(include="gs://bigquery_data/{table}/*.parquet"), + 
PathSpec(include="gs://bigquery_data/customer3/{table}/*.parquet"), + ] + gcs_lineage_config: GcsLineageProviderConfig = GcsLineageProviderConfig( + path_specs=path_specs + ) + + config = BigQueryV2Config( + include_table_lineage=True, + include_column_lineage_with_gcs=True, + gcs_lineage_config=gcs_lineage_config, + ) + + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, + report, + schema_resolver=SchemaResolver(platform="bigquery"), + identifiers=BigQueryIdentifierBuilder(config, report), + ) + + upstream_lineage = extractor.get_lineage_for_external_table( + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", + source_uris=[ + "gs://bigquery_data/customer1/*.parquet", + "gs://bigquery_data/customer2/*.parquet", + "gs://bigquery_data/customer3/my_table/*.parquet", + ], + graph=pipeline_context.graph, + ) + + expected_schema_field_urns = [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer1,PROD),age)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer1,PROD),firstname)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer1,PROD),lastname)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer2,PROD),age)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer2,PROD),firstname)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer2,PROD),lastname)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer3/my_table,PROD),age)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer3/my_table,PROD),firstname)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer3/my_table,PROD),lastname)", + ] + assert upstream_lineage + assert len(upstream_lineage.upstreams) == 3 + assert ( 
+ upstream_lineage.fineGrainedLineages + and len(upstream_lineage.fineGrainedLineages) == 9 + ) + # Extracting column URNs from upstream_lineage.upstreams + actual_schema_field_urns = [ + fine_grained_lineage.upstreams[0] + if fine_grained_lineage.upstreams is not None + else [] + for fine_grained_lineage in upstream_lineage.fineGrainedLineages + ] + assert all( + urn in expected_schema_field_urns for urn in actual_schema_field_urns + ), "Some expected column URNs are missing from fine grained lineage." + + +def test_lineage_for_external_bq_table_no_column_lineage(mock_datahub_graph_instance): + + pipeline_context = PipelineContext(run_id="bq_gcs_lineage") + pipeline_context.graph = mock_datahub_graph_instance + + def fake_schema_metadata(entity_urn: str) -> Optional[models.SchemaMetadataClass]: + return None + + pipeline_context.graph.get_schema_metadata = fake_schema_metadata # type: ignore + path_specs: List[PathSpec] = [ + PathSpec(include="gs://bigquery_data/{table}/*.parquet"), + PathSpec(include="gs://bigquery_data/customer3/{table}/*.parquet"), + ] + gcs_lineage_config: GcsLineageProviderConfig = GcsLineageProviderConfig( + path_specs=path_specs + ) + + config = BigQueryV2Config( + include_table_lineage=True, + include_column_lineage_with_gcs=True, + gcs_lineage_config=gcs_lineage_config, + ) + + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, + report, + schema_resolver=SchemaResolver(platform="bigquery"), + identifiers=BigQueryIdentifierBuilder(config, report), + ) + + upstream_lineage = extractor.get_lineage_for_external_table( + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", + source_uris=[ + "gs://bigquery_data/customer1/*.parquet", + "gs://bigquery_data/customer2/*.parquet", + "gs://bigquery_data/customer3/my_table/*.parquet", + ], + graph=pipeline_context.graph, + ) + + expected_dataset_urns = [ + 
"urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer2,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:gcs,bigquery_data/customer3/my_table,PROD)", + ] + assert upstream_lineage + assert len(upstream_lineage.upstreams) == 3 + # Extracting dataset URNs from upstream_lineage.upstreams + actual_dataset_urns = [upstream.dataset for upstream in upstream_lineage.upstreams] + assert all( + urn in actual_dataset_urns for urn in expected_dataset_urns + ), "Some expected dataset URNs are missing from upstream lineage." + assert upstream_lineage.fineGrainedLineages is None + + +def test_lineage_for_external_table_with_non_gcs_uri(mock_datahub_graph_instance): + pipeline_context = PipelineContext(run_id="non_gcs_lineage") + pipeline_context.graph = mock_datahub_graph_instance + + config = BigQueryV2Config( + include_table_lineage=True, + include_column_lineage_with_gcs=False, # Column lineage disabled for simplicity + ) + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, + report, + schema_resolver=SchemaResolver(platform="bigquery"), + identifiers=BigQueryIdentifierBuilder(config, report), + ) + + upstream_lineage = extractor.get_lineage_for_external_table( + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", + source_uris=[ + "https://some_non_gcs_path/customer1/file.csv", + "https://another_path/file.txt", + ], + graph=pipeline_context.graph, + ) + + assert upstream_lineage is None + + +def test_lineage_for_external_table_path_not_matching_specs( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="path_not_matching_lineage") + pipeline_context.graph = mock_datahub_graph_instance + + path_specs: List[PathSpec] = [ + PathSpec(include="gs://different_data/db2/db3/{table}/*.parquet"), + ] + gcs_lineage_config: GcsLineageProviderConfig = GcsLineageProviderConfig( + 
path_specs=path_specs, ignore_non_path_spec_path=True + ) + config = BigQueryV2Config( + include_table_lineage=True, + include_column_lineage_with_gcs=False, + gcs_lineage_config=gcs_lineage_config, + ) + + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, + report, + schema_resolver=SchemaResolver(platform="bigquery"), + identifiers=BigQueryIdentifierBuilder(config, report), + ) + + upstream_lineage = extractor.get_lineage_for_external_table( + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", + source_uris=[ + "gs://bigquery_data/customer1/*.parquet", + "gs://bigquery_data/customer2/*.parquet", + ], + graph=pipeline_context.graph, + ) + + assert upstream_lineage is None From 8c53c5408194e1a384045c3464a89dca97e80367 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Tue, 29 Oct 2024 13:49:37 +0530 Subject: [PATCH 04/11] feat(ingestion/powerbi): ingest powerbi app (#11629) --- .../docs/sources/powerbi/powerbi_pre.md | 42 +-- .../ingestion/source/common/subtypes.py | 1 + .../ingestion/source/powerbi/config.py | 5 + .../ingestion/source/powerbi/powerbi.py | 98 ++++++- .../powerbi/rest_api_wrapper/data_classes.py | 42 ++- .../powerbi/rest_api_wrapper/data_resolver.py | 67 +++++ .../powerbi/rest_api_wrapper/powerbi_api.py | 90 +++++++ .../powerbi/golden_test_app_ingest.json | 242 ++++++++++++++++++ .../workspace_with_app_mock_response.json | 149 +++++++++++ .../tests/integration/powerbi/test_powerbi.py | 123 +++++++++ 10 files changed, 837 insertions(+), 22 deletions(-) create mode 100644 metadata-ingestion/tests/integration/powerbi/golden_test_app_ingest.json create mode 100644 metadata-ingestion/tests/integration/powerbi/mock_data/workspace_with_app_mock_response.json diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md index 
f2745d5e77f497..1561a36d04c0c2 100644 --- a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md +++ b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md @@ -10,16 +10,17 @@ ## Concept mapping -| PowerBI | Datahub | -|-----------------------|---------------------| -| `Dashboard` | `Dashboard` | -| `Dataset's Table` | `Dataset` | -| `Tile` | `Chart` | -| `Report.webUrl` | `Chart.externalUrl` | -| `Workspace` | `Container` | -| `Report` | `Dashboard` | -| `PaginatedReport` | `Dashboard` | -| `Page` | `Chart` | +| PowerBI | Datahub | +|-------------------|---------------------| +| `Dashboard` | `Dashboard` | +| `Dataset's Table` | `Dataset` | +| `Tile` | `Chart` | +| `Report.webUrl` | `Chart.externalUrl` | +| `Workspace` | `Container` | +| `Report` | `Dashboard` | +| `PaginatedReport` | `Dashboard` | +| `Page` | `Chart` | +| `App` | `Dashboard` | - If `Tile` is created from report then `Chart.externalUrl` is set to Report.webUrl. - The `Page` is unavailable for PowerBI PaginatedReport. @@ -102,7 +103,7 @@ combine_result `Pattern-2` is *not* supported for upstream table lineage extraction as it uses nested item-selector i.e. {Source{[Schema="public",Item="book"]}[Data], Source{[Schema="public",Item="issue_history"]}[Data]} as argument to M-QUery table function i.e. Table.Combine -`Pattern-1` is supported as it first assign the table from schema to variable and then variable is used in M-Query Table function i.e. Table.Combine +`Pattern-1` is supported as it first assigns the table from schema to variable and then variable is used in M-Query Table function i.e. Table.Combine ## Extract endorsements to tags @@ -112,18 +113,20 @@ Please note that the default implementation overwrites tags for the ingested ent ## Profiling -The profiling implementation is done through querying [DAX query endpoint](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/execute-queries). Therefore the principal needs to have permission to query the datasets to be profiled. 
Usually this means that the service principal should have `Contributor` role for the workspace to be ingested. Profiling is done with column based queries to be able to handle wide datasets without timeouts. +The profiling implementation is done through querying [DAX query endpoint](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/execute-queries). Therefore, the principal needs to have permission to query the datasets to be profiled. Usually this means that the service principal should have `Contributor` role for the workspace to be ingested. Profiling is done with column-based queries to be able to handle wide datasets without timeouts. -Take into account that the profiling implementation exeutes fairly big amount of DAX queries and for big datasets this is substantial load to the PowerBI system. +Take into account that the profiling implementation executes a fairly big number of DAX queries, and for big datasets this is a significant load to the PowerBI system. -The `profiling_pattern` setting may be used to limit profiling actions to only a certain set of resources in PowerBI. Both allow and deny rules are matched against following pattern for every table in a PowerBI Dataset: `workspace_name.dataset_name.table_name`. User may limit profiling with these settings at table level, dataset level or workspace level. +The `profiling_pattern` setting may be used to limit profiling actions to only a certain set of resources in PowerBI. Both allowed and deny rules are matched against the following pattern for every table in a PowerBI Dataset: `workspace_name.dataset_name.table_name`. Users may limit profiling with these settings at table level, dataset level or workspace level. ## Admin Ingestion vs. Basic Ingestion PowerBI provides two sets of API i.e. [Basic API and Admin API](https://learn.microsoft.com/en-us/rest/api/power-bi/). 
-The Basic API returns metadata of PowerBI resources where service principal has granted access explicitly on resources whereas Admin API returns metadata of all PowerBI resources irrespective of whether service principal has granted or doesn't granted access explicitly on resources. +The Basic API returns metadata of PowerBI resources where service principal has granted access explicitly on resources, +whereas Admin API returns metadata of all PowerBI resources irrespective of whether service principal has granted +or doesn't grant access explicitly on resources. -The Admin Ingestion (explain below) is the recommended way to execute PowerBI ingestion as this ingestion can extract most of the metadata. +The Admin Ingestion (explained below) is the recommended way to execute PowerBI ingestion as this ingestion can extract most of the metadata. ### Admin Ingestion: Service Principal As Admin in Tenant Setting and Added as Member In Workspace @@ -142,8 +145,9 @@ PowerBI Source would be able to ingest below listed metadata of that particular - Endorsement as tag - Dashboards - Reports - - Dashboard's Tiles - - Report's Pages + - Dashboard Tiles + - Report Pages + - App If you don't want to add a service principal as a member in your workspace, then you can enable the `admin_apis_only: true` in recipe to use PowerBI Admin API only. 
@@ -154,7 +158,7 @@ Caveats of setting `admin_apis_only` to `true`: ### Basic Ingestion: Service Principal As Member In Workspace -If you have added service principal as `member` in workspace then PowerBI Source would be able ingest below metadata of that particular workspace +If you have added service principal as `member` in workspace then PowerBI Source would be able to ingest below metadata of that particular workspace - Dashboards - Reports diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py index b6aa8c1f5f1f17..7271bf6102639f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py @@ -70,6 +70,7 @@ class BIAssetSubTypes(StrEnum): # PowerBI POWERBI_TILE = "PowerBI Tile" POWERBI_PAGE = "PowerBI Page" + POWERBI_APP = "App" # Mode MODE_REPORT = "Report" diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 0716a658b61c6f..8a3f8ed6131a21 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -473,6 +473,11 @@ class PowerBiDashboardSourceConfig( "To maintain backward compatibility, this is set to False.", ) + extract_app: bool = pydantic.Field( + default=False, + description="Whether to ingest workspace app. 
Requires DataHub server 0.14.2+.", + ) + @root_validator(skip_on_failure=True) def validate_extract_column_level_lineage(cls, values: Dict) -> Dict: flags = [ diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index f5c0aedb329cdc..72336afbaacd05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -4,6 +4,7 @@ # ######################################################### import logging +from datetime import datetime from typing import Iterable, List, Optional, Tuple, Union import datahub.emitter.mce_builder as builder @@ -59,6 +60,7 @@ FineGrainedLineageUpstreamType, ) from datahub.metadata.schema_classes import ( + AuditStampClass, BrowsePathsClass, ChangeTypeClass, ChartInfoClass, @@ -70,6 +72,7 @@ DatasetLineageTypeClass, DatasetProfileClass, DatasetPropertiesClass, + EdgeClass, GlobalTagsClass, OtherSchemaClass, OwnerClass, @@ -1006,7 +1009,9 @@ def to_chart_mcps( ) # Browse path - browse_path = BrowsePathsClass(paths=[f"/powerbi/{workspace.name}"]) + browse_path = BrowsePathsClass( + paths=[f"/{Constant.PLATFORM_NAME}/{workspace.name}"] + ) browse_path_mcp = self.new_mcp( entity_urn=chart_urn, aspect=browse_path, @@ -1306,6 +1311,95 @@ def extract_independent_datasets( ) ) + def emit_app( + self, workspace: powerbi_data_classes.Workspace + ) -> Iterable[MetadataChangeProposalWrapper]: + if workspace.app is None: + return + + if not self.source_config.extract_app: + self.reporter.info( + title="App Ingestion Is Disabled", + message="You are missing workspace app metadata. 
Please set flag `extract_app` to `true` in recipe to ingest workspace app.", + context=f"workspace-name={workspace.name}, app-name = {workspace.app.name}", + ) + return + + assets_within_app: List[EdgeClass] = [ + EdgeClass( + destinationUrn=builder.make_dashboard_urn( + platform=self.source_config.platform_name, + platform_instance=self.source_config.platform_instance, + name=powerbi_data_classes.Dashboard.get_urn_part_by_id( + app_dashboard.original_dashboard_id + ), + ) + ) + for app_dashboard in workspace.app.dashboards + ] + + assets_within_app.extend( + [ + EdgeClass( + destinationUrn=builder.make_dashboard_urn( + platform=self.source_config.platform_name, + platform_instance=self.source_config.platform_instance, + name=powerbi_data_classes.Report.get_urn_part_by_id( + app_report.original_report_id + ), + ) + ) + for app_report in workspace.app.reports + ] + ) + + if assets_within_app: + logger.debug( + f"Emitting metadata-workunits for app {workspace.app.name}({workspace.app.id})" + ) + + app_urn: str = builder.make_dashboard_urn( + platform=self.source_config.platform_name, + platform_instance=self.source_config.platform_instance, + name=powerbi_data_classes.App.get_urn_part_by_id(workspace.app.id), + ) + + dashboard_info: DashboardInfoClass = DashboardInfoClass( + title=workspace.app.name, + description=workspace.app.description + if workspace.app.description + else workspace.app.name, + # lastModified=workspace.app.last_update, + lastModified=ChangeAuditStamps( + lastModified=AuditStampClass( + actor="urn:li:corpuser:unknown", + time=int( + datetime.strptime( + workspace.app.last_update, "%Y-%m-%dT%H:%M:%S.%fZ" + ).timestamp() + ), + ) + if workspace.app.last_update + else None + ), + dashboards=assets_within_app, + ) + + # Browse path + browse_path: BrowsePathsClass = BrowsePathsClass( + paths=[f"/powerbi/{workspace.name}"] + ) + + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=app_urn, + aspects=( + dashboard_info, + browse_path, + 
StatusClass(removed=False), + SubTypesClass(typeNames=[BIAssetSubTypes.POWERBI_APP]), + ), + ) + def get_workspace_workunit( self, workspace: powerbi_data_classes.Workspace ) -> Iterable[MetadataWorkUnit]: @@ -1318,6 +1412,8 @@ def get_workspace_workunit( # Return workunit to a Datahub Ingestion framework yield workunit + yield from auto_workunit(self.emit_app(workspace=workspace)) + for dashboard in workspace.dashboards: try: # Fetch PowerBi users for dashboards diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py index d54b4a42b742e0..9407ef7a51b58e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py @@ -37,6 +37,35 @@ class DatasetKey(ContainerKey): dataset: str +@dataclass +class AppDashboard: + id: str + original_dashboard_id: str + + +@dataclass +class AppReport: + id: str + original_report_id: str + + +@dataclass +class App: + id: str + name: str + description: Optional[str] + last_update: Optional[str] + dashboards: List["AppDashboard"] + reports: List["AppReport"] + + def get_urn_part(self): + return App.get_urn_part_by_id(self.id) + + @staticmethod + def get_urn_part_by_id(id_: str) -> str: + return f"apps.{id_}" + + @dataclass class Workspace: id: str @@ -49,6 +78,7 @@ class Workspace: dashboard_endorsements: Dict[str, List[str]] scan_result: dict independent_datasets: List["PowerBIDataset"] + app: Optional["App"] def get_urn_part(self, workspace_id_as_urn_part: Optional[bool] = False) -> str: # shouldn't use workspace name, as they can be the same? 
@@ -235,7 +265,11 @@ class Report: tags: List[str] def get_urn_part(self): - return f"reports.{self.id}" + return Report.get_urn_part_by_id(self.id) + + @staticmethod + def get_urn_part_by_id(id_: str) -> str: + return f"reports.{id_}" @dataclass @@ -273,7 +307,11 @@ class Dashboard: webUrl: Optional[str] def get_urn_part(self): - return f"dashboards.{self.id}" + return Dashboard.get_urn_part_by_id(self.id) + + @staticmethod + def get_urn_part_by_id(id_: str) -> str: + return f"dashboards.{id_}" def __members(self): return (self.id,) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index 8849e19ea86228..f8fff2391d10b0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -13,6 +13,7 @@ from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.ingestion.source.powerbi.config import Constant from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import ( + App, Column, Dashboard, Measure, @@ -143,6 +144,13 @@ def get_dataset_parameters( def get_users(self, workspace_id: str, entity: str, entity_id: str) -> List[User]: pass + @abstractmethod + def _get_app( + self, + app_id: str, + ) -> Optional[Dict]: + pass + def _get_authority_url(self): return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}" @@ -411,6 +419,37 @@ def itr_pages( page_number += 1 + def get_app( + self, + app_id: str, + ) -> Optional[App]: + + raw_app: Optional[Dict] = self._get_app( + app_id=app_id, + ) + + if raw_app is None: + return None + + assert ( + Constant.ID in raw_app + ), f"{Constant.ID} is required field not present in server response" + + assert ( + Constant.NAME in raw_app + ), f"{Constant.NAME} is required field not present in server response" + + 
return App( + id=raw_app[Constant.ID], + name=raw_app[Constant.NAME], + description=raw_app.get(Constant.DESCRIPTION), + last_update=raw_app.get(Constant.LAST_UPDATE), + dashboards=[], # dashboards and reports of App are available in scan-result response + reports=[], # There is an App section in documentation https://learn.microsoft.com/en-us/rest/api/power-bi/dashboards/get-dashboards-in-group#code-try-0 + # However the report API mentioned in that section is not returning the reports + # We will collect these details from the scan-result. + ) + class RegularAPIResolver(DataResolverBase): # Regular access endpoints @@ -680,6 +719,15 @@ def profile_dataset( table.column_count = column_count + def _get_app( + self, + app_id: str, + ) -> Optional[Dict]: + # [Date: 2024/10/18] As per API doc, the service principal approach is not supported for regular API + # https://learn.microsoft.com/en-us/rest/api/power-bi/apps/get-app + + return None + class AdminAPIResolver(DataResolverBase): # Admin access endpoints @@ -993,3 +1041,22 @@ def profile_dataset( ) -> None: logger.debug("Profile dataset is unsupported in Admin API") return None + + def _get_app( + self, + app_id: str, + ) -> Optional[Dict]: + + app_endpoint = self.API_ENDPOINTS[Constant.GET_WORKSPACE_APP].format( + POWERBI_ADMIN_BASE_URL=DataResolverBase.ADMIN_BASE_URL, + APP_ID=app_id, + ) + # Hit PowerBi + logger.debug(f"Request to app URL={app_endpoint}") + + for page in self.itr_pages(endpoint=app_endpoint): + for app in page: + if Constant.ID in app and app_id == app[Constant.ID]: + return app + + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py index 37793bc32980b4..b67f257d9eb5bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py @@ -13,6 +13,9 @@ from datahub.ingestion.source.powerbi.rest_api_wrapper import data_resolver from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import ( FIELD_TYPE_MAPPING, + App, + AppDashboard, + AppReport, Column, Dashboard, Measure, @@ -264,6 +267,7 @@ def get_workspaces(self) -> List[Workspace]: dashboard_endorsements={}, scan_result={}, independent_datasets=[], + app=None, # It will be populated in _fill_metadata_from_scan_result method ) for workspace in groups ] @@ -423,6 +427,87 @@ def _get_workspace_datasets(self, workspace: Workspace) -> dict: dataset_instance.tables.append(table) return dataset_map + def get_app( + self, + app_id: str, + ) -> Optional[App]: + return self.__admin_api_resolver.get_app( + app_id=app_id, + ) + + def _populate_app_details( + self, workspace: Workspace, workspace_metadata: Dict + ) -> None: + # App_id is not present at the root level of workspace_metadata. + # It can be found in the workspace_metadata.dashboards or workspace_metadata.reports lists. + + # Workspace_metadata contains duplicate entries for all dashboards and reports that we have included + # in the app. + # The duplicate entries for a report contain key `originalReportObjectId` referencing to + # an actual report id of workspace. 
The duplicate entries for a dashboard contain `displayName` where + # displayName is generated from displayName of original dashboard with prefix "App" + app_id: Optional[str] = None + app_reports: List[AppReport] = [] + # Filter app reports + for report in workspace_metadata.get(Constant.REPORTS) or []: + if report.get(Constant.APP_ID): + app_reports.append( + AppReport( + id=report[Constant.ID], + original_report_id=report[Constant.ORIGINAL_REPORT_OBJECT_ID], + ) + ) + if app_id is None: # In PowerBI one workspace can have one app + app_id = report.get(Constant.APP_ID) + + raw_app_dashboards: List[Dict] = [] + # Filter app dashboards + for dashboard in workspace_metadata.get(Constant.DASHBOARDS) or []: + if dashboard.get(Constant.APP_ID): + raw_app_dashboards.append(dashboard) + if app_id is None: # In PowerBI, one workspace contains one app + app_id = dashboard[Constant.APP_ID] + + # workspace doesn't have an App. Above two loops can be avoided + # if app_id is available at root level in workspace_metadata + if app_id is None: + logger.debug(f"Workspace {workspace.name} does not contain an app.") + return + + app: Optional[App] = self.get_app(app_id=app_id) + if app is None: + self.__reporter.info( + title="App Not Found", + message="The workspace includes an app, but its metadata is missing from the API response.", + context=f"workspace_name={workspace.name}", + ) + return + + # Map to find out which dashboards belongs to the App + workspace_dashboard_map: Dict[str, Dict] = { + raw_dashboard[Constant.DISPLAY_NAME]: raw_dashboard + for raw_dashboard in raw_app_dashboards + } + + app_dashboards: List[AppDashboard] = [] + for dashboard in workspace_metadata.get(Constant.DASHBOARDS) or []: + app_dashboard_display_name = f"[App] {dashboard[Constant.DISPLAY_NAME]}" # A Dashboard is considered part of an App if the workspace_metadata contains a Dashboard with a label formatted as "[App] ".
+ if ( + app_dashboard_display_name in workspace_dashboard_map + ): # This dashboard is part of the App + app_dashboards.append( + AppDashboard( + id=workspace_dashboard_map[app_dashboard_display_name][ + Constant.ID + ], + original_dashboard_id=dashboard[Constant.ID], + ) + ) + + app.reports = app_reports + app.dashboards = app_dashboards + workspace.app = app + def _fill_metadata_from_scan_result( self, workspaces: List[Workspace], @@ -463,6 +548,7 @@ def _fill_metadata_from_scan_result( dashboard_endorsements={}, scan_result={}, independent_datasets=[], + app=None, # It is getting set from scan-result ) cur_workspace.scan_result = workspace_metadata cur_workspace.datasets = self._get_workspace_datasets(cur_workspace) @@ -482,6 +568,10 @@ def _fill_metadata_from_scan_result( "false " ) + self._populate_app_details( + workspace=cur_workspace, + workspace_metadata=workspace_metadata, + ) workspaces.append(cur_workspace) return workspaces diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_app_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_app_ingest.json new file mode 100644 index 00000000000000..5988b14977552f --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_app_ingest.json @@ -0,0 +1,242 @@ +[ +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,apps.2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": {}, + "title": "Finance", + "description": "The finance app", + "charts": [], + "datasets": [], + "dashboards": [ + { + "destinationUrn": "urn:li:dashboard:(powerbi,dashboards.744B07E3-FAA7-4BD7-BD17-3220BF0F6301)" + }, + { + "destinationUrn": "urn:li:dashboard:(powerbi,reports.455AB99B-E110-46E6-90D3-F015CABD1156)" + } + ], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1547372813, + "actor": 
"urn:li:corpuser:unknown" + } + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,apps.2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/Workspace For App Testing" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,apps.2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,apps.2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "App" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,apps.2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Workspace For App Testing" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.A700E2C1-D008-42DF-AFCA-A70A87D0B2A3)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/Workspace For App Testing" + ] + } + }, + "systemMetadata": { + "lastObserved": 
1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.A700E2C1-D008-42DF-AFCA-A70A87D0B2A3)", + "changeType": "PATCH", + "aspectName": "dashboardInfo", + "aspect": { + "json": [ + { + "op": "add", + "path": "/customProperties/chartCount", + "value": "0" + }, + { + "op": "add", + "path": "/customProperties/workspaceName", + "value": "Workspace For App Testing" + }, + { + "op": "add", + "path": "/customProperties/workspaceId", + "value": "8F756DE6-26AD-45FF-A201-44276FF1F561" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.A700E2C1-D008-42DF-AFCA-A70A87D0B2A3)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.A700E2C1-D008-42DF-AFCA-A70A87D0B2A3)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "dashboardId": "powerbi.linkedin.com/dashboards/A700E2C1-D008-42DF-AFCA-A70A87D0B2A3" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.A700E2C1-D008-42DF-AFCA-A70A87D0B2A3)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Workspace For App Testing" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/mock_data/workspace_with_app_mock_response.json b/metadata-ingestion/tests/integration/powerbi/mock_data/workspace_with_app_mock_response.json new file mode 100644 index 00000000000000..712982741ccaa0 --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/mock_data/workspace_with_app_mock_response.json @@ -0,0 +1,149 @@ +{ + "https://api.powerbi.com/v1.0/myorg/groups?%24skip=0&%24top=1000": { + "method": "GET", + "status_code": 200, + "json": { + "value": [ + { + "id": "8F756DE6-26AD-45FF-A201-44276FF1F561", + "isReadOnly": true, + "name": "Workspace For App Testing", + "type": "Workspace", + "state": "Active" + } + ] + } + }, + "https://api.powerbi.com/v1.0/myorg/groups?%24skip=1000&%24top=1000": { + "method": "GET", + "status_code": 200, + "json": { + "value": [] + } + }, + "https://api.powerbi.com/v1.0/myorg/admin/workspaces/scanResult/6147FCEB-7531-4449-8FB6-1F7A5431BF2D": { + "method": "GET", + "status_code": 200, + "json": { + "workspaces": [ + { + "id": "8F756DE6-26AD-45FF-A201-44276FF1F561", + "name": "Workspace For App Testing", + "type": "Workspace", + "state": "Active", + "reports": [ + { + "reportType": "PowerBIReport", + "id": "455AB99B-E110-46E6-90D3-F015CABD1156", + "name": "GitHub Progress", + "datasetId": "2F99BE64-673D-4DA8-BF4F-02629A1F2C8F", + "createdDateTime": "2024-10-01T06:27:00.51", + "modifiedDateTime": "2024-10-01T06:27:00.51", + "modifiedBy": "abc@fake.com", + "createdBy": "abc@fake.com", + 
"modifiedById": "97ABB057-CB8B-480D-AA1C-B3E7F0A16EC5", + "createdById": "97ABB057-CB8B-480D-AA1C-B3E7F0A16EC5", + "datasetWorkspaceId": "8F756DE6-26AD-45FF-A201-44276FF1F561", + "users": [ + { + "reportUserAccessRight": "Owner", + "emailAddress": "abc@fake.com", + "displayName": "John Smith", + "identifier": "abc@fake.com", + "graphId": "97ABB057-CB8B-480D-AA1C-B3E7F0A16EC5", + "principalType": "User", + "userType": "Member" + } + ] + }, + { + "reportType": "PowerBIReport", + "id": "ecc35189-e67a-4d8a-9037-403317bd5808", + "name": "[App] GitHub Progress", + "datasetId": "2F99BE64-673D-4DA8-BF4F-02629A1F2C8F", + "appId": "2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6", + "createdDateTime": "2024-10-01T06:29:34.75", + "modifiedDateTime": "2024-10-01T07:22:55.397", + "originalReportObjectId": "455AB99B-E110-46E6-90D3-F015CABD1156", + "modifiedBy": "abc@fake.com", + "createdBy": "abc@fake.com", + "modifiedById": "97ABB057-CB8B-480D-AA1C-B3E7F0A16EC5", + "createdById": "97ABB057-CB8B-480D-AA1C-B3E7F0A16EC5", + "datasetWorkspaceId": "8F756DE6-26AD-45FF-A201-44276FF1F561" + } + ], + "dashboards": [ + { + "id": "744B07E3-FAA7-4BD7-BD17-3220BF0F6301", + "displayName": "Pet Overview", + "isReadOnly": false, + "tiles": [ + ] + }, + { + "id": "DB85B962-74BA-4821-900B-59AC5F70AADE", + "appId": "2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6", + "displayName": "[App] Pet Overview", + "isReadOnly": false, + "tiles": [ + ] + } + ], + "datasets": [] + } + ] + } + }, + "https://api.powerbi.com/v1.0/myorg/groups/8F756DE6-26AD-45FF-A201-44276FF1F561/dashboards": { + "method": "GET", + "status_code": 200, + "json": { + "value": [ + { + "id": "A700E2C1-D008-42DF-AFCA-A70A87D0B2A3", + "isReadOnly": true, + "displayName": "test_dashboard", + "description": "Description of test dashboard", + "embedUrl": "https://localhost/dashboards/embed/1", + "webUrl": "https://localhost/dashboards/web/1" + } + ] + } + }, + 
"https://api.powerbi.com/v1.0/myorg/admin/workspaces/scanStatus/6147FCEB-7531-4449-8FB6-1F7A5431BF2D": { + "method": "GET", + "status_code": 200, + "json": { + "status": "SUCCEEDED" + } + }, + "https://api.powerbi.com/v1.0/myorg/groups/8F756DE6-26AD-45FF-A201-44276FF1F561/dashboards/A700E2C1-D008-42DF-AFCA-A70A87D0B2A3/tiles": { + "method": "GET", + "status_code": 200, + "json": { + "value": [] + } + }, + "https://api.powerbi.com/v1.0/myorg/admin/apps?%24skip=0&%24top=1000": { + "method": "GET", + "status_code": 200, + "json": { + "value": [ + { + "id": "2A4D0E82-E7A4-45B1-BD72-2A2CF82C9CB6", + "description": "The finance app", + "name": "Finance", + "publishedBy": "Bill", + "lastUpdate": "2024-09-26T04:20:34.513Z" + } + ] + } + }, + "https://api.powerbi.com/v1.0/myorg/admin/apps?%24skip=1000&%24top=1000": { + "method": "GET", + "status_code": 200, + "json": { + "value": [] + } + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 78cf1031074779..0f360d44c38cbe 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -80,6 +80,9 @@ def scan_init_response(request, context): "C5DA6EA8-625E-4AB1-90B6-CAEA0BF9F492": { "id": "81B02907-E2A3-45C3-B505-3781839C8CAA", }, + "8F756DE6-26AD-45FF-A201-44276FF1F561": { + "id": "6147FCEB-7531-4449-8FB6-1F7A5431BF2D", + }, } return w_id_vs_response[workspace_id] @@ -1013,6 +1016,7 @@ def validate_pipeline(pipeline: Pipeline) -> None: dashboard_endorsements={}, scan_result={}, independent_datasets=[], + app=None, ) # Fetch actual reports reports: List[Report] = cast( @@ -1490,3 +1494,122 @@ def test_powerbi_cross_workspace_reference_info_message( output_path=f"{tmp_path}/powerbi_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) + + +def common_app_ingest( + pytestconfig: pytest.Config, + requests_mock: Any, + 
output_mcp_path: str, + override_config: dict = {}, +) -> Pipeline: + enable_logging() + + register_mock_api( + pytestconfig=pytestconfig, + request_mock=requests_mock, + override_data=read_mock_data( + path=pytestconfig.rootpath + / "tests/integration/powerbi/mock_data/workspace_with_app_mock_response.json" + ), + ) + + config = default_source_config() + + del config["workspace_id"] + + config["workspace_id_pattern"] = { + "allow": [ + "8F756DE6-26AD-45FF-A201-44276FF1F561", + ] + } + + config.update(override_config) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **config, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": output_mcp_path, + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + + return pipeline + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +@pytest.mark.integration +def test_powerbi_app_ingest( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: + + common_app_ingest( + pytestconfig=pytestconfig, + requests_mock=requests_mock, + output_mcp_path=f"{tmp_path}/powerbi_mces.json", + override_config={ + "extract_app": True, + }, + ) + + golden_file = "golden_test_app_ingest.json" + + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{tmp_path}/powerbi_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +@pytest.mark.integration +def test_powerbi_app_ingest_info_message( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: + + pipeline = common_app_ingest( + pytestconfig=pytestconfig, + 
requests_mock=requests_mock, + output_mcp_path=f"{tmp_path}/powerbi_mces.json", + ) + + assert isinstance(pipeline.source, PowerBiDashboardSource) # to silent the lint + + info_entries: dict = pipeline.source.reporter._structured_logs._entries.get( + StructuredLogLevel.INFO, {} + ) # type :ignore + + is_entry_present: bool = False + # Printing INFO entries + for key, entry in info_entries.items(): + if entry.title == "App Ingestion Is Disabled": + is_entry_present = True + break + + assert ( + is_entry_present + ), "The extract_app flag should be set to false by default. We need to keep this flag as false until all GMS instances are updated to the latest release." From 20d099960147345bf3ba23522814962872f6416c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Tue, 29 Oct 2024 13:50:55 +0530 Subject: [PATCH 05/11] feat(ingest/databricks): report unique query count from usage (#11576) --- .../src/datahub/ingestion/source/unity/report.py | 3 +++ .../src/datahub/ingestion/source/unity/usage.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index a00a52ae542076..f4579376a3b3a4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -12,8 +12,10 @@ class UnityCatalogUsagePerfReport(Report): get_queries_timer: PerfTimer = field(default_factory=PerfTimer) sql_parsing_timer: PerfTimer = field(default_factory=PerfTimer) + spark_sql_parsing_timer: PerfTimer = field(default_factory=PerfTimer) aggregator_add_event_timer: PerfTimer = field(default_factory=PerfTimer) gen_operation_timer: PerfTimer = field(default_factory=PerfTimer) + query_fingerprinting_timer: PerfTimer = field(default_factory=PerfTimer) @dataclass @@ -32,6 +34,7 @@ class UnityCatalogReport(IngestionStageReport, 
ProfilingSqlReport): num_external_upstreams_unsupported: int = 0 num_queries: int = 0 + num_unique_queries: int = 0 num_queries_dropped_parse_failure: int = 0 num_queries_missing_table: int = 0 # Can be due to pattern filter num_queries_duplicate_table: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index 08482c9d2fa3b9..8c42ac81b98cf5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -22,6 +22,7 @@ from datahub.ingestion.source.unity.report import UnityCatalogReport from datahub.ingestion.source.usage.usage_common import UsageAggregator from datahub.metadata.schema_classes import OperationClass +from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint logger = logging.getLogger(__name__) @@ -76,6 +77,7 @@ def _get_workunits_internal( self, table_refs: Set[TableReference] ) -> Iterable[MetadataWorkUnit]: table_map = defaultdict(list) + query_hashes = set() for ref in table_refs: table_map[ref.table].append(ref) table_map[f"{ref.schema}.{ref.table}"].append(ref) @@ -85,6 +87,13 @@ def _get_workunits_internal( for query in self._get_queries(): self.report.num_queries += 1 with current_timer.pause(): + with self.report.usage_perf_report.query_fingerprinting_timer: + query_hashes.add( + get_query_fingerprint( + query.query_text, "databricks", fast=True + ) + ) + self.report.num_unique_queries = len(query_hashes) table_info = self._parse_query(query, table_map) if table_info is not None: if self.config.include_operational_stats: @@ -166,7 +175,8 @@ def _parse_query( with self.report.usage_perf_report.sql_parsing_timer: table_info = self._parse_query_via_lineage_runner(query.query_text) if table_info is None and query.statement_type == QueryStatementType.SELECT: - table_info = self._parse_query_via_spark_sql_plan(query.query_text) + with 
self.report.usage_perf_report.spark_sql_parsing_timer: + table_info = self._parse_query_via_spark_sql_plan(query.query_text) if table_info is None: self.report.num_queries_dropped_parse_failure += 1 From bea253a064aec1dcab6309820c8acabc7ed70900 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 29 Oct 2024 01:21:20 -0700 Subject: [PATCH 06/11] feat(ingest): unpin traitlets (#11731) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 12614f9ff36b24..63d754e71402e5 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -142,7 +142,7 @@ # https://github.com/great-expectations/great_expectations/pull/5382/files # datahub does not depend on traitlets directly but great expectations does. # https://github.com/ipython/traitlets/issues/741 - "traitlets<5.2.2", + "traitlets!=5.2.2", "greenlet", *cachetools_lib, } From 02f0a3dee716f8d11712d5eca3b23796f256cde1 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Tue, 29 Oct 2024 15:28:41 +0530 Subject: [PATCH 07/11] feat(ingest/transform): extend ownership transformer to other entities (#11700) --- .../transformer/add_dataset_ownership.py | 6 +- .../transformer/dataset_transformer.py | 21 +++++-- .../transformer/pattern_cleanup_ownership.py | 6 +- .../transformer/remove_dataset_ownership.py | 6 +- .../tests/unit/test_transform_dataset.py | 57 ++++++++++++++----- 5 files changed, 64 insertions(+), 32 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py index 54be2e5fac1e30..b107a62c905b4a 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py @@ -13,9 +13,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from 
datahub.ingestion.api.common import PipelineContext from datahub.ingestion.graph.client import DataHubGraph -from datahub.ingestion.transformer.dataset_transformer import ( - DatasetOwnershipTransformer, -) +from datahub.ingestion.transformer.dataset_transformer import OwnershipTransformer from datahub.metadata.schema_classes import ( BrowsePathsV2Class, MetadataChangeProposalClass, @@ -37,7 +35,7 @@ class AddDatasetOwnershipConfig(TransformerSemanticsConfigModel): is_container: bool = False -class AddDatasetOwnership(DatasetOwnershipTransformer): +class AddDatasetOwnership(OwnershipTransformer): """Transformer that adds owners to datasets according to a callback function.""" ctx: PipelineContext diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py index 42dd54f4a584a0..00b3a9ba59f924 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py @@ -27,6 +27,22 @@ def entity_types(self) -> List[str]: return ["dataset"] +class OwnershipTransformer( + DatasetTransformer, SingleAspectTransformer, metaclass=ABCMeta +): + def aspect_name(self) -> str: + return "ownership" + + def entity_types(self) -> List[str]: + return [ + "dataset", + "dataJob", + "dataFlow", + "chart", + "dashboard", + ] + + class TagTransformer(BaseTransformer, SingleAspectTransformer, metaclass=ABCMeta): """Transformer that does transform sequentially on each tag.""" @@ -47,11 +63,6 @@ def entity_types(self) -> List[str]: return ["container"] -class DatasetOwnershipTransformer(DatasetTransformer, metaclass=ABCMeta): - def aspect_name(self) -> str: - return "ownership" - - class DatasetDomainTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "domains" diff --git 
a/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py index 8ef61ab9679e63..f17546d6f72990 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py @@ -4,9 +4,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.dataset_transformer import ( - DatasetOwnershipTransformer, -) +from datahub.ingestion.transformer.dataset_transformer import OwnershipTransformer from datahub.metadata.schema_classes import ( OwnerClass, OwnershipClass, @@ -20,7 +18,7 @@ class PatternCleanUpOwnershipConfig(ConfigModel): pattern_for_cleanup: List[str] -class PatternCleanUpOwnership(DatasetOwnershipTransformer): +class PatternCleanUpOwnership(OwnershipTransformer): """Transformer that clean the ownership URN.""" ctx: PipelineContext diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py index f5d71a4340554f..934e2a13d56314 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py @@ -3,9 +3,7 @@ from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import Aspect from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.dataset_transformer import ( - DatasetOwnershipTransformer, -) +from datahub.ingestion.transformer.dataset_transformer import OwnershipTransformer from datahub.metadata.schema_classes import OwnershipClass @@ -13,7 +11,7 @@ class ClearDatasetOwnershipConfig(ConfigModel): pass -class 
SimpleRemoveDatasetOwnership(DatasetOwnershipTransformer): +class SimpleRemoveDatasetOwnership(OwnershipTransformer): """Transformer that clears all owners on each dataset.""" def __init__(self, config: ClearDatasetOwnershipConfig, ctx: PipelineContext): diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 4e9a38cb37ae63..389f7b70b3311e 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -220,7 +220,7 @@ def make_dataset_with_properties() -> models.MetadataChangeEventClass: ) -def test_simple_dataset_ownership_transformation(mock_time): +def test_dataset_ownership_transformation(mock_time): no_owner_aspect = make_generic_dataset() with_owner_aspect = make_dataset_with_owner() @@ -254,7 +254,7 @@ def test_simple_dataset_ownership_transformation(mock_time): transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) ) - assert len(outputs) == len(inputs) + 1 + assert len(outputs) == len(inputs) + 2 # Check the first entry. first_ownership_aspect = builder.get_aspect_if_available( @@ -287,11 +287,21 @@ def test_simple_dataset_ownership_transformation(mock_time): ] ) + third_ownership_aspect = outputs[4].record.aspect + assert third_ownership_aspect + assert len(third_ownership_aspect.owners) == 2 + assert all( + [ + owner.type == models.OwnershipTypeClass.DATAOWNER and owner.typeUrn is None + for owner in third_ownership_aspect.owners + ] + ) + # Verify that the third entry is unchanged.
assert inputs[2] == outputs[2].record # Verify that the last entry is EndOfStream - assert inputs[3] == outputs[4].record + assert inputs[-1] == outputs[-1].record def test_simple_dataset_ownership_with_type_transformation(mock_time): @@ -1003,6 +1013,7 @@ def test_pattern_dataset_ownership_transformation(mock_time): "rules": { ".*example1.*": [builder.make_user_urn("person1")], ".*example2.*": [builder.make_user_urn("person2")], + ".*dag_abc.*": [builder.make_user_urn("person2")], } }, "ownership_type": "DATAOWNER", @@ -1014,7 +1025,9 @@ def test_pattern_dataset_ownership_transformation(mock_time): transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) ) - assert len(outputs) == len(inputs) + 1 # additional MCP due to the no-owner MCE + assert ( + len(outputs) == len(inputs) + 2 + ) # additional MCP due to the no-owner MCE + datajob # Check the first entry. assert inputs[0] == outputs[0].record @@ -1042,6 +1055,16 @@ def test_pattern_dataset_ownership_transformation(mock_time): ] ) + third_ownership_aspect = outputs[4].record.aspect + assert third_ownership_aspect + assert len(third_ownership_aspect.owners) == 1 + assert all( + [ + owner.type == models.OwnershipTypeClass.DATAOWNER + for owner in third_ownership_aspect.owners + ] + ) + # Verify that the third entry is unchanged. 
assert inputs[2] == outputs[2].record @@ -1122,14 +1145,14 @@ def fake_get_aspect( pipeline_context.graph.get_aspect = fake_get_aspect # type: ignore # No owner aspect for the first dataset - no_owner_aspect = models.MetadataChangeEventClass( + no_owner_aspect_dataset = models.MetadataChangeEventClass( proposedSnapshot=models.DatasetSnapshotClass( urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)", aspects=[models.StatusClass(removed=False)], ), ) # Dataset with an existing owner - with_owner_aspect = models.MetadataChangeEventClass( + with_owner_aspect_dataset = models.MetadataChangeEventClass( proposedSnapshot=models.DatasetSnapshotClass( urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)", aspects=[ @@ -1148,8 +1171,7 @@ def fake_get_aspect( ), ) - # Not a dataset, should be ignored - not_a_dataset = models.MetadataChangeEventClass( + datajob = models.MetadataChangeEventClass( proposedSnapshot=models.DataJobSnapshotClass( urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)", aspects=[ @@ -1163,9 +1185,9 @@ def fake_get_aspect( ) inputs = [ - no_owner_aspect, - with_owner_aspect, - not_a_dataset, + no_owner_aspect_dataset, + with_owner_aspect_dataset, + datajob, EndOfStream(), ] @@ -1176,6 +1198,7 @@ def fake_get_aspect( "rules": { ".*example1.*": [builder.make_user_urn("person1")], ".*example2.*": [builder.make_user_urn("person2")], + ".*dag_abc.*": [builder.make_user_urn("person3")], } }, "ownership_type": "DATAOWNER", @@ -1188,9 +1211,9 @@ def fake_get_aspect( transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) ) - assert len(outputs) == len(inputs) + 3 + assert len(outputs) == len(inputs) + 4 - # Check the first entry. 
+ # Check that DatasetSnapshotClass has not changed assert inputs[0] == outputs[0].record # Check the ownership for the first dataset (example1) @@ -1217,12 +1240,16 @@ def fake_get_aspect( ] ) + third_ownership_aspect = outputs[4].record.aspect + assert third_ownership_aspect + assert len(third_ownership_aspect.owners) == 1 # new for datajob + # Check container ownerships for i in range(2): - container_ownership_aspect = outputs[i + 4].record.aspect + container_ownership_aspect = outputs[i + 5].record.aspect assert container_ownership_aspect ownership = json.loads(container_ownership_aspect.value.decode("utf-8")) - assert len(ownership) == 2 + assert len(ownership) == 3 assert ownership[0]["value"]["owner"] == builder.make_user_urn("person1") assert ownership[1]["value"]["owner"] == builder.make_user_urn("person2") From b26da579284ab9b340c821844ba01f8cdbcdaf45 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 29 Oct 2024 09:48:55 -0700 Subject: [PATCH 08/11] feat(ingest): remove dep on `termcolor` (#11733) --- metadata-ingestion/setup.py | 2 - .../datahub/cli/specific/dataproduct_cli.py | 2 +- .../src/datahub/cli/timeline_cli.py | 15 +++-- .../src/datahub/ingestion/run/pipeline.py | 8 +-- .../src/datahub/upgrade/upgrade.py | 63 ++++++++++--------- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 63d754e71402e5..0da0329dc8c8a9 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -42,7 +42,6 @@ "python-dateutil>=2.8.0", "tabulate", "progressbar2", - "termcolor>=1.0.0", "psutil>=5.8.0", "Deprecated", "humanfriendly", @@ -546,7 +545,6 @@ "types-pyOpenSSL", "types-click-spinner>=0.1.13.1", "types-ujson>=5.2.0", - "types-termcolor>=1.0.0", "types-Deprecated", "types-protobuf>=4.21.0.1", "sqlalchemy2-stubs", diff --git a/metadata-ingestion/src/datahub/cli/specific/dataproduct_cli.py b/metadata-ingestion/src/datahub/cli/specific/dataproduct_cli.py index 
afac38e29722e9..8ec4d3ad249376 100644 --- a/metadata-ingestion/src/datahub/cli/specific/dataproduct_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/dataproduct_cli.py @@ -363,7 +363,7 @@ def remove_owner(urn: str, owner_urn: str) -> None: with get_default_graph() as graph: _abort_if_non_existent_urn(graph, urn, "remove owners") for mcp in dataproduct_patcher.build(): - print(json.dumps(mcp.to_obj())) + click.echo(json.dumps(mcp.to_obj())) graph.emit(mcp) diff --git a/metadata-ingestion/src/datahub/cli/timeline_cli.py b/metadata-ingestion/src/datahub/cli/timeline_cli.py index 08672528abb5da..37089e6f051f0d 100644 --- a/metadata-ingestion/src/datahub/cli/timeline_cli.py +++ b/metadata-ingestion/src/datahub/cli/timeline_cli.py @@ -6,7 +6,6 @@ import click from requests import Response -from termcolor import colored from datahub.emitter.mce_builder import dataset_urn_to_key, schema_field_urn_to_key from datahub.ingestion.graph.client import DataHubGraph, get_default_graph @@ -44,14 +43,14 @@ def pretty_id(id: Optional[str]) -> str: assert schema_field_key is not None field_path = schema_field_key.fieldPath - return f"{colored('field','cyan')}:{colored(pretty_field_path(field_path),'white')}" + return f"{click.style('field', fg='cyan')}:{click.style(pretty_field_path(field_path), fg='white')}" if id.startswith("[version=2.0]"): - return f"{colored('field','cyan')}:{colored(pretty_field_path(id),'white')}" + return f"{click.style('field', fg='cyan')}:{click.style(pretty_field_path(id), fg='white')}" if id.startswith("urn:li:dataset"): dataset_key = dataset_urn_to_key(id) if dataset_key: - return f"{colored('dataset','cyan')}:{colored(dataset_key.platform[len('urn:li:dataPlatform:'):],'white')}:{colored(dataset_key.name,'white')}" + return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:'):], fg='white')}:{click.style(dataset_key.name, fg='white')}" # failed to prettify, return original return id @@ -196,8 +195,8 
@@ def timeline( else "red" ) - print( - f"{colored(change_instant,'cyan')} - {colored(change_txn['semVer'],change_color)}" + click.echo( + f"{click.style(change_instant, fg='cyan')} - {click.style(change_txn['semVer'], fg=change_color)}" ) if change_txn["changeEvents"] is not None: for change_event in change_txn["changeEvents"]: @@ -216,8 +215,8 @@ def timeline( or change_event.get("entityUrn") or "" ) - print( - f"\t{colored(change_event.get('changeType') or change_event.get('operation'),event_change_color)} {change_event.get('category')} {target_string} {element_string}: {change_event['description']}" + click.echo( + f"\t{click.style(change_event.get('changeType') or change_event.get('operation'), fg=event_change_color)} {change_event.get('category')} {target_string} {element_string}: {change_event['description']}" ) else: click.echo( diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index a16a3df57d1bcb..81fc7e57176864 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -647,11 +647,11 @@ def _get_text_color(self, running: bool, failures: bool, warnings: bool) -> str: return "cyan" else: if failures: - return "bright_red" + return "red" elif warnings: - return "bright_yellow" + return "yellow" else: - return "bright_green" + return "green" def has_failures(self) -> bool: return bool( @@ -674,7 +674,7 @@ def pretty_print_summary( else: click.echo() click.secho("Cli report:", bold=True) - click.secho(self.cli_report.as_string()) + click.echo(self.cli_report.as_string()) click.secho(f"Source ({self.source_type}) report:", bold=True) click.echo(self.source.get_report().as_string()) click.secho(f"Sink ({self.sink_type}) report:", bold=True) diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py index d940dfd78a82ed..dd2829ba0d2365 100644 --- 
a/metadata-ingestion/src/datahub/upgrade/upgrade.py +++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py @@ -1,15 +1,14 @@ import asyncio import contextlib import logging -import sys from datetime import datetime, timedelta, timezone from functools import wraps from typing import Any, Callable, Optional, Tuple, TypeVar +import click import humanfriendly from packaging.version import Version from pydantic import BaseModel -from termcolor import colored from datahub import __version__ from datahub.cli.config_utils import load_client_config @@ -277,8 +276,8 @@ def maybe_print_upgrade_message( # noqa: C901 if not version_stats: log.debug("No version stats found") return - else: - log.debug(f"Version stats found: {version_stats}") + + log.debug(f"Version stats found: {version_stats}") current_release_date = version_stats.client.current.release_date latest_release_date = ( version_stats.client.latest.release_date @@ -325,50 +324,54 @@ def maybe_print_upgrade_message( # noqa: C901 if client_server_compat < 0: with contextlib.suppress(Exception): assert version_stats - print( - colored("❗Client-Server Incompatible❗", "yellow"), - colored( + click.echo( + click.style("❗Client-Server Incompatible❗", fg="yellow") + + " " + + click.style( f"Your client version {version_stats.client.current.version} is newer than your server version {version_stats.server.current.version}. 
Downgrading the cli to {version_stats.server.current.version} is recommended.\n", - "cyan", - ), - colored( + fg="cyan", + ) + + click.style( f"➡️ Downgrade via `\"pip install 'acryl-datahub=={version_stats.server.current.version}'\"", - "cyan", - ), + fg="cyan", + ) ) elif client_server_compat > 0: with contextlib.suppress(Exception): assert version_stats - print( - colored("❗Client-Server Incompatible❗", "red"), - colored( + click.echo( + click.style("❗Client-Server Incompatible❗", fg="red") + + " " + + click.style( f"Your client version {version_stats.client.current.version} is older than your server version {version_stats.server.current.version}. Upgrading the cli to {version_stats.server.current.version} is recommended.\n", - "cyan", - ), - colored( + fg="cyan", + ) + + click.style( f"➡️ Upgrade via \"pip install 'acryl-datahub=={version_stats.server.current.version}'\"", - "cyan", - ), + fg="cyan", + ) ) elif client_server_compat == 0 and encourage_cli_upgrade: with contextlib.suppress(Exception): - print( - colored("💡 Upgrade cli!", "yellow"), - colored( + click.echo( + click.style("💡 Upgrade cli!", fg="yellow") + + " " + + click.style( f"You seem to be running an old version of datahub cli: {current_version} {get_days(current_release_date)}. Latest version is {latest_version} {get_days(latest_release_date)}.\nUpgrade via \"pip install -U 'acryl-datahub'\"", - "cyan", - ), + fg="cyan", + ) ) elif encourage_quickstart_upgrade: try: assert version_stats - print( - colored("💡 Upgrade available!", "yellow"), - colored( + click.echo( + click.style("💡 Upgrade available!", fg="yellow") + + " " + + click.style( f'You seem to be running a slightly old quickstart image {get_days(version_stats.server.current.release_date)}. 
Run "datahub docker quickstart" to get the latest updates without losing any data!', - "cyan", + fg="cyan", ), - file=sys.stderr, + err=True, ) except Exception as e: log.debug(f"Failed to suggest quickstart upgrade due to {e}") From 91fbd12f84a36c0f2db652f64e37b164a6df5b2b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 29 Oct 2024 09:49:37 -0700 Subject: [PATCH 09/11] fix(ingest/unity): remove redundant check (#11732) --- metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index bd987c2da7c764..11827bace4b5a1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -462,8 +462,6 @@ def _create_table( datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc) if obj.updated_at else None - if obj.updated_at - else None ), updated_by=obj.updated_by, table_id=obj.table_id, From 6316e10d4815c039118bbe4703565d0ff75c5089 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 29 Oct 2024 09:50:37 -0700 Subject: [PATCH 10/11] feat(ingest): check ordering in SqlParsingAggregator tests (#11735) --- .../sql_parsing/sql_parsing_aggregator.py | 4 +- .../tests/test_helpers/mce_helpers.py | 4 + .../test_add_known_query_lineage.json | 40 ++-- .../test_column_lineage_deduplication.json | 4 +- .../aggregator_goldens/test_table_rename.json | 168 +++++++------- .../test_table_rename_with_temp.json | 4 +- .../aggregator_goldens/test_table_swap.json | 210 +++++++++--------- .../aggregator_goldens/test_temp_table.json | 4 +- .../unit/sql_parsing/test_sql_aggregator.py | 39 ++-- 9 files changed, 243 insertions(+), 234 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 
255d4c6e6bb744..5c3a6b5b533a01 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -134,7 +134,7 @@ class QueryMetadata: upstreams: List[UrnStr] # this is direct upstreams, which may be temp tables column_lineage: List[ColumnLineageInfo] - column_usage: Dict[UrnStr, Set[UrnStr]] + column_usage: Dict[UrnStr, Set[UrnStr]] # TODO: Change to an OrderedSet confidence_score: float used_temp_tables: bool = True @@ -1426,7 +1426,7 @@ def _gen_query( for upstream in query.upstreams: query_subject_urns.add(upstream) if self.generate_query_subject_fields: - for column in query.column_usage.get(upstream, []): + for column in sorted(query.column_usage.get(upstream, [])): query_subject_urns.add( builder.make_schema_field_urn(upstream, column) ) diff --git a/metadata-ingestion/tests/test_helpers/mce_helpers.py b/metadata-ingestion/tests/test_helpers/mce_helpers.py index 9ee4642bfe6eb3..3b59481d8cb022 100644 --- a/metadata-ingestion/tests/test_helpers/mce_helpers.py +++ b/metadata-ingestion/tests/test_helpers/mce_helpers.py @@ -82,6 +82,7 @@ def check_golden_file( golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), ignore_paths_v2: Sequence[str] = (), + ignore_order: bool = True, ) -> None: update_golden = pytestconfig.getoption("--update-golden-files") copy_output = pytestconfig.getoption("--copy-output-files") @@ -92,6 +93,7 @@ def check_golden_file( copy_output=copy_output, ignore_paths=ignore_paths, ignore_paths_v2=ignore_paths_v2, + ignore_order=ignore_order, ) @@ -100,6 +102,7 @@ def check_goldens_stream( outputs: List, golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), + ignore_order: bool = True, ) -> None: with tempfile.NamedTemporaryFile() as f: write_metadata_file(pathlib.Path(f.name), outputs) @@ -109,6 +112,7 @@ def check_goldens_stream( output_path=f.name, golden_path=golden_path, ignore_paths=ignore_paths, + 
ignore_order=ignore_order, ) diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json index 94c8947dba9ff1..0d8822736c95eb 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json @@ -85,26 +85,6 @@ } } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)", - "changeType": "UPSERT", - "aspectName": "operation", - "aspect": { - "json": { - "timestampMillis": 1707182625000, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, - "operationType": "INSERT", - "customProperties": { - "query_urn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f" - }, - "lastUpdatedTimestamp": 20000 - } - } -}, { "entityType": "query", "entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f", @@ -142,5 +122,25 @@ "platform": "urn:li:dataPlatform:redshift" } } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)", + "changeType": "UPSERT", + "aspectName": "operation", + "aspect": { + "json": { + "timestampMillis": 1707182625000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "operationType": "INSERT", + "customProperties": { + "query_urn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f" + }, + "lastUpdatedTimestamp": 20000 + } + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json index 
d3ec3843168188..290ee7091df491 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json @@ -164,10 +164,10 @@ "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),c)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),c)" }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index f6d781b356ee96..750b2c4a92fd0b 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -227,59 +227,6 @@ } } }, -{ - "entityType": "query", - "entityUrn": "urn:li:query:234a2904c367a6cc02d76cf358cd86937ec9e14af03e5539b5edb0b6df5db3dc", - "changeType": "UPSERT", - "aspectName": "queryProperties", - "aspect": { - "json": { - "statement": { - "value": "CREATE TABLE foo_staging AS\nSELECT\n a,\n b\nFROM foo_dep", - "language": "SQL" - }, - "source": "SYSTEM", - "created": { - "time": 0, - "actor": "urn:li:corpuser:_ingestion" - }, - "lastModified": { - "time": 1707182625000, - "actor": "urn:li:corpuser:_ingestion" - } - } - } -}, -{ - "entityType": "query", - "entityUrn": "urn:li:query:234a2904c367a6cc02d76cf358cd86937ec9e14af03e5539b5edb0b6df5db3dc", - "changeType": "UPSERT", - 
"aspectName": "querySubjects", - "aspect": { - "json": { - "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)" - }, - { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),b)" - }, - { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),a)" - }, - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)" - }, - { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),a)" - }, - { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),b)" - } - ] - } - } -}, { "entityType": "query", "entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c", @@ -291,17 +238,6 @@ } } }, -{ - "entityType": "query", - "entityUrn": "urn:li:query:234a2904c367a6cc02d76cf358cd86937ec9e14af03e5539b5edb0b6df5db3dc", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:redshift" - } - } -}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_downstream,PROD)", @@ -353,6 +289,70 @@ } } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:e4b3b60ab99e0f0bc1629ea82a5d7705a30dbd98a3923d599b39fb68624ea58d", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE TABLE foo_downstream AS\nSELECT\n a,\n b\nFROM foo_staging", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + } + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:e4b3b60ab99e0f0bc1629ea82a5d7705a30dbd98a3923d599b39fb68624ea58d", + "changeType": "UPSERT", + 
"aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),b)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_downstream,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_downstream,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_downstream,PROD),b)" + } + ] + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:e4b3b60ab99e0f0bc1629ea82a5d7705a30dbd98a3923d599b39fb68624ea58d", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:redshift" + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)", @@ -406,13 +406,13 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:e4b3b60ab99e0f0bc1629ea82a5d7705a30dbd98a3923d599b39fb68624ea58d", + "entityUrn": "urn:li:query:234a2904c367a6cc02d76cf358cd86937ec9e14af03e5539b5edb0b6df5db3dc", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { "json": { "statement": { - "value": "CREATE TABLE foo_downstream AS\nSELECT\n a,\n b\nFROM foo_staging", + "value": "CREATE TABLE foo_staging AS\nSELECT\n a,\n b\nFROM foo_dep", "language": "SQL" }, "source": "SYSTEM", @@ -429,43 +429,43 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:e4b3b60ab99e0f0bc1629ea82a5d7705a30dbd98a3923d599b39fb68624ea58d", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:redshift" - } - } -}, -{ - 
"entityType": "query", - "entityUrn": "urn:li:query:e4b3b60ab99e0f0bc1629ea82a5d7705a30dbd98a3923d599b39fb68624ea58d", + "entityUrn": "urn:li:query:234a2904c367a6cc02d76cf358cd86937ec9e14af03e5539b5edb0b6df5db3dc", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { "json": { "subjects": [ { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)" + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),b)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),a)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),a)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),b)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_downstream,PROD)" + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_downstream,PROD),a)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),a)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_downstream,PROD),b)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD),b)" } ] } } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:234a2904c367a6cc02d76cf358cd86937ec9e14af03e5539b5edb0b6df5db3dc", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:redshift" + } + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json 
b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json index abae5da02135d7..a4ac349c3c455c 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json @@ -85,10 +85,10 @@ "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),b)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),a)" }, { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),a)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),b)" }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json index 1992bced039be0..171a1bd3753e24 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json @@ -177,24 +177,6 @@ } } }, -{ - "entityType": "query", - "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559", - "changeType": "UPSERT", - "aspectName": "querySubjects", - "aspect": { - "json": { - "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)" - }, - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_backup,PROD)" - } - ] - } - } -}, { "entityType": "query", "entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", @@ -220,24 +202,28 @@ }, { "entityType": "query", - 
"entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559", + "entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", "changeType": "UPSERT", - "aspectName": "queryProperties", + "aspectName": "querySubjects", "aspect": { "json": { - "statement": { - "value": "CREATE TABLE person_info_backup AS\nSELECT\n *\nFROM person_info_swap", - "language": "SQL" - }, - "source": "SYSTEM", - "created": { - "time": 0, - "actor": "urn:li:corpuser:_ingestion" - }, - "lastModified": { - "time": 1707182625000, - "actor": "urn:li:corpuser:_ingestion" - } + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)" + } + ] } } }, @@ -279,7 +265,30 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", + "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE TABLE person_info_backup AS\nSELECT\n *\nFROM person_info_swap", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + } + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559", "changeType": 
"UPSERT", "aspectName": "querySubjects", "aspect": { @@ -289,16 +298,7 @@ "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)" - }, - { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" - }, - { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)" - }, - { - "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)" + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_backup,PROD)" } ] } @@ -342,17 +342,40 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", + "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + } + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { "json": { "subjects": [ { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_incremental,PROD)" + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)" + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_incremental,PROD)" } ] } @@ 
-362,22 +385,10 @@ "entityType": "query", "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", "changeType": "UPSERT", - "aspectName": "queryProperties", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "statement": { - "value": "CREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep", - "language": "SQL" - }, - "source": "SYSTEM", - "created": { - "time": 0, - "actor": "urn:li:corpuser:_ingestion" - }, - "lastModified": { - "time": 1707182625000, - "actor": "urn:li:corpuser:_ingestion" - } + "platform": "urn:li:dataPlatform:snowflake" } } }, @@ -421,31 +432,13 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", - "changeType": "UPSERT", - "aspectName": "querySubjects", - "aspect": { - "json": { - "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)" - }, - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_incremental,PROD)" - } - ] - } - } -}, -{ - "entityType": "query", - "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", + "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { "json": { "statement": { - "value": "INSERT INTO person_info_swap\nSELECT\n *\nFROM person_info_incremental", + "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap", "language": "SQL" }, "source": "SYSTEM", @@ -462,7 +455,25 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", + "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)" + } + ] + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -473,13 +484,13 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", + "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { "json": { "statement": { - "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap", + "value": "INSERT INTO person_info_swap\nSELECT\n *\nFROM person_info_incremental", "language": "SQL" }, "source": "SYSTEM", @@ -498,16 +509,23 @@ "entityType": "query", "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "querySubjects", "aspect": { "json": { - "platform": "urn:li:dataPlatform:snowflake" + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_incremental,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)" + } + ] } } }, { "entityType": "query", - "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", + "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -515,23 +533,5 @@ "platform": "urn:li:dataPlatform:snowflake" } } -}, -{ - "entityType": "query", - "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", - "changeType": 
"UPSERT", - "aspectName": "querySubjects", - "aspect": { - "json": { - "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)" - }, - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)" - } - ] - } - } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json index b348785d064314..bcd31b0aa02490 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json @@ -281,10 +281,10 @@ "json": { "subjects": [ { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session3,PROD)" + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session3,PROD)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index eb64efd6693def..b1ad9eb5c15d76 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -1,3 +1,4 @@ +import functools import os import pathlib from datetime import datetime, timezone @@ -31,6 +32,10 @@ RESOURCE_DIR = pathlib.Path(__file__).parent / "aggregator_goldens" FROZEN_TIME = "2024-02-06T01:23:45Z" +check_goldens_stream = functools.partial( + mce_helpers.check_goldens_stream, ignore_order=False +) + def _ts(ts: int) -> datetime: return datetime.fromtimestamp(ts, tz=timezone.utc) @@ -56,7 +61,7 @@ def test_basic_lineage(pytestconfig: pytest.Config, tmp_path: pathlib.Path) -> N mcps = 
list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_basic_lineage.json", @@ -108,7 +113,7 @@ def test_overlapping_inserts(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_overlapping_inserts.json", @@ -167,7 +172,7 @@ def test_temp_table(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_temp_table.json", @@ -229,7 +234,7 @@ def test_multistep_temp_table(pytestconfig: pytest.Config) -> None: ) == 4 ) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_multistep_temp_table.json", @@ -305,7 +310,7 @@ def test_overlapping_inserts_from_temp_tables(pytestconfig: pytest.Config) -> No assert len(report.queries_with_non_authoritative_session) == 1 mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_overlapping_inserts_from_temp_tables.json", @@ -354,7 +359,7 @@ def test_aggregate_operations(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_aggregate_operations.json", @@ -392,7 +397,7 @@ def test_view_lineage(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_view_lineage.json", @@ -423,7 +428,7 @@ def test_known_lineage_mapping(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - 
mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_known_lineage_mapping.json", @@ -461,7 +466,7 @@ def test_column_lineage_deduplication(pytestconfig: pytest.Config) -> None: # not get any credit for a and b, as they are already covered by query 2, # which came later and hence has higher precedence. - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_column_lineage_deduplication.json", @@ -506,7 +511,7 @@ def test_add_known_query_lineage(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_add_known_query_lineage.json", @@ -564,7 +569,7 @@ def test_table_rename(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_table_rename.json", @@ -624,7 +629,7 @@ def test_table_rename_with_temp(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_table_rename_with_temp.json", @@ -711,7 +716,7 @@ def test_table_swap(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_table_swap.json", @@ -881,7 +886,7 @@ def test_table_swap_with_temp(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_table_swap_with_temp.json", @@ -908,7 +913,7 @@ def test_create_table_query_mcps(pytestconfig: pytest.Config) -> None: mcps = 
list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_create_table_query_mcps.json", @@ -943,7 +948,7 @@ def test_table_lineage_via_temp_table_disordered_add( mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR @@ -993,7 +998,7 @@ def test_basic_usage(pytestconfig: pytest.Config) -> None: mcps = list(aggregator.gen_metadata()) - mce_helpers.check_goldens_stream( + check_goldens_stream( pytestconfig, outputs=mcps, golden_path=RESOURCE_DIR / "test_basic_usage.json", From 55e3d1d977e51b9225967c73860ba75fe55cbb7a Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Wed, 30 Oct 2024 04:56:12 +0900 Subject: [PATCH 11/11] feat(docs-website): init solution pages (#11533) Co-authored-by: jayacryl <159848059+jayacryl@users.noreply.github.com> --- docs-website/docusaurus.config.js | 21 +- .../src/pages/_components/Community/index.js | 2 +- .../pages/_components/Hero/hero.module.scss | 3 + .../src/pages/_components/Hero/index.js | 4 +- .../_components/QuickstartContent/index.js | 2 +- .../QuickstartContent/quickstartContent.js | 49 +-- .../quickstartcontent.module.scss | 2 +- .../src/pages/_components/Trial/index.js | 2 +- .../src/pages/cloud/FeatureCards/index.js | 6 +- .../CaseStudy/case-study.module.scss | 176 +++++++++ .../solutions/_components/CaseStudy/index.js | 34 ++ .../_components/Hero/hero.module.scss | 181 +++++++++ .../pages/solutions/_components/Hero/index.js | 45 +++ .../_components/Integrations/index.js | 54 +++ .../Integrations/integrations.module.scss | 127 +++++++ .../_components/IntegrationsStatic/index.js | 34 ++ .../integrations.module.scss | 107 ++++++ .../solutions/_components/Persona/index.js | 46 +++ .../_components/Persona/styles.module.scss | 163 +++++++++ .../_components/QuickstartContent/index.js | 63 ++++ 
.../quickstartcontent.module.scss | 118 ++++++ .../_components/SlidingTabs/index.js | 69 ++++ .../SlidingTabs/styles.module.scss | 171 +++++++++ .../_components/Testimonials/index.js | 29 ++ .../Testimonials/styles.module.scss | 99 +++++ .../solutions/_components/Tiles/index.js | 80 ++++ .../_components/Tiles/tiles.module.scss | 203 +++++++++++ .../solutions/_components/Trials/index.js | 79 ++++ .../_components/Trials/styles.module.scss | 259 +++++++++++++ .../_components/UnifiedTabs/index.js | 42 +++ .../UnifiedTabs/styles.module.scss | 124 +++++++ .../_content/discoveryCaseStudyContent.js | 26 ++ .../_content/discoveryHeroContent.js | 8 + .../_content/discoveryQuickstartContent.js | 16 + .../_content/discoveryTestimonialsContent.js | 10 + .../_content/discoveryTilesContent.js | 67 ++++ .../_content/discoveryTrialsContent.js | 24 ++ .../src/pages/solutions/discovery/index.js | 62 ++++ .../solutions/discovery/styles.module.scss | 21 ++ .../_content/governanceCaseStudyContent.js | 26 ++ .../_content/governanceHeroContent.js | 8 + .../_content/governancePersonaContent.js | 31 ++ .../_content/governanceQuickstartContent.js | 12 + .../_content/governanceTestimonialsContent.js | 10 + .../_content/governanceTilesContent.js | 51 +++ .../_content/governanceTrialsContent.js | 27 ++ .../src/pages/solutions/governance/index.js | 66 ++++ .../solutions/governance/styles.module.scss | 21 ++ .../_content/observeCaseStudyContent.js | 26 ++ .../_content/observeHeroContent.js | 8 + .../_content/observeQuickstartContent.js | 16 + .../_content/observeResourceContent.js | 61 ++++ .../_content/observeTestimonialsContent.js | 10 + .../_content/observeTilesContent.js | 77 ++++ .../_content/observeTrialsContent.js | 24 ++ .../_content/observeUnifiedTabsContent.js | 22 ++ .../pages/solutions/observability/index.js | 121 +++++++ .../observability/styles.module.scss | 342 ++++++++++++++++++ .../img/solutions/case-study-card-bg.png | Bin 0 -> 35189 bytes .../static/img/solutions/checkout.jpg | 
Bin 0 -> 9215 bytes .../static/img/solutions/communities.png | Bin 0 -> 1038 bytes .../img/solutions/discovery-icons-group.png | Bin 0 -> 10360 bytes .../static/img/solutions/discovery-tile-1.png | Bin 0 -> 81971 bytes .../static/img/solutions/discovery-tile-2.png | Bin 0 -> 71238 bytes .../static/img/solutions/discovery-tile-3.png | Bin 0 -> 100125 bytes .../static/img/solutions/discovery-tile-4.png | Bin 0 -> 89074 bytes .../static/img/solutions/discovery-tile-5.png | Bin 0 -> 52258 bytes .../static/img/solutions/discovery-tile-6.png | Bin 0 -> 115754 bytes .../static/img/solutions/discovery-tile-7.png | Bin 0 -> 76235 bytes .../static/img/solutions/discovery-tile-8.png | Bin 0 -> 76449 bytes .../static/img/solutions/discovery-tile-9.png | Bin 0 -> 57895 bytes .../img/solutions/governance-icons-group.png | Bin 0 -> 8120 bytes .../img/solutions/governance-tile-1.png | Bin 0 -> 57833 bytes .../img/solutions/governance-tile-2.png | Bin 0 -> 215534 bytes .../img/solutions/governance-tile-3.png | Bin 0 -> 40833 bytes .../img/solutions/governance-tile-4.png | Bin 0 -> 41970 bytes .../img/solutions/governance-tile-5.png | Bin 0 -> 48883 bytes .../img/solutions/governance-tile-6.png | Bin 0 -> 45595 bytes .../img/solutions/governance-tile-7.png | Bin 0 -> 23796 bytes .../static/img/solutions/hero-background.png | Bin 0 -> 1211394 bytes .../static/img/solutions/hero-discovery.png | Bin 0 -> 141868 bytes .../static/img/solutions/hero-governance.png | Bin 0 -> 140875 bytes .../static/img/solutions/hero-observe.png | Bin 0 -> 148466 bytes .../static/img/solutions/icon-calendar.png | Bin 0 -> 1506 bytes .../static/img/solutions/icon-cloud.png | Bin 0 -> 1999 bytes .../static/img/solutions/icon-metric.png | Bin 0 -> 1652 bytes .../static/img/solutions/icon-migration.png | Bin 0 -> 2544 bytes .../static/img/solutions/icon-revenue.png | Bin 0 -> 1651 bytes .../static/img/solutions/icon-wrench.png | Bin 0 -> 2199 bytes .../logo-integration-1.png | Bin 0 -> 2583 bytes 
.../logo-integration-2.png | Bin 0 -> 2565 bytes .../logo-integration-3.png | Bin 0 -> 2505 bytes .../logo-integration-4.png | Bin 0 -> 1809 bytes .../logo-integration-5.png | Bin 0 -> 2329 bytes .../logo-integration-6.png | Bin 0 -> 1415 bytes .../integrations/logo-integration-1.png | Bin 0 -> 2565 bytes .../integrations/logo-integration-10.png | Bin 0 -> 2329 bytes .../integrations/logo-integration-11.png | Bin 0 -> 1415 bytes .../integrations/logo-integration-2.png | Bin 0 -> 2505 bytes .../integrations/logo-integration-3.png | Bin 0 -> 1809 bytes .../integrations/logo-integration-4.png | Bin 0 -> 2583 bytes .../integrations/logo-integration-5.png | Bin 0 -> 2329 bytes .../integrations/logo-integration-6.png | Bin 0 -> 1415 bytes .../integrations/logo-integration-7.png | Bin 0 -> 5380 bytes .../integrations/logo-integration-8.png | Bin 0 -> 2122 bytes .../integrations/logo-integration-9.png | Bin 0 -> 2350 bytes docs-website/static/img/solutions/lock.png | Bin 0 -> 650 bytes .../static/img/solutions/logo-depop.png | Bin 0 -> 10580 bytes .../static/img/solutions/logo-dpg-media.png | Bin 0 -> 3036 bytes .../static/img/solutions/logo-myob.png | Bin 0 -> 3371 bytes .../static/img/solutions/logo-notion.png | Bin 0 -> 3684 bytes docs-website/static/img/solutions/miro.png | Bin 0 -> 59741 bytes .../img/solutions/observe-icons-group.png | Bin 0 -> 10381 bytes .../static/img/solutions/observe-tile-1.png | Bin 0 -> 109201 bytes .../static/img/solutions/observe-tile-10.png | Bin 0 -> 29301 bytes .../static/img/solutions/observe-tile-11.png | Bin 0 -> 40331 bytes .../static/img/solutions/observe-tile-2.png | Bin 0 -> 439262 bytes .../static/img/solutions/observe-tile-3.png | Bin 0 -> 346834 bytes .../static/img/solutions/observe-tile-4.png | Bin 0 -> 92044 bytes .../static/img/solutions/observe-tile-5.png | Bin 0 -> 125545 bytes .../static/img/solutions/observe-tile-6.png | Bin 0 -> 93459 bytes .../static/img/solutions/observe-tile-7.png | Bin 0 -> 84272 bytes 
.../static/img/solutions/observe-tile-8.png | Bin 0 -> 51472 bytes .../static/img/solutions/observe-tile-9.png | Bin 0 -> 34695 bytes docs-website/static/img/solutions/optum.jpg | Bin 0 -> 14585 bytes .../solutions/persona_compliance_officers.png | Bin 0 -> 26357 bytes .../img/solutions/persona_developers.png | Bin 0 -> 30635 bytes .../img/solutions/persona_owners_and_smes.png | Bin 0 -> 29764 bytes .../static/img/solutions/personas-mobile.png | Bin 0 -> 36243 bytes .../static/img/solutions/rocket-launch.png | Bin 0 -> 1135 bytes .../static/img/solutions/sliding-tab-bg.png | Bin 0 -> 184843 bytes .../static/img/solutions/trial-icon-alert.svg | 6 + .../img/solutions/trial-icon-language.png | Bin 0 -> 3669 bytes .../img/solutions/trial-icon-lightening.png | Bin 0 -> 3604 bytes .../static/img/solutions/trial-icon-link.svg | 6 + .../static/img/solutions/trial-icon-lock.svg | 6 + .../static/img/solutions/trial-icon-owner.png | Bin 0 -> 5317 bytes .../img/solutions/trial-icon-standard.png | Bin 0 -> 3651 bytes .../static/img/solutions/trial-icon-star.svg | 6 + .../img/solutions/unified-tab-detect.png | Bin 0 -> 165544 bytes .../img/solutions/unified-tab-resolve.png | Bin 0 -> 164268 bytes .../img/solutions/unified-tab-unify.png | Bin 0 -> 123372 bytes .../static/img/solutions/water-lock.png | Bin 0 -> 801 bytes docs-website/static/img/solutions/wolt.png | Bin 0 -> 4492 bytes 144 files changed, 3577 insertions(+), 34 deletions(-) create mode 100644 docs-website/src/pages/solutions/_components/CaseStudy/case-study.module.scss create mode 100644 docs-website/src/pages/solutions/_components/CaseStudy/index.js create mode 100644 docs-website/src/pages/solutions/_components/Hero/hero.module.scss create mode 100644 docs-website/src/pages/solutions/_components/Hero/index.js create mode 100644 docs-website/src/pages/solutions/_components/Integrations/index.js create mode 100644 docs-website/src/pages/solutions/_components/Integrations/integrations.module.scss create mode 100644 
docs-website/src/pages/solutions/_components/IntegrationsStatic/index.js create mode 100644 docs-website/src/pages/solutions/_components/IntegrationsStatic/integrations.module.scss create mode 100644 docs-website/src/pages/solutions/_components/Persona/index.js create mode 100644 docs-website/src/pages/solutions/_components/Persona/styles.module.scss create mode 100644 docs-website/src/pages/solutions/_components/QuickstartContent/index.js create mode 100644 docs-website/src/pages/solutions/_components/QuickstartContent/quickstartcontent.module.scss create mode 100644 docs-website/src/pages/solutions/_components/SlidingTabs/index.js create mode 100644 docs-website/src/pages/solutions/_components/SlidingTabs/styles.module.scss create mode 100644 docs-website/src/pages/solutions/_components/Testimonials/index.js create mode 100644 docs-website/src/pages/solutions/_components/Testimonials/styles.module.scss create mode 100644 docs-website/src/pages/solutions/_components/Tiles/index.js create mode 100644 docs-website/src/pages/solutions/_components/Tiles/tiles.module.scss create mode 100644 docs-website/src/pages/solutions/_components/Trials/index.js create mode 100644 docs-website/src/pages/solutions/_components/Trials/styles.module.scss create mode 100644 docs-website/src/pages/solutions/_components/UnifiedTabs/index.js create mode 100644 docs-website/src/pages/solutions/_components/UnifiedTabs/styles.module.scss create mode 100644 docs-website/src/pages/solutions/discovery/_content/discoveryCaseStudyContent.js create mode 100644 docs-website/src/pages/solutions/discovery/_content/discoveryHeroContent.js create mode 100644 docs-website/src/pages/solutions/discovery/_content/discoveryQuickstartContent.js create mode 100644 docs-website/src/pages/solutions/discovery/_content/discoveryTestimonialsContent.js create mode 100644 docs-website/src/pages/solutions/discovery/_content/discoveryTilesContent.js create mode 100644 
docs-website/src/pages/solutions/discovery/_content/discoveryTrialsContent.js create mode 100644 docs-website/src/pages/solutions/discovery/index.js create mode 100644 docs-website/src/pages/solutions/discovery/styles.module.scss create mode 100644 docs-website/src/pages/solutions/governance/_content/governanceCaseStudyContent.js create mode 100644 docs-website/src/pages/solutions/governance/_content/governanceHeroContent.js create mode 100644 docs-website/src/pages/solutions/governance/_content/governancePersonaContent.js create mode 100644 docs-website/src/pages/solutions/governance/_content/governanceQuickstartContent.js create mode 100644 docs-website/src/pages/solutions/governance/_content/governanceTestimonialsContent.js create mode 100644 docs-website/src/pages/solutions/governance/_content/governanceTilesContent.js create mode 100644 docs-website/src/pages/solutions/governance/_content/governanceTrialsContent.js create mode 100644 docs-website/src/pages/solutions/governance/index.js create mode 100644 docs-website/src/pages/solutions/governance/styles.module.scss create mode 100644 docs-website/src/pages/solutions/observability/_content/observeCaseStudyContent.js create mode 100644 docs-website/src/pages/solutions/observability/_content/observeHeroContent.js create mode 100644 docs-website/src/pages/solutions/observability/_content/observeQuickstartContent.js create mode 100644 docs-website/src/pages/solutions/observability/_content/observeResourceContent.js create mode 100644 docs-website/src/pages/solutions/observability/_content/observeTestimonialsContent.js create mode 100644 docs-website/src/pages/solutions/observability/_content/observeTilesContent.js create mode 100644 docs-website/src/pages/solutions/observability/_content/observeTrialsContent.js create mode 100644 docs-website/src/pages/solutions/observability/_content/observeUnifiedTabsContent.js create mode 100644 docs-website/src/pages/solutions/observability/index.js create mode 100644 
docs-website/src/pages/solutions/observability/styles.module.scss create mode 100644 docs-website/static/img/solutions/case-study-card-bg.png create mode 100644 docs-website/static/img/solutions/checkout.jpg create mode 100644 docs-website/static/img/solutions/communities.png create mode 100644 docs-website/static/img/solutions/discovery-icons-group.png create mode 100644 docs-website/static/img/solutions/discovery-tile-1.png create mode 100644 docs-website/static/img/solutions/discovery-tile-2.png create mode 100644 docs-website/static/img/solutions/discovery-tile-3.png create mode 100644 docs-website/static/img/solutions/discovery-tile-4.png create mode 100644 docs-website/static/img/solutions/discovery-tile-5.png create mode 100644 docs-website/static/img/solutions/discovery-tile-6.png create mode 100644 docs-website/static/img/solutions/discovery-tile-7.png create mode 100644 docs-website/static/img/solutions/discovery-tile-8.png create mode 100644 docs-website/static/img/solutions/discovery-tile-9.png create mode 100644 docs-website/static/img/solutions/governance-icons-group.png create mode 100644 docs-website/static/img/solutions/governance-tile-1.png create mode 100644 docs-website/static/img/solutions/governance-tile-2.png create mode 100644 docs-website/static/img/solutions/governance-tile-3.png create mode 100644 docs-website/static/img/solutions/governance-tile-4.png create mode 100644 docs-website/static/img/solutions/governance-tile-5.png create mode 100644 docs-website/static/img/solutions/governance-tile-6.png create mode 100644 docs-website/static/img/solutions/governance-tile-7.png create mode 100644 docs-website/static/img/solutions/hero-background.png create mode 100644 docs-website/static/img/solutions/hero-discovery.png create mode 100644 docs-website/static/img/solutions/hero-governance.png create mode 100644 docs-website/static/img/solutions/hero-observe.png create mode 100644 docs-website/static/img/solutions/icon-calendar.png create mode 
100644 docs-website/static/img/solutions/icon-cloud.png create mode 100644 docs-website/static/img/solutions/icon-metric.png create mode 100644 docs-website/static/img/solutions/icon-migration.png create mode 100644 docs-website/static/img/solutions/icon-revenue.png create mode 100644 docs-website/static/img/solutions/icon-wrench.png create mode 100644 docs-website/static/img/solutions/integrations-observe/logo-integration-1.png create mode 100644 docs-website/static/img/solutions/integrations-observe/logo-integration-2.png create mode 100644 docs-website/static/img/solutions/integrations-observe/logo-integration-3.png create mode 100644 docs-website/static/img/solutions/integrations-observe/logo-integration-4.png create mode 100644 docs-website/static/img/solutions/integrations-observe/logo-integration-5.png create mode 100644 docs-website/static/img/solutions/integrations-observe/logo-integration-6.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-1.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-10.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-11.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-2.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-3.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-4.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-5.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-6.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-7.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-8.png create mode 100644 docs-website/static/img/solutions/integrations/logo-integration-9.png create mode 100644 docs-website/static/img/solutions/lock.png create mode 100644 
docs-website/static/img/solutions/logo-depop.png create mode 100644 docs-website/static/img/solutions/logo-dpg-media.png create mode 100644 docs-website/static/img/solutions/logo-myob.png create mode 100644 docs-website/static/img/solutions/logo-notion.png create mode 100644 docs-website/static/img/solutions/miro.png create mode 100644 docs-website/static/img/solutions/observe-icons-group.png create mode 100644 docs-website/static/img/solutions/observe-tile-1.png create mode 100644 docs-website/static/img/solutions/observe-tile-10.png create mode 100644 docs-website/static/img/solutions/observe-tile-11.png create mode 100644 docs-website/static/img/solutions/observe-tile-2.png create mode 100644 docs-website/static/img/solutions/observe-tile-3.png create mode 100644 docs-website/static/img/solutions/observe-tile-4.png create mode 100644 docs-website/static/img/solutions/observe-tile-5.png create mode 100644 docs-website/static/img/solutions/observe-tile-6.png create mode 100644 docs-website/static/img/solutions/observe-tile-7.png create mode 100644 docs-website/static/img/solutions/observe-tile-8.png create mode 100644 docs-website/static/img/solutions/observe-tile-9.png create mode 100644 docs-website/static/img/solutions/optum.jpg create mode 100644 docs-website/static/img/solutions/persona_compliance_officers.png create mode 100644 docs-website/static/img/solutions/persona_developers.png create mode 100644 docs-website/static/img/solutions/persona_owners_and_smes.png create mode 100644 docs-website/static/img/solutions/personas-mobile.png create mode 100644 docs-website/static/img/solutions/rocket-launch.png create mode 100644 docs-website/static/img/solutions/sliding-tab-bg.png create mode 100644 docs-website/static/img/solutions/trial-icon-alert.svg create mode 100644 docs-website/static/img/solutions/trial-icon-language.png create mode 100644 docs-website/static/img/solutions/trial-icon-lightening.png create mode 100644 
docs-website/static/img/solutions/trial-icon-link.svg create mode 100644 docs-website/static/img/solutions/trial-icon-lock.svg create mode 100644 docs-website/static/img/solutions/trial-icon-owner.png create mode 100644 docs-website/static/img/solutions/trial-icon-standard.png create mode 100644 docs-website/static/img/solutions/trial-icon-star.svg create mode 100644 docs-website/static/img/solutions/unified-tab-detect.png create mode 100644 docs-website/static/img/solutions/unified-tab-resolve.png create mode 100644 docs-website/static/img/solutions/unified-tab-unify.png create mode 100644 docs-website/static/img/solutions/water-lock.png create mode 100644 docs-website/static/img/solutions/wolt.png diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 066135b85c0add..8c44b3c8b40bfa 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -88,7 +88,26 @@ module.exports = { }, items: [ { - to: "cloud/", + type: "dropdown", + label: "Solutions", + position: "right", + items: [ + { + to: "/solutions/discovery", + label: "Discovery", + }, + { + to: "/solutions/observability", + label: "Observability", + }, + { + to: "/solutions/governance", + label: "Governance", + }, + ] + }, + { + to: "/cloud", activeBasePath: "cloud", label: "Cloud", position: "right", diff --git a/docs-website/src/pages/_components/Community/index.js b/docs-website/src/pages/_components/Community/index.js index a4f2b2304e51e4..20917332c443d2 100644 --- a/docs-website/src/pages/_components/Community/index.js +++ b/docs-website/src/pages/_components/Community/index.js @@ -2,7 +2,7 @@ import React, { useState, useRef, useEffect } from "react"; import styles from "./community.module.scss"; import useBaseUrl from "@docusaurus/useBaseUrl"; -const TARGET_COUNT = 11535; +const TARGET_COUNT = 12219; const INCREMENT = 1; const Community = () => { diff --git a/docs-website/src/pages/_components/Hero/hero.module.scss 
b/docs-website/src/pages/_components/Hero/hero.module.scss index c7f4ec6a0b78f9..1850757bd454c2 100644 --- a/docs-website/src/pages/_components/Hero/hero.module.scss +++ b/docs-website/src/pages/_components/Hero/hero.module.scss @@ -233,6 +233,9 @@ } .hero__cta { margin-top: 12px; + .cta__primary { + margin-bottom: 4px; + } } .hero__footer_cta { margin-top: 12px; diff --git a/docs-website/src/pages/_components/Hero/index.js b/docs-website/src/pages/_components/Hero/index.js index ca94203e94c063..a61b9d8a402bdf 100644 --- a/docs-website/src/pages/_components/Hero/index.js +++ b/docs-website/src/pages/_components/Hero/index.js @@ -69,7 +69,7 @@ const Hero = ({ onOpenTourModal }) => {
- Book a Demo + Get Cloud {
- Get started with Core → + Get started with Open Source →
diff --git a/docs-website/src/pages/_components/QuickstartContent/index.js b/docs-website/src/pages/_components/QuickstartContent/index.js index 302fe7f5e8382b..44410a3838e263 100644 --- a/docs-website/src/pages/_components/QuickstartContent/index.js +++ b/docs-website/src/pages/_components/QuickstartContent/index.js @@ -47,7 +47,7 @@ const QuickstartContent = ({}) => { >
The only platform you need.
- Unified Discovery, Observability, and Governance for Data and AI. + Unified discovery, observability, and governance for data and AI.
Eliminate breaking changes with detailed cross-platform and column-level lineage. Build confidence in your data with a comprehensive view of business, operational, and technical context, all in one place.", - image: "/img/quickstart_discovery.png", - }, - { - heading: "Observability", - title: "Build trust in your data", - description: - "Effortlessly detect data quality issues with automated checks and AI-driven anomaly detection. Notify your team where they work when issues arise and keep stakeholders in the loop with centralized incident tracking. Spend minutes, not days, resolving issues with detailed lineage, documentation, and ownership information all in one place.", - image: "/img/quickstart_observability.png", - }, - { - heading: "Governance", - title: "Minimize compliance risk, effortlessly", - description: - "Ensure every data asset is accounted for and responsibility governed by defining and enforcing documentation standards. Automate your governance program to automatically classify assets as they evolve over time. Minimize redundant, manual work with GenAI documentation, AI-driven classification, smart propagation, and more.", - image: "/img/quickstart_governance.png", - }, - ]; - - export default quickstartData; \ No newline at end of file + { + heading: "Discovery", + title: "Make data democratization a reality", + description: + "Enable everyone in your organization to effortlessly discover trustworthy data, with experiences tailored for each persona.Eliminate breaking changes with detailed cross-platform and column-level lineage. Build confidence in your data with a comprehensive view of business, operational, and technical context, all in one place.", + image: "/img/quickstart_discovery.png", + }, + { + heading: "Observability", + title: "Build trust in your data", + description: + "Effortlessly detect data quality issues with automated checks and AI-driven anomaly detection. 
Notify your team where they work when issues arise and keep stakeholders in the loop with centralized incident tracking. Spend minutes, not days, resolving issues with detailed lineage, documentation, and ownership information all in one place.", + image: "/img/quickstart_observability.png", + }, + { + heading: "Governance", + title: "Minimize compliance risk, effortlessly", + description: + "Ensure every data asset is accounted for and responsibility governed by defining and enforcing documentation standards. Automate your governance program to automatically classify assets as they evolve over time. Minimize redundant, manual work with GenAI documentation, AI-driven classification, smart propagation, and more.", + image: "/img/quickstart_governance.png", + }, +]; + +export default quickstartData; \ No newline at end of file diff --git a/docs-website/src/pages/_components/QuickstartContent/quickstartcontent.module.scss b/docs-website/src/pages/_components/QuickstartContent/quickstartcontent.module.scss index 57a3037efda199..2e0bc41af278f0 100644 --- a/docs-website/src/pages/_components/QuickstartContent/quickstartcontent.module.scss +++ b/docs-website/src/pages/_components/QuickstartContent/quickstartcontent.module.scss @@ -259,4 +259,4 @@ } } } -} +} \ No newline at end of file diff --git a/docs-website/src/pages/_components/Trial/index.js b/docs-website/src/pages/_components/Trial/index.js index 9fc71b50336b00..bd170f3d21e6b8 100644 --- a/docs-website/src/pages/_components/Trial/index.js +++ b/docs-website/src/pages/_components/Trial/index.js @@ -20,7 +20,7 @@ const Trial = ({onOpenTourModal}) => { onClick={onOpenTourModal} >Product Tour
- Get started with Core → + Get started with Open Source →
diff --git a/docs-website/src/pages/cloud/FeatureCards/index.js b/docs-website/src/pages/cloud/FeatureCards/index.js index 4a45cbcbe17174..52c7a5eec46b4b 100644 --- a/docs-website/src/pages/cloud/FeatureCards/index.js +++ b/docs-website/src/pages/cloud/FeatureCards/index.js @@ -9,7 +9,7 @@ const data = { { title: "Data Discovery", icon: "/img/assets/data-discovery.svg", - cloudPageLink: "https://www.acryldata.io/acryl-datahub", + cloudPageLink: "/solutions/discovery", cloudBenefits: [ { text: "Enhanced search ranking", link: "" }, // → { text: "Personalization for every persona", link: "" }, // → @@ -27,7 +27,7 @@ const data = { { title: "Data Observability", icon: "/img/assets/data-ob.svg", - cloudPageLink: "https://www.acryldata.io/observe", + cloudPageLink: "/solutions/observability", cloudBenefits: [ { text: "Continuous data quality monitors", link: "" }, // → { text: "End-to-end data incident tracking & management", link: "" }, // → @@ -45,7 +45,7 @@ const data = { { title: "Data Governance", icon: "/img/assets/data-governance.svg", - cloudPageLink: "https://www.acryldata.io/acryl-datahub#governance", + cloudPageLink: "/solutions/governance", cloudBenefits: [ { text: "Human-assisted asset certification workflows", link: "" }, // → { text: "Automations to enforce governance standards", link: "" }, // → diff --git a/docs-website/src/pages/solutions/_components/CaseStudy/case-study.module.scss b/docs-website/src/pages/solutions/_components/CaseStudy/case-study.module.scss new file mode 100644 index 00000000000000..c41a3bde6a52af --- /dev/null +++ b/docs-website/src/pages/solutions/_components/CaseStudy/case-study.module.scss @@ -0,0 +1,176 @@ +.container { + display: flex; + flex-direction: column; + font-family: "Manrope"; +} +.case_study { + display: flex; + flex-direction: column; + width: 100vw; + margin: 5rem auto; + + .case_study_heading { + color: var(--primitives-text-text-heading, #373A47); + text-align: center; + font-family: Manrope; + font-size: 
29.177px; + font-style: normal; + font-weight: 400; + line-height: normal; + letter-spacing: 0.292px; + } + + .card_row::-webkit-scrollbar { + display: none; + } + .card_row { + overflow-x: scroll; + width: 100vw; + scrollbar-width: none; + display: flex; + margin-top: 2rem; + + .card_row_wrapper { + display: flex; + flex-direction: row; + align-items: center; + margin: auto; + } + .cardLink { + color: #000; + + &:hover { + text-decoration: none; + } + } + + .card { + max-width: 419px; + max-height: 148px; + padding: 1.2rem; + margin: 1rem 0.5rem; + display: flex; + flex-direction: column; + align-items: center; + border-radius: var(--number-scales-2s-32, 32px); + border: 1px solid var(--semantics-bg-bg-white, #99999930); + background: var(--primitives-grays-1, #9999991a); + + transition: all .3s ease-in-out; + &:hover { + box-shadow: 0px 1px 4px 1px #0000001C; + border-color: #33333340; + background: var(--primitives-grays-1, #dddddd1a); + } + + .card_image { + border-radius: 16px; + display: flex; + justify-content: center; + align-items: center; + padding: 0; + margin: auto; + margin-right: 16px; + + } + + .card_heading_div { + width: 70%; + padding: 12px; + + .card_heading { + color: var(--primitives-grays-9, #484C5C); + font-family: Manrope; + font-size: 1.3rem; + font-style: normal; + font-weight: 400; + line-height: normal; + -webkit-line-clamp: 3; + -webkit-box-orient: vertical; + overflow: hidden; + display: -webkit-box; + + } + } + } + + } + + a:hover { + text-decoration: none !important; + } + + .bottom_line { + cursor: pointer; + text-decoration: none; + display: flex; + align-items: center; + justify-content: center; + color: #12b0fb; + font-size: 1.1rem; + font-weight: 500; + margin-top: 50px; + margin-bottom: 10px; + + span { + line-height: 10px; + font-size: 1.5rem; + margin-left: 10px; + } + } +} + +@media (max-width: 800px) { + .case_study { + .case_study_heading { + text-align: center; + font-family: "Manrope"; + width: 80%; + margin: auto; + 
font-size: 1.5rem; + line-height: normal; + font-weight: 400; + } + + .card_row { + margin-top: 16px; + display: flex; + justify-content: flex-start; + + .card_row_wrapper { + padding: 0 0; + width: 100vh; + align-items: flex-start; + justify-content: flex-start; + } + + .card { + min-width: 240px; + padding: 0.8rem; + margin: 0.5rem; + display: flex; + + .card_image { + width: 50px; + height: 50px; + margin: 16px auto; + } + + .card_heading_div { + text-align: left; + padding: 8px; + + .card_heading { + font-size: 1rem; + line-height: 1.2rem; + } + } + } + } + + .bottom_line { + font-size: 1rem; + margin-top: 40px; + } + } +} diff --git a/docs-website/src/pages/solutions/_components/CaseStudy/index.js b/docs-website/src/pages/solutions/_components/CaseStudy/index.js new file mode 100644 index 00000000000000..9ebffa006e570f --- /dev/null +++ b/docs-website/src/pages/solutions/_components/CaseStudy/index.js @@ -0,0 +1,34 @@ +import React from "react"; +import styles from "./case-study.module.scss"; +import clsx from "clsx"; +import Link from '@docusaurus/Link' + +const CaseStudy = ({ caseStudyContent }) => { + const { title, backgroundColor, items } = caseStudyContent; + return ( +
+
+
+ {title} +
+ +
+
+ {items.map((caseStudy) => ( + +
+ {caseStudy.alt} +
+
+
{caseStudy.title}
+
+ + ))} +
+
+
+
+ ); +}; + +export default CaseStudy; diff --git a/docs-website/src/pages/solutions/_components/Hero/hero.module.scss b/docs-website/src/pages/solutions/_components/Hero/hero.module.scss new file mode 100644 index 00000000000000..ef2d90404ed08c --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Hero/hero.module.scss @@ -0,0 +1,181 @@ +.hero__container { + display: flex; + flex-direction: column; + align-items: center; + gap: 16px; + background-color: #F9F9FA; + background-image: url(/img/solutions/hero-background.png); + background-position: bottom 0rem center; + background-size: contain; + background-repeat: no-repeat; + max-height: 890px; + min-height: 85vh; + border-radius: 48px; + margin: 3rem 3rem 12rem 3rem; + padding: 5rem; + + .hero__topQuote { + color: var(--primitives-text-tex-subtext, #777E99); + font-family: Manrope; + font-size: 1.2rem; + font-style: normal; + font-weight: 500; + line-height: normal; + letter-spacing: 0.2rem; + text-transform: uppercase; + margin-top: 5vh; + } + + .hero__title { + color: #373A47; + font-family: Manrope; + font-size: 4rem; + font-style: normal; + font-weight: 300; + line-height: 4.25rem; + max-width: 720px; + text-align: center; + } + + .hero__description { + color: var(--primitives-text-text-heading, #373A47); + text-align: center; + font-family: Manrope; + font-size: 1.25rem; + font-style: normal; + font-weight: 400; + line-height: 2.25rem; + max-width: 800px; + margin-top: 1rem; + } + + .cta__tertiary { + color: black; + font-size: 1rem; + margin-top: .5rem; + font-weight: 500; + } + + .cta__tertiary:hover { + color: black; + text-decoration: none; + opacity: .8; + } + + .hero__cta { + margin-top: 0rem; + display: flex; + + a { + cursor: pointer; + font-size: 1rem; + background-color: white; + padding: 4px 20px; + border-radius: 50px; + margin: 0 10px 0 0; + font-weight: 600; + text-decoration: none; + transition: background-color .2s ease-in-out; + &:hover { + opacity : 0.8; + } + } + + .cta__primary { 
+ padding: var(--number-scales-2s-12, 8px) var(--number-scales-2s-24, 24px); + justify-content: center; + align-items: center; + gap: var(--number-scales-2s-8, 8px); + border-radius: var(--number-scales-2s-full, 999px); + background: var(--semantics-surface-default, #1890FF); + color: white; + } + + .cta__secondary { + padding: var(--number-scales-2s-12, 8px) var(--number-scales-2s-24, 24px); + justify-content: center; + align-items: center; + gap: var(--number-scales-2s-8, 8px); + border-radius: var(--number-scales-2s-full, 999px); + border: 1px solid var(--semantics-border-default, #1890FF); + background: var(--semantics-bg-bg-white, #FFF); + } + } + + .hero__img_container { + position: relative; + + .hero__img { + z-index: 10; + height: 40vh; + min-height: 320px; + margin-bottom: -8rem; + } + + .hero__img_gradient { + position: absolute; + right: 0; + left: 0; + bottom: 0; + margin-bottom: -8rem; + height: 60%; + width: min-content; + background: linear-gradient(to top, #FFFFFF 15.52%, transparent); + } + } + + // Mobile adjustments + @media (max-width: 768px) { + max-height: auto; + padding: 1.25rem; + margin: 1rem .5rem 6rem .5rem; + border-radius: 24px; + + .hero__topQuote { + font-size: 1rem; + margin-top: 3vh; + text-align: center; + } + + .hero__title { + font-size: 3rem; + line-height: 3.5rem; + max-width: 100%; + } + + .hero__description { + font-size: 1rem; + line-height: 1.75rem; + max-width: 100%; + } + + .hero__img_container { + .hero__img { + height: 30vh; + min-height: 200px; + margin-bottom: -6rem; + object-fit: contain; + } + } + + .cta__tertiary { + font-size: 0.9rem; + } + + .hero__cta { + a { + margin: 0.5rem; + padding: 8px 16px; + font-size: 0.9rem; + } + + .cta__primary { + background: #1890FF; + } + + .cta__secondary { + border: 1px solid #1890FF; + } + } + } +} diff --git a/docs-website/src/pages/solutions/_components/Hero/index.js b/docs-website/src/pages/solutions/_components/Hero/index.js new file mode 100644 index 
00000000000000..fd50f2871b9eb4 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Hero/index.js @@ -0,0 +1,45 @@ +import React from 'react'; +import styles from './hero.module.scss'; +import Link from "@docusaurus/Link"; + +const Hero = ({ onOpenTourModal, heroContent }) => { + const { topQuote, title, description, imgSrc } = heroContent + return ( +
+
+
+ {topQuote} +
+
+ {title} +
+

{description}

+
+ + Get Cloud + + + Product Tour + +
+ + Start with Open Source → + +
+
+ DataHub Platform Preview +
+
+
+
+ ); +}; + +export default Hero; \ No newline at end of file diff --git a/docs-website/src/pages/solutions/_components/Integrations/index.js b/docs-website/src/pages/solutions/_components/Integrations/index.js new file mode 100644 index 00000000000000..77f028eb4cf74a --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Integrations/index.js @@ -0,0 +1,54 @@ +import React, { useRef, useEffect } from "react"; +import styles from "./integrations.module.scss"; +import useBaseUrl from "@docusaurus/useBaseUrl"; + +const Integrations = () => { + const integrationsPath = 'img/solutions/integrations'; + const hasAnimatedRef = useRef(false); + const counterRef = useRef(null); + const handleScroll = () => { + if (hasAnimatedRef.current) return; + if (!counterRef.current) return; + + const { top } = counterRef.current.getBoundingClientRect(); + const windowHeight = window.innerHeight; + + if (top <= windowHeight) { + hasAnimatedRef.current = true; + animateNumber(); + } + }; + + useEffect(() => { + window.addEventListener('scroll', handleScroll); + return () => { + window.removeEventListener('scroll', handleScroll); + } + }, []) + + return ( +
+
+ Integrates with your data stack +
+
+
+
+
+ {[...Array(3)].map((_, i) => ( + + {[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11].map((item, index) => ( +
+
+ ))} +
+ ))} +
+
+
+
+
+ ); +}; + +export default Integrations; diff --git a/docs-website/src/pages/solutions/_components/Integrations/integrations.module.scss b/docs-website/src/pages/solutions/_components/Integrations/integrations.module.scss new file mode 100644 index 00000000000000..da0c6964e8775f --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Integrations/integrations.module.scss @@ -0,0 +1,127 @@ +.container { + display: flex; + flex-direction: column; + + .section_header { + color: var(--primitives-text-tex-subtext, #777E99); + text-align: center; + font-family: "Helvetica Neue"; + font-size: 1.5rem; + font-style: normal; + font-weight: 400; + line-height: normal; + letter-spacing: 0.5px; + + margin-bottom: 2rem; + position: relative; + padding: 0 2rem; + display: block; + width: 100%; + + &:before, &:after { + content: " "; + height: 1px; + width: calc((100vw - 48rem)/2); + background: #D9DBE4; + display: block; + position: absolute; + top: 50%; + } + + &:before { + left: 8rem; + } + + &:after { + right: 8rem; + } + } +} + + +.carouselContainer { + width: 100%; + min-height: 100px; + min-width: 400px; + overflow: hidden; +} +@media screen and (max-width: 800px) { + .carouselContainer { + min-width: auto; + } +} + +.slider { + height: 100px; + margin: 1rem auto; + overflow: hidden; + position: relative; + display: flex; + align-items: center; + width: 100%; + + &::before, + &::after { + position: absolute; + content: ''; + width: 90%; + height: 100%; + z-index: 9; + } + +} + +.slider { + position: relative; +} + +.slide_track { + display: flex; + width: max-content; + animation: scroll 30s linear infinite; +} + +.slide { + width: 100px; + height: 100px; + margin-left: 52px; + display: flex; + justify-content: space-between; + overflow: hidden; + flex-direction: column; + align-items: center; + position: relative; + background-size: cover; + background-repeat: no-repeat; +} + +@keyframes scroll { + 0% { + transform: translateX(0); + } + + 100% { + transform: 
translateX(-50%); + } +} + + +@keyframes slideIn { + 0% { + opacity: 0; + transform: translateY(20px); + } + + 100% { + opacity: 1; + transform: translateY(0); + } +} + +@media only screen and (max-width: 800px) { + .slide { + width: 80px; + height: 80px; + margin: auto 1rem; + } +} \ No newline at end of file diff --git a/docs-website/src/pages/solutions/_components/IntegrationsStatic/index.js b/docs-website/src/pages/solutions/_components/IntegrationsStatic/index.js new file mode 100644 index 00000000000000..76b99b156704e0 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/IntegrationsStatic/index.js @@ -0,0 +1,34 @@ +import React, { useRef, useEffect } from "react"; +import styles from "./integrations.module.scss"; +import useBaseUrl from "@docusaurus/useBaseUrl"; + +const Integrations = () => { + const integrationsPath = 'img/solutions/integrations-observe'; + + + return ( +
+
+ Integrates with your data stack +
+
+
+
+
+ {[...Array(1)].map((_, i) => ( + + {[1, 2, 3, 4, 5, 6].map((item, index) => ( +
+
+ ))} +
+ ))} +
+
+
+
+
+ ); +}; + +export default Integrations; diff --git a/docs-website/src/pages/solutions/_components/IntegrationsStatic/integrations.module.scss b/docs-website/src/pages/solutions/_components/IntegrationsStatic/integrations.module.scss new file mode 100644 index 00000000000000..aa2201fd0185c4 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/IntegrationsStatic/integrations.module.scss @@ -0,0 +1,107 @@ +.container { + display: flex; + flex-direction: column; + + .section_header { + color: var(--primitives-text-tex-subtext, #777E99); + text-align: center; + font-family: "Helvetica Neue"; + font-size: 1.5rem; + font-style: normal; + font-weight: 400; + line-height: normal; + letter-spacing: 0.5px; + + margin-bottom: 2rem; + position: relative; + padding: 0 2rem; + display: block; + width: 100%; + + &:before, &:after { + content: " "; + height: 1px; + width: calc((100vw - 65rem)/2); + background: #D9DBE4; + display: block; + position: absolute; + top: 50%; + } + + &:before { + left: 20rem; + } + + &:after { + right: 20rem; + } + } +} + + +.carouselContainer { + width: 100%; + min-height: 100px; + min-width: 400px; +} + +.slider { + height: 100px; + margin: 1rem auto; + overflow: hidden; + position: relative; + display: flex; + align-items: center; + width: 100%; + + &::before, + &::after { + position: absolute; + content: ''; + width: 90%; + height: 100%; + z-index: 9; + } + +} + +.carouselContainer { + overflow: hidden; +} + +.slider { + position: relative; +} + +.slide_track { + display: flex; + width: max-content; + margin: auto; +} + +.slide { + width: 100px; + height: 100px; + margin: auto 3rem; + display: flex; + justify-content: space-between; + overflow: hidden; + flex-direction: column; + align-items: center; + position: relative; + background-size: cover; + background-repeat: no-repeat; +} + + +@media only screen and (max-width: 800px) { + .slider { + max-width: 100vw; + min-width: auto; + } + .slide { + width: 80px; + height: 80px; + margin: 
auto 1rem; + } +} \ No newline at end of file diff --git a/docs-website/src/pages/solutions/_components/Persona/index.js b/docs-website/src/pages/solutions/_components/Persona/index.js new file mode 100644 index 00000000000000..d377de28525bd0 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Persona/index.js @@ -0,0 +1,46 @@ +import React from "react"; +import styles from "./styles.module.scss"; +import clsx from "clsx"; + +const Persona = ({ personaContent }) => { + const { title, personas } = personaContent; + + return ( +
+
+
{title}
+ +
+
+
+ {personas.map((persona, index) => ( +
+
+ {persona.alt} +
+
+ + + + +
+
+ ))} +
+
+
+ Persona +
+
+
+
+ ); +}; + +const FeatureItem = ({ text }) => ( +
+ {text} +
+); + +export default Persona; diff --git a/docs-website/src/pages/solutions/_components/Persona/styles.module.scss b/docs-website/src/pages/solutions/_components/Persona/styles.module.scss new file mode 100644 index 00000000000000..3568cc2b75611d --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Persona/styles.module.scss @@ -0,0 +1,163 @@ +.container { + padding: 5rem; + padding-bottom: 0; + display: flex; + position: relative; + justify-content: center; + align-items: center; + background: linear-gradient(30deg, #EBEBEB 15.52%, #FFF 85.84%); +} + +.personas { + width: 100%; +} + +.persona_heading { + color: var(--primitives-text-tex-subtext, #777E99); + text-align: center; + font-family: Manrope; + font-size: 2rem; + font-style: normal; + font-weight: 500; + line-height: normal; + letter-spacing: 0.34px; + margin-bottom: 2rem; + position: relative; + padding: 0 2rem; + display: block; + width: 100%; + + &:before, &:after { + content: " "; + height: 2px; + width: calc((100vw - 1500px)/2); + background: #D9DBE4; + display: block; + position: absolute; + top: 50%; + } + + &:before { + left: 8rem; + } + + &:after { + right: 8rem; + } +} + +.persona_row { + display: flex; + justify-content: center; + gap: 50px; + position: relative; + padding-top: 20px; /* Add some padding to give space above the line */ +} + +.persona_row_mobile { + display: none; +} + +.persona_row_wrapper { + display: flex; + justify-content: center; + align-items: center; + background: inherit; /* Inherit the container's gradient */ +} + +.persona { + display: flex; + flex-direction: column; + align-items: center; + min-width: 365px; + margin: 2rem; + margin-bottom: 0; + height: 100%; + background: inherit; /* Allow persona card to have gradient */ +} + +.persona_img { + margin-bottom: 12px; + z-index: 1; +} + +.persona_img img { + height: 200px; +} + +.features { + display: flex; + flex-direction: column; + align-items: center; +} + +.featureItem { + display: flex; + padding: 
0.625rem; + border-radius: 0.375rem; + background: white; + opacity: 0.8; + color: var(--primitives-text-tex-subtext, #777E99); + font-family: Manrope; + font-size: 1rem; + font-style: normal; + font-weight: 400; + line-height: normal; + letter-spacing: 0.01rem; + text-transform: capitalize; + margin: 0.5rem; + justify-content: center; + width: max-content; +} + +.card_gradient { + position: absolute; + right: 0; + left: 0; + bottom: 0; + height: 40%; + background: linear-gradient(to top, #EBEBEB 15.52%, transparent); +} + +.persona_bg_line { + width: 100%; + max-width: 900px; + height: 0; + border-bottom: 1px dashed #aaa; + background-size: contain; + position: absolute; + top: 28%; + margin: auto; +} + + +@media (max-width: 768px) { + .container { + padding: 4rem 2rem; + max-width: 100vw; + } + + .persona_heading { + font-size: 1.4rem; + } + .persona_row { + display: none !important; + } + .persona_row_mobile { + display: flex !important; + flex-direction: column; + align-items: center; + } + + .persona_row_wrapper { + display: block !important; + } + + .persona { + margin: 3rem auto; + } + + .card_gradient { + height: 15%; + } +} diff --git a/docs-website/src/pages/solutions/_components/QuickstartContent/index.js b/docs-website/src/pages/solutions/_components/QuickstartContent/index.js new file mode 100644 index 00000000000000..422a7b16cc216a --- /dev/null +++ b/docs-website/src/pages/solutions/_components/QuickstartContent/index.js @@ -0,0 +1,63 @@ +import React, { useEffect, useRef, useState } from "react"; +import clsx from "clsx"; +import useBaseUrl from "@docusaurus/useBaseUrl"; +import styles from "./quickstartcontent.module.scss"; +import { motion, useScroll, useTransform} from 'framer-motion'; + +const QuickstartContent = ({ quickstartContent }) => { + const scrollableElement = useRef(null) + const { scrollYProgress } = useScroll({ + target: scrollableElement, + offset: ["start end", "end end"] + }) + const scaleBar = useTransform(scrollYProgress, [0, 
0.2, .9, 1], [0, 0, .8, 1]); + const opacityBar = useTransform(scrollYProgress, [0, 0.2, 0.4], [0, 0, 1]); + + return ( +
+
+ + {quickstartContent.map((data, idx) => ( + +
+ +
+
+
{data.title}
+
+
+ ))} +
+
+ ); +}; + +export default QuickstartContent; \ No newline at end of file diff --git a/docs-website/src/pages/solutions/_components/QuickstartContent/quickstartcontent.module.scss b/docs-website/src/pages/solutions/_components/QuickstartContent/quickstartcontent.module.scss new file mode 100644 index 00000000000000..bf9369929112af --- /dev/null +++ b/docs-website/src/pages/solutions/_components/QuickstartContent/quickstartcontent.module.scss @@ -0,0 +1,118 @@ +@media only screen and (max-width: 800px) { + .quickstart { + margin: 6rem auto!important; + width: 95vw!important; + } +} +.quickstart { + width: 80vw; + margin: 12rem auto; + display: flex; + flex-direction: column; + + :global { + + .quickstart__bar { + width: 4px; + height: 100%; + position: absolute; + + background: linear-gradient(180deg, #546167 0.71%, #58595f 95.37%, #69cfff 100%); + border-radius: 10px; + transform-origin: top; + // transition: transform 0.3s; + // animation: progress 0.3s linear; + + /* Inside auto layout */ + display: block; + } + + .quickstart__container { + position: relative; + } + + .quickstart__content { + display: flex; + margin: 3rem; + width: 100%; + + .quickstart__text { + width: 60%; + min-width: 600px; + padding-right: 2rem; + display: flex; + justify-content: center; + flex-direction: column; + + div { + padding-left: 1rem; + } + + .quickstart__text__head { + + /* H4 | Semibold */ + font-family: 'Manrope'; + font-style: normal; + font-weight: 400; + font-size: 2rem; + line-height: 2.5rem; + + color: #171B2B; + margin-top: 1rem; + } + + } + + .quickstart__img { + display: flex; + align-items: flex-start; + justify-content: center; + width: 20%; + + img { + width: 2rem; + min-width: 100px; + } + } + } + + + @media only screen and (max-width: 800px) { + .quickstart__bar { + display: none; + } + + .quickstart__container { + max-width: 95% !important; + } + + .quickstart__content { + display: flex; + margin: 2rem; + + .quickstart__text { + min-width: 0; + width: 100%; + 
padding-left: 40px; + padding-right: 0; + + div { + padding-left: 0; + } + .quickstart__text__head { + font-size: 1.25rem; + font-weight: 400; + line-height: 1.75rem; + margin: 0 !important; + padding-right: 1.5rem; + } + } + .quickstart__img { + display: flex; + min-width: 50px; + margin: auto; + } + } + } + } +} diff --git a/docs-website/src/pages/solutions/_components/SlidingTabs/index.js b/docs-website/src/pages/solutions/_components/SlidingTabs/index.js new file mode 100644 index 00000000000000..2dca5f63ae765a --- /dev/null +++ b/docs-website/src/pages/solutions/_components/SlidingTabs/index.js @@ -0,0 +1,69 @@ +import React, { useState } from 'react'; +import styles from './styles.module.scss'; +import clsx from 'clsx'; + +const TabbedComponent = () => { + const [activeTab, setActiveTab] = useState(0); + + const tabs = [ + { + title: 'Deploy with enterprise-grade security', + description: 'Acryl Observe deploys and runs in your own VPC, offering pre-configured support for advanced networking features—like AWS PrivateLink, or Private Google Access—to facilitate secure, private connectivity to cloud services. Plus, both Observe and Acryl Cloud are certified to meet rigorous compliance and security standards, like SOC 2.', + icon: "/img/solutions/lock.png", + }, + { + title: 'Scale from Zero to Infinity', + description: 'Acryl Observe is built for any scale. Leveraging the power of Acryl Cloud, Observe can scale to support data warehouses with petabytes of data in tens of thousands of tables—and tens of billions of rows. And because it’s a fully managed SaaS offering, it’s also ideal for small organizations still building out their data ecosystems.', + icon: "/img/solutions/rocket-launch.png", + }, + { + title: 'Reduce tool clutter and operational burden', + description: 'Simplify your stack. Avoid duplication across tools by unifying data discovery, data governance, and data quality into one central tool. 
Skip spending countless engineering hours maintaining inaccessible, code-first data quality frameworks', + icon: "/img/solutions/communities.png", + }, + { + title: 'Reduce the risk of vendor lock-in', + description: 'Get the benefits of open source in a fully managed, limitlessly scalable SaaS offering. Acryl Observe and Acryl Cloud are built on top of the DataHub Project, proven open-source technology with an active, thriving community of contributors and users. Customers get 100% compatibility with open-source DataHub, plus regular updates and improvements, source code transparency, community-based support, proven security, and protection against vendor lock-in.', + icon: "/img/solutions/water-lock.png", + } + ]; + + return ( +
+
+
+
+ Secure. Scalable.
Simple. Open. +
+
+ {tabs.map((tab, index) => ( + +
+ + {activeTab === index && ( +
+ {tab.description} +
+ )} +
+
+ ))} +
+
+
+
+
+
+
+ ); +}; + +export default TabbedComponent; diff --git a/docs-website/src/pages/solutions/_components/SlidingTabs/styles.module.scss b/docs-website/src/pages/solutions/_components/SlidingTabs/styles.module.scss new file mode 100644 index 00000000000000..982234a2b721ec --- /dev/null +++ b/docs-website/src/pages/solutions/_components/SlidingTabs/styles.module.scss @@ -0,0 +1,171 @@ +.tabbedComponent { + padding-top: 48px; + display: flex; + flex-direction: column; + align-items: center; +} + +.leftSection { + padding-left: 4rem; +} + +.title { + color: #373A47; + font-family: Manrope; + font-size: 3.6rem; + font-style: normal; + font-weight: 500; + line-height: 4.5rem; + letter-spacing: -0.18rem; + text-align: left; + margin-bottom: 3rem; + + .titleBlue { + color: var(--semantics-border-default, #1890FF); + } +} + +.container { + display: flex; + flex-direction: column; // Changed to column for mobile view + background: white; + overflow: hidden; + width: 100vw; +} + +.tabs { + display: flex; + flex-direction: column; + justify-content: flex-start; + text-align: left !important; +} + +.tab { + align-items: center; + margin: 0.5rem 0; + position: relative; + + &.activeTab { + border-left: 3px solid #4C49E4; + .tabTitle { + color: black !important; + } + .icon { + filter: brightness(0); + } + + } + + .tabButton { + padding: 0rem 1rem 1rem 1rem; + background: none; + border: none; + cursor: pointer; + display: flex; + align-items: center; + width: 100%; + justify-content: left; + text-align: left; + + .tabTitle { + color: var(--primitives-text-tex-subtext, #777E99); + font-family: Manrope; + font-size: 1.45rem; + font-style: normal; + font-weight: 600; + line-height: normal; + padding-left: 1rem; + transition: all .3s; + } + .icon { + transition: all .3s; + } + &:hover { + .tabTitle { + color: black; + } + .icon { + filter: brightness(0); + } + } + } + + .dropdown { + background-color: #ffffff; + margin-top: 5px; + padding: 0rem 1.5rem 0.5rem 1.5rem; + color: #777E99; + 
font-family: Manrope; + font-size: 1.25rem; + font-style: normal; + font-weight: 500; + line-height: 2rem; /* 160% */ + } +} + +.imageContainer { + justify-content: right; + background-color: transparent; + margin: 1rem 0; + height: 520px; + align-self: center; + width: 40%; + border-radius: 24px; + display: flex; + flex-grow: 1; +} + +.tabImage { + width: 100%; + height: 100%; + display: flex; + background-size: contain; + background-repeat: no-repeat; + background-position: right center; +} + + + +@media (min-width: 801px) { /* desktop layout starts just above the 800px mobile breakpoint so the two queries never match together */ + .container { + flex-direction: row; // Change back to row for larger screens + padding: 40px 0px; + } + .tabs { + width: 800px; + } + + .imageContainer { + margin: 1rem 0rem 1rem 1rem; + } +} + +@media (max-width: 800px) { + .title { + font-size: 2.5rem; + line-height: 3rem; + } + .tabButton { + .icon { + height: 24px; + width: 24px; + } + .tabTitle { + font-size: 1rem!important; + padding-left: .25rem!important; + } + } + .dropdown { + font-size: .9rem!important; + line-height: 1.5rem!important; + } + + .imageContainer { + display: none !important; + } + + .leftSection { + padding: 2rem; + } + +} \ No newline at end of file diff --git a/docs-website/src/pages/solutions/_components/Testimonials/index.js b/docs-website/src/pages/solutions/_components/Testimonials/index.js new file mode 100644 index 00000000000000..54aaca0ebb5f85 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Testimonials/index.js @@ -0,0 +1,29 @@ +import React, { useEffect, useRef, useState } from "react"; +import clsx from "clsx"; +import useBaseUrl from "@docusaurus/useBaseUrl"; +import styles from "./styles.module.scss"; + +const Testimonials = ({ testimonialsData }) => { + const { title, feature1, feature2, feature1Link, feature2Link, imgSrc } = testimonialsData; + return ( +
+
+
+
+
+ {title} +
+ + Seamlessly integrated with DataHub Cloud's
{feature1} and {feature2} solutions. +
+
+
+ +
+
+
+
+ ); +}; + +export default Testimonials; diff --git a/docs-website/src/pages/solutions/_components/Testimonials/styles.module.scss b/docs-website/src/pages/solutions/_components/Testimonials/styles.module.scss new file mode 100644 index 00000000000000..ce49d7b4f6fa11 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Testimonials/styles.module.scss @@ -0,0 +1,99 @@ +.testimonials { + background: linear-gradient(90deg, #F8F9F9 0%, #E9EAEC 100%); + + :global { + .testimonials__content { + width: 80vw; + max-width: 1200px; + margin: 0 auto; + position: relative; + padding: 4.5rem 0; + + .testimonials__card { + margin: 0rem 4rem; + display: flex; + + .testimonials__logo { + min-width: 100px; + margin: 2rem auto; + img { + max-width: 100%; + } + } + + .testimonials__text { + width: 94%; + padding-left: 2rem; + color: #2e2e38; + + .testimonials__quote_title { + font-family: "Manrope"; + font-style: normal; + font-weight: 500; + font-size: 3.5rem; + line-height: 120%; + position: relative; + margin: 1.8rem auto; + } + + .testimonials__quote_description { + font-size: 1.8rem; + margin-top: 0.5rem; + color: #656c77; + font-weight: 300; + line-height: normal; + } + + .testimonials__quote_black { + color: #2E2E38; + + &:hover { + text-decoration: none; + opacity: 0.8; + cursor: pointer; + } + } + } + } + } + + @media only screen and (max-width: 800px) { + .testimonials__content { + width: 100vw; + padding: 2rem 0; + text-align: center; + + .testimonials__card { + flex-direction: column; + margin: 2rem 1rem; + + .testimonials__logo { + max-width: 40px; + margin-bottom: 16px; + } + + .testimonials__text { + width: 100%; + padding-left: 0; + max-width: 100%; + .testimonials__quote_title { + font-size: 2.25rem; + } + .testimonials__quote_description { + font-size: 1.5rem; + } + .testimonials__quote { + font-size: 1.1rem; + line-height: 1.75rem; + } + .testimonials__company { + font-size: 1rem; + } + } + } + } + + } + } + } + \ No newline at end of file diff --git 
a/docs-website/src/pages/solutions/_components/Tiles/index.js b/docs-website/src/pages/solutions/_components/Tiles/index.js new file mode 100644 index 00000000000000..3b087e97ae20bb --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Tiles/index.js @@ -0,0 +1,80 @@ +import React from "react"; +import styles from "./tiles.module.scss"; +import useBaseUrl from "@docusaurus/useBaseUrl"; +import clsx from "clsx"; + +const Tiles = ({ tilesContent }) => { + const { title, theme, tileItems } = tilesContent; + + const sectionThemeClass = theme === "dark" ? styles.darkSection : styles.lightSection; + const itemThemeClass = theme === "dark" ? styles.darkItem : styles.lightItem; + const diagramItemThemeClass = theme === "dark" ? styles.darkDiagramItem : styles.lightDiagramItem; + + return ( +
+
+
+
+
+
+
+
+ {tileItems.map((item, index) => ( +
+ {index % 2 !== 0 ? ( + <> +
+ {item.title} +
+
+
+
{item.title}
+
{item.subtitle}
+
+
+ + ) : ( + <> +
+
+
{item.title}
+
{item.subtitle}
+
+
+
+ {item.title} +
+ + )} +
+ ))} +
+
+
+
+
+ ); +}; + +export default Tiles; diff --git a/docs-website/src/pages/solutions/_components/Tiles/tiles.module.scss b/docs-website/src/pages/solutions/_components/Tiles/tiles.module.scss new file mode 100644 index 00000000000000..a71b0b8445541d --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Tiles/tiles.module.scss @@ -0,0 +1,203 @@ +.lightSection { + background: #FAFAFA; +} + +.darkSection { + background: white; +} + +.ecosystem_section { + padding: 4vh 0; + padding-bottom: 6vh; + display: flex; + justify-content: center; + + .ecosystem_section_content { + width: 70%; + height: 100%; + display: flex; + flex-direction: column; + justify-content: center; + padding: 2rem 0rem; + text-align: left; + + .ecosystem_section_upper_content { + padding-top: 1rem; + padding-left: 0; + display: flex; + justify-content: flex-start; + + .ecosystem_section_heading { + line-height: 4rem; + font-family: Manrope; + font-family: Manrope; + font-size: 3.25rem; + font-style: normal; + font-weight: 500; + line-height: normal; + mix-blend-mode: luminosity; + width: 100%; + } + } + + .ecosystem_section_lower_content { + margin-top: 48px; + display: flex; + flex-direction: row; + justify-content: center; + + .itemWrappers { + display: flex; + flex-direction: column; + align-self: stretch; + + .itemWrapper { + display: flex; + padding: 4rem 0px; + width: 100%; + align-items: center; + justify-content: space-between; + flex-direction: row; + flex-wrap: nowrap; + } + + .alternate { + .item { + margin-right: 2rem; + margin-left: 0; + } + } + + .item { + display: flex; + justify-content: center; + align-items: flex-start; + padding: 0; + margin: 0; + margin-left: 4rem; + flex-grow: 1; + max-width: 520px; + + .item__title { + font-family: Manrope; + font-size: 2.25rem; + font-style: normal; + font-weight: 500; + line-height: normal; + margin-bottom: 1rem; + } + + .item__subtitle { + font-family: Manrope; + font-size: 1.25rem; + font-style: normal; + font-weight: 400; + 
line-height: normal; + color: #777E99; + } + } + + .diagramItem { + display: flex; + overflow: hidden; + height: 400px; + min-width: 400px; + max-width: 400px; + justify-content: flex-end; + align-items: center; + border-radius: var(--number-scales-2s-20, 32px); + border: 0.5px solid #1890FF; + background: #FAFAFA; + } + } + } + + .item, .diagramItem { + margin: 8px; + } + } +} +@media only screen and (max-width: 800px) { + .ecosystem_section { + padding: 1rem 0; + .ecosystem_section_content { + width: 90%; + min-width: 0; + height: auto !important; + padding-bottom: 48px; + padding-top: 48px; + + .ecosystem_section_upper_content { + height: auto !important; + + .ecosystem_section_heading { + font-size: 1.75rem; + line-height: 2.25rem; + font-weight: 600; + text-align: center; + width: 90%; + margin: auto; + } + } + + .ecosystem_section_lower_content { + height: auto; + margin-top: 8px; + + .diagramItem { + width: 100%; + height: auto; + border: none; + } + .itemWrappers { + .itemWrapper { + flex-direction: column-reverse; + &.alternate { + flex-direction: column; + } + width: 95vw; + padding: 2rem 0; + align-items: center; + margin: auto; + + } + + .diagramItem { + height: auto; + width: 95vw; + aspect-ratio: 1; + min-width: auto; + margin: auto; + } + + .item { + text-align: center; + padding : 1rem; + margin: 2rem 0!important; + + .item__title { + font-size: 1.5rem; + } + .item__subtitle { + font-size: 1rem; + } + } + } + } + } + } +} + +.darkDiagramItem { + background: linear-gradient(180deg, #F5EFF8 0%, #F1F4F4 100%); +} + +.lightDiagramItem { + background: linear-gradient(37deg, rgba(238, 240, 242, 0.90) 2.81%, rgba(253, 249, 241, 0.90) 107.71%); +} + +.diagramItem__img { + overflow: hidden; + width: 100%; + height: auto; +} diff --git a/docs-website/src/pages/solutions/_components/Trials/index.js b/docs-website/src/pages/solutions/_components/Trials/index.js new file mode 100644 index 00000000000000..ecca6685810b83 --- /dev/null +++ 
b/docs-website/src/pages/solutions/_components/Trials/index.js @@ -0,0 +1,79 @@ +import React from "react"; +import styles from "./styles.module.scss"; +import useBaseUrl from "@docusaurus/useBaseUrl"; +import clsx from "clsx"; +import Link from "@docusaurus/Link"; + +const Trials = ({ onOpenTourModal, trialsContent }) => { + const { title, trialsCardItems } = trialsContent; + + return ( +
+
+
+
+ Discover. Observe. Govern. +

{title}

+
+ Book a Demo + Product Tour +
+ + Get started with Open Source → + +
+
+
+
+
+
+
+ {trialsCardItems.slice(0, 2).map((item, index) => ( +
+ {item.title} + {item.title.split("\n").map((line, idx) => ( + + {line} +
+
+ ))} +
+ ))} +
+
+ {trialsCardItems.slice(2).map((item, index) => ( +
+ {item.title} + {item.title.split("\n").map((line, idx) => ( + + {line} +
+
+ ))} +
+ ))} +
+
+
+
+
+ ); +}; + +export default Trials; diff --git a/docs-website/src/pages/solutions/_components/Trials/styles.module.scss b/docs-website/src/pages/solutions/_components/Trials/styles.module.scss new file mode 100644 index 00000000000000..9a991c4af36abc --- /dev/null +++ b/docs-website/src/pages/solutions/_components/Trials/styles.module.scss @@ -0,0 +1,259 @@ +.container { + display: flex; + flex-direction: column; + background: #fafafa; + width: 80vw; + min-width: 900px; + max-width: 1200px; + margin: 0 auto; +} +.trial { + height: 600px; + background: white; + display: flex; + justify-content: space-between; + + .trial_left { + width: 55%; + height: 100%; + display: flex; + justify-content: center; + align-items: center; + + .left_content { + flex-grow: 1; + padding-right: 24px; + + .trial_title { + font-weight: 300; + } + + span { + color: #8088a3; + font-size: 1.5rem; + font-weight: 400; + } + p { + color: #2e2e38; + font-size: 3rem; + font-weight: 400; + line-height: normal; + } + .btn_div { + display: flex; + gap: 1rem; + margin-bottom: 1rem; + + a:first-child { + cursor: pointer; + text-decoration: none; + display: inline-block; + font-size: 1rem; + background-color: #1890ff; + padding: 4px 20px; + border-radius: 50px; + margin: 0 0 0 0; + color: white; + transition: opacity .2s ease-in-out; + &:hover { + opacity: .9; + } + } + + a:nth-child(2) { + cursor: pointer; + text-decoration: none; + display: inline-block; + font-size: 1rem; + padding: 4px 20px; + border-radius: 50px; + margin: 0 0 0 0; + background-color: transparent; + color: #1890ff; + border: 1px solid #1890ff; + transition: background-color .2s ease-in-out; + &:hover { + background-color: #1890ff1A; + } + } + } + + .start_arrow { + margin-top: .5rem; + display: inline-block; + font-size: 1.1rem; + color: #1890ff; + font-weight: 500; + cursor: pointer; + text-decoration: none; + } + } + } + .trial_right { + width: 45%; + max-width: 520px; + height: 100%; + position: relative; + overflow: hidden; + + 
.gradientTop, .gradientBottom { + position: absolute; + background: linear-gradient(#FFFFFF, #FFFFFF00); + height: 100px; + width: 100%; + z-index: 10; + } + .gradientBottom { + transform: rotate(180deg); + bottom: 0; + } + + .right_content { + height: 100%; + display: flex; + gap: 2rem; + .right_l { + width: 50%; + height: 95%; + margin-top: -.5rem; + display: flex; + flex-direction: column; + gap: 1rem; + + div { + font-size: 1.25rem; + font-weight: 400; + color: #000; + line-height: 1.5rem; + img { + margin-bottom: 8px; + } + } + + div:first-child { + display: flex; + flex-direction: column; + justify-content: center; + padding-left: 2rem; + padding-right: 1.5rem; + height: 49%; + border-radius: 35px; + background-color: #fff; + border: 1px solid #ddd; + // background-image: linear-gradient(to bottom, #FC526333, #FC5263); + } + + div:nth-child(2) { + display: flex; + flex-direction: column; + justify-content: center; + padding-left: 2rem; + padding-right: 1.5rem; + background-color: #fff; + border: 1px solid #ddd; + // background-image: linear-gradient( + // to bottom, + // #77B750, + // #77B75080 + // ); + height: 49%; + border-radius: 35px; + } + } + .right_r { + width: 50%; + height: calc(100% - 2rem); + margin-top: 2.5rem; + display: flex; + flex-direction: column; + gap: 1rem; + + div { + font-size: 1.25rem; + font-weight: 400; + color: #000; + line-height: 1.5rem; + img { + margin-bottom: 8px; + } + } + + div:first-child { + display: flex; + flex-direction: column; + justify-content: center; + padding-left: 2rem; + padding-right: 1.5rem; + // background-image: linear-gradient(to bottom, #1890FF80, #1890FF); + background-color: #fff; + border: 1px solid #ddd; + height: 49%; + border-radius: 35px; + } + + div:nth-child(2) { + display: flex; + flex-direction: column; + justify-content: center; + padding-left: 2rem; + padding-right: 1.5rem; + background-color: #fff; + border: 1px solid #ddd; + // background-image: linear-gradient( + // to bottom, + // #EFB300, 
+ // #EFB30033 + // ); + height: 49%; + border-radius: 35px; + } + } + } + } +} + +// Responsiveness +@media (max-width: 800px) { + + .container { + flex-direction: column; + width: 90vw; + min-width: 0; + margin: 4rem auto; + } + .trial { + flex-direction: column; + justify-content: flex-start; + height: auto; + .trial_left { + width: 100%; + .left_content { + span { + font-size: 1.3rem; + } + p { + font-size: 2.5rem; + } + } + .start_arrow { + margin-top: 0; + margin-bottom: 1rem; + } + } + .trial_right { + width: 100%; + max-width: none; + height: 500px; + margin: auto; + } + .right_content { + .right_l, .right_r { + div { + font-size: .9rem!important; + line-height: 1.2rem !important; + color: #333!important; + } + } + } + } +} diff --git a/docs-website/src/pages/solutions/_components/UnifiedTabs/index.js b/docs-website/src/pages/solutions/_components/UnifiedTabs/index.js new file mode 100644 index 00000000000000..0ccf0501670a34 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/UnifiedTabs/index.js @@ -0,0 +1,42 @@ +import React, { useState } from 'react'; +import styles from './styles.module.scss'; +import clsx from 'clsx'; +import useBaseUrl from '@docusaurus/useBaseUrl'; + +const TabbedComponent = ({ unifiedTabsData }) => { + const [activeTab, setActiveTab] = useState(0); + + return ( +
+
+
+ {unifiedTabsData.map((tab, index) => ( +
setActiveTab(index)} + > +
+ {tab.tabName} +
+
+ ))} +
+
+
+
+
{unifiedTabsData[activeTab].title}
+
{unifiedTabsData[activeTab].description}
+
+ +
+
+ {unifiedTabsData[activeTab].imagetabName} +
+
+
+
+ ); +}; + +export default TabbedComponent; diff --git a/docs-website/src/pages/solutions/_components/UnifiedTabs/styles.module.scss b/docs-website/src/pages/solutions/_components/UnifiedTabs/styles.module.scss new file mode 100644 index 00000000000000..a3a1a455af3d35 --- /dev/null +++ b/docs-website/src/pages/solutions/_components/UnifiedTabs/styles.module.scss @@ -0,0 +1,124 @@ +.tabbedComponent { + text-align: left; + padding: 20px; + padding-top: 48px; + padding-bottom: 48px; + display: flex; + flex-direction: column; + align-items: center; +} + +.tabsContainer { + width: 100%; // Ensure full width + max-width: 1200px; // Limit to the same max-width as content + margin-bottom: 1rem; + display: flex; +} + +.tabs { + display: flex; + justify-content: space-between; // Space out tabs across the container + width: 100%; // Ensure tabs take full width +} + +.tabButton { + flex: 1; + padding: 0.8rem 1.5rem; + background: none; + border: none; + cursor: pointer; + font-size: 1.5rem; + font-weight: 500; + color: #777e99; + text-align: center; + + + .tabButtonText { + width: min-content; + margin: auto; + transition: color 0.2s, border-bottom 0.2s; + } + .tabButtonText:hover, .active { + color: #1890ff; + border-bottom: 3px solid #1890ff; + } +} + +.container { + display: flex; + flex-direction: row; + background: white; + max-width: 1200px; // Same max-width as tabs + width: 100%; // Full width to align with tabs + overflow: hidden; +} + +.tabContent { + flex: 1; + padding: 2rem; + margin: auto; +} + +.tabTitle { + font-size: 2.5rem; + font-weight: 500; + color: #373A47; + line-height: 120%; + margin-bottom: 1rem; +} + +.tabTitle.active { + color: #1890ff; + text-decoration: underline; +} + +.tabDescription { + font-size: 1.4rem; + color: #777e99; + line-height: 2rem; + font-weight: 300; +} + +.imageContainer { + display: flex; + justify-content: center; + align-items: center; + margin: 1rem 0; + width: 30rem; +} + +.tabImage { + width: 100%; + height: 100%; + display: 
flex; + align-items: center; + justify-content: center; + box-shadow: 0px 2px 4px 0px #0000001C; + border-radius: 2.2rem; +} + +@media (min-width: 768px) { + .container { + flex-direction: row; + padding: 40px 32px; + } + + .imageContainer { + margin: 1rem; + } +} + +@media (max-width: 767px) { /* mobile overrides stop one pixel below the 768px desktop query so both never apply at once */ + .container { + display: block; + } + + .imageContainer { + margin: auto; + max-width: 100%; + } + + .tabTitle { + font-size: 1.8rem; + } +} \ No newline at end of file diff --git a/docs-website/src/pages/solutions/discovery/_content/discoveryCaseStudyContent.js b/docs-website/src/pages/solutions/discovery/_content/discoveryCaseStudyContent.js new file mode 100644 index 00000000000000..95bbfde0bd12a3 --- /dev/null +++ b/docs-website/src/pages/solutions/discovery/_content/discoveryCaseStudyContent.js @@ -0,0 +1,26 @@ +const caseStudyContent = { + title: "See how industry leaders are using DataHub for Discovery today.", + backgroundColor: "#F3F3F6", + items: [ + { + imgSrc: "/img/solutions/logo-notion.png", + title: "How Notion Used DataHub to harness their sprawling data.", + link: "https://www.notion.so/blog/a-brief-history-of-notions-data-catalog", + alt: "notion" + }, + { + imgSrc: "/img/solutions/logo-myob.png", + title: "How MYOB eliminated breaking changes with DataHub.", + link: "/adoption-stories/#myob", + alt: "MYOB", + }, + { + imgSrc: "/img/solutions/logo-dpg-media.png", + title: "How Acryl Data Helped DPG Media Save 25% Per Mo in Snowflake.", + link: "/adoption-stories/#dpg-media", + alt: "DPG Media", + } + ] + }; + +export default caseStudyContent; \ No newline at end of file diff --git a/docs-website/src/pages/solutions/discovery/_content/discoveryHeroContent.js b/docs-website/src/pages/solutions/discovery/_content/discoveryHeroContent.js new file mode 100644 index 00000000000000..085e94085126b1 --- /dev/null +++ b/docs-website/src/pages/solutions/discovery/_content/discoveryHeroContent.js @@ -0,0 +1,8 @@ +const heroContent = { + topQuote: "Discovery in
DATAHUB", + title: "Make data \n\n democratization a reality", + description: "Enable everyone in your organization to effortlessly discover trustworthy data, tailor experiences for each persona, eliminate breaking changes with lineage, and build confidence in your data with a unified view of business and technical context.", + imgSrc: "/img/solutions/hero-discovery.png", + }; + + export default heroContent; \ No newline at end of file diff --git a/docs-website/src/pages/solutions/discovery/_content/discoveryQuickstartContent.js b/docs-website/src/pages/solutions/discovery/_content/discoveryQuickstartContent.js new file mode 100644 index 00000000000000..e1c6a84512fd5e --- /dev/null +++ b/docs-website/src/pages/solutions/discovery/_content/discoveryQuickstartContent.js @@ -0,0 +1,16 @@ +const quickstartData = [ + { + title: "Where can I find our quarterly revenue reporting?", + image: "/img/solutions/icon-revenue.png", + }, + { + title: "This metric looks wrong. How was it calculated?", + image: "/img/solutions/icon-metric.png", + }, + { + title: "What reports will be impacted during a data migration?", + image: "/img/solutions/icon-migration.png", + } +]; + +export default quickstartData; \ No newline at end of file diff --git a/docs-website/src/pages/solutions/discovery/_content/discoveryTestimonialsContent.js b/docs-website/src/pages/solutions/discovery/_content/discoveryTestimonialsContent.js new file mode 100644 index 00000000000000..e9c72175ffcb77 --- /dev/null +++ b/docs-website/src/pages/solutions/discovery/_content/discoveryTestimonialsContent.js @@ -0,0 +1,10 @@ +const testimonialsData = { + title: "Enter end-to-end Data Discovery.", + feature1: "Data Observability", + feature1Link: "/solutions/observability", + feature2: "Governance", + feature2Link: "/solutions/governance", + imgSrc: "/img/solutions/discovery-icons-group.png", +}; + +export default testimonialsData; \ No newline at end of file diff --git 
a/docs-website/src/pages/solutions/discovery/_content/discoveryTilesContent.js b/docs-website/src/pages/solutions/discovery/_content/discoveryTilesContent.js new file mode 100644 index 00000000000000..c15042ea1c8db2 --- /dev/null +++ b/docs-website/src/pages/solutions/discovery/_content/discoveryTilesContent.js @@ -0,0 +1,67 @@ +const tilesContent = [ + { + title: "Enable self-service
data discovery.", + theme: "dark", + tileItems: [ + { + title: "Your role, your view: discover data that matters to you.", + subtitle: "Tailor search experiences for every user type, from analysts to executives. Foster company-wide engagement and turn every employee into a data champion.", + imgSrc: "/img/solutions/discovery-tile-1.png", + }, + { + title: "Silence the irrelevant. Amplify what counts.", + subtitle: "Cut through data clutter to reveal the assets that truly move the needle for your organization.", + imgSrc: "/img/solutions/discovery-tile-2.png", + }, + { + title: "Maximize relevance. Minimize time-to-discovery.", + subtitle: "Tailor search results to reflect your organization's trust signals. Boost user confidence by prioritizing results that meet your standards of reliability and relevance.", + imgSrc: "/img/solutions/discovery-tile-3.png", + } + ] + }, + { + title: "Unlock the full potential of
automated data lineage.", + theme: "light", + tileItems: [ + { + title: "Demystify complex, cross-platform dependency chains.", + subtitle: "Find out when things go wrong, with alerts that reach your team where they work—whether it’s Slack, email, or anywhere else.", + imgSrc: "/img/solutions/discovery-tile-4.png", + }, + { + title: "Know your data’s Impact.", + subtitle: "Instantly identify downstream consumers of your data. Enable seamless communication and collaboration across your data ecosystem.", + imgSrc: "/img/solutions/discovery-tile-5.png", + }, + { + title: "Illuminate the black box of data transformations.", + subtitle: "Shine a light on how your key metrics are derived. Automated lineage provides transparency, fostering trust in your data-driven decisions.", + imgSrc: "/img/solutions/discovery-tile-6.png", + } + ] + }, + { + title: "Build trust in the relevance
and accuracy of your data.", + theme: "dark", + tileItems: [ + { + title: "Increase your data confidence.", + subtitle: "Tailor search experiences for every user type, from analysts to executives. Foster company-wide engagement and turn every employee into a data champion.", + imgSrc: "/img/solutions/discovery-tile-7.png", + }, + { + title: "Time travel through your data's evolution.", + subtitle: "View the shape and content of your data as it changes over time. Gain confidence in your current data by understanding its past.", + imgSrc: "/img/solutions/discovery-tile-8.png", + }, + { + title: "Your data quality companion, everywhere you work.", + subtitle: "Seamlessly integrate DataHub's insights into your BI tools and communication channels. Keep data trust at the forefront of every decision.", + imgSrc: "/img/solutions/discovery-tile-9.png", + } + ] + } +] + + export default tilesContent; \ No newline at end of file diff --git a/docs-website/src/pages/solutions/discovery/_content/discoveryTrialsContent.js b/docs-website/src/pages/solutions/discovery/_content/discoveryTrialsContent.js new file mode 100644 index 00000000000000..b9b695549210d6 --- /dev/null +++ b/docs-website/src/pages/solutions/discovery/_content/discoveryTrialsContent.js @@ -0,0 +1,24 @@ +const trialsContent = { + title: "Make data democratization a reality today.", + trialsCardItems: [ + { + title: "Unlock self-service data discovery.", + imgSrc: "/img/solutions/trial-icon-lock.svg", + + }, + { + title: "Stop breaking changes before they happen.", + imgSrc: "/img/solutions/trial-icon-alert.svg", + }, + { + title: "Build trust in the relevance and accuracy of your data.", + imgSrc: "/img/solutions/trial-icon-star.svg", + }, + { + title: "Unify Discovery, Observability and Governance in one tool.", + imgSrc: "/img/solutions/trial-icon-link.svg", + } + ] +}; + +export default trialsContent; \ No newline at end of file diff --git a/docs-website/src/pages/solutions/discovery/index.js 
b/docs-website/src/pages/solutions/discovery/index.js new file mode 100644 index 00000000000000..01efca1e0e7c6e --- /dev/null +++ b/docs-website/src/pages/solutions/discovery/index.js @@ -0,0 +1,62 @@ +import React, { useState } from "react"; +import Layout from "@theme/Layout"; +import useDocusaurusContext from "@docusaurus/useDocusaurusContext"; +import Hero from "../_components/Hero"; +import QuickstartContent from "../_components/QuickstartContent"; +import Tiles from "../_components/Tiles"; +import Trials from "../_components/Trials"; +import Testimonials from "../_components/Testimonials"; +import CaseStudy from "../_components/CaseStudy"; +import CloseButton from "@ant-design/icons/CloseCircleFilled"; +import quickstartData from "./_content/discoveryQuickstartContent"; +import heroContent from "./_content/discoveryHeroContent"; +import caseStudyContent from "./_content/discoveryCaseStudyContent"; +import Integrations from "../_components/Integrations"; +import tilesContent from "./_content/discoveryTilesContent"; +import testimonialsData from "./_content/discoveryTestimonialsContent"; +import trialsContent from "./_content/discoveryTrialsContent"; + +function Home() { + const context = useDocusaurusContext(); + const { siteConfig = {} } = context; + + if (siteConfig.customFields.isSaas) { + window.location.replace("/docs"); + } + + const [isTourModalVisible, setIsTourModalVisible] = useState(false); + const onOpenTourModal = () => { + setIsTourModalVisible(true); + }; + const onCloseTourModal = () => { + setIsTourModalVisible(false); + }; + return !siteConfig.customFields.isSaas ? ( + + {isTourModalVisible ? ( +
+
+ +
+