diff --git a/docs/advanced/patch.md b/docs/advanced/patch.md index 601d055659313..24e8c68a9168d 100644 --- a/docs/advanced/patch.md +++ b/docs/advanced/patch.md @@ -1,69 +1,120 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# But First, Semantics: Upsert versus Patch +# Emitting Patch Updates to DataHub ## Why Would You Use Patch -By default, most of the SDK tutorials and API-s involve applying full upserts at the aspect level. This means that typically, when you want to change one field within an aspect without modifying others, you need to do a read-modify-write to not overwrite existing fields. -To support these scenarios, DataHub supports PATCH based operations so that targeted changes to single fields or values within arrays of fields are possible without impacting other existing metadata. +By default, most of the SDK tutorials and APIs involve applying full upserts at the aspect level, i.e. replacing the aspect entirely. +This means that when you want to change even a single field within an aspect without modifying others, you need to do a read-modify-write to avoid overwriting existing fields. +To support these scenarios, DataHub supports `PATCH` operations that make targeted changes to individual fields, or to values within arrays of fields, without impacting other existing metadata. :::note -Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of Aspects that are supported are maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). In the near future, we do have plans to automatically support PATCH semantics for aspects by default. +Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of supported aspects is maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). ::: -## How To Use Patch +## How To Use Patches -Examples for using Patch are sprinkled throughout the API guides. Here's how to find the appropriate classes for the language for your choice. - - + -The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. +The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module, under the `datahub.specific` namespace.
+Patch builder helper classes exist for: -Here are a few illustrative examples using the Java Patch builders: +- [Datasets](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataset.py) +- [Charts](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/chart.py) +- [Dashboards](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dashboard.py) +- [Data Jobs (Tasks)](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/datajob.py) +- [Data Products](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataproduct.py) +We gladly accept contributions for Containers, Data Flows (Pipelines), Tags, Glossary Terms, Domains, and ML Models. -### Add Custom Properties +### Add & Remove Owners for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAdd.java show_path_as_comment }} +To add & remove specific owners for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_owner_patch.py show_path_as_comment }} ``` -### Add and Remove Custom Properties +### Add & Remove Tags for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +To add & remove specific tags for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_tag_patch.py show_path_as_comment }} ``` -### Add Data Job Lineage +And for a specific schema field within the Dataset: -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_tag_patch.py show_path_as_comment }} ``` - - +### Add & Remove Glossary Terms for Dataset + +To add & remove specific glossary terms for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py show_path_as_comment }} +``` + +And for a specific schema field within the Dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py show_path_as_comment }} +``` + +### Add & Remove Structured Properties for Dataset -The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module and located in the `datahub.specific` module.
+To add & remove structured properties for a dataset: -Here are a few illustrative examples using the Python Patch builders: +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py show_path_as_comment }} +``` -### Add Properties to Dataset +### Add & Remove Upstream Lineage for Dataset + +To add & remove a lineage edge connecting a dataset to its upstream or input, at both the dataset and schema field level: ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py show_path_as_comment }} +``` + +### Add & Remove Read-Only Custom Properties for Dataset + +To add & remove specific custom properties for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} +``` + + + + +The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. + +### Add & Remove Read-Only Custom Properties + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +``` + +### Add Data Job Lineage + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} ``` -## How Patch works +## Advanced: How Patch works To understand how patching works, it's important to understand a bit about our [models](../what/aspect.md). Entities are comprised of Aspects which can be reasoned about as JSON representations of the object models. To be able to patch these we utilize [JsonPatch](https://jsonpatch.com/). The components of a JSON Patch are the path, operation, and value. @@ -73,9 +124,6 @@ which can be reasoned about as JSON representations of the object models. To be The JSON path refers to a value within the schema. This can be a single field or can be an entire object reference depending on what the path is. For our patches we are primarily targeting single fields or even single array elements within a field. To be able to target array elements by id, we go through a translation process of the schema to transform arrays into maps. This allows a path to reference a particular array element by key rather than by index, for example a specific tag urn being added to a dataset. -This is important to note that for some fields in our schema that are arrays which do not necessarily restrict uniqueness, this puts a uniqueness constraint on the key. -The key for objects stored in arrays is determined manually by examining the schema and a long term goal is to make these keys annotation driven to reduce the amount of code needed to support -additional aspects to be patched. There is a generic patch endpoint, but it requires any array field keys to be specified at request time, putting a lot of burden on the API user.
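To make the path/operation/value model concrete, here is a minimal sketch that builds a tag patch with `DatasetPatchBuilder` and prints the JSON Patch document each resulting MCP carries. It assumes the serialized patch is stored as JSON bytes on the MCP's `aspect`; the `needs_review` tag urn is made up for illustration.

```python
import json

from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.metadata.schema_classes import TagAssociationClass
from datahub.specific.dataset import DatasetPatchBuilder

dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD")

# Build a patch that adds a single (hypothetical) tag to the dataset
patch_builder = DatasetPatchBuilder(dataset_urn)
patch_builder.add_tag(TagAssociationClass(make_tag_urn("needs_review")))

for patch_mcp in patch_builder.build():
    # Assumption: each patch MCP carries a GenericAspect whose value holds the
    # serialized JSON Patch, e.g.
    #   [{"op": "add", "path": "/tags/urn:li:tag:needs_review", "value": {...}}]
    if patch_mcp.aspect is not None:
        print(patch_mcp.aspectName, json.loads(patch_mcp.aspect.value))
```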
#### Examples @@ -87,8 +135,7 @@ Breakdown: * `/upstreams` -> References the upstreams field of the UpstreamLineage aspect, this is an array of Upstream objects where the key is the Urn * `/urn:...` -> The dataset to be targeted by the operation - -A patch path for targeting a fine grained lineage upstream: +A patch path for targeting a fine-grained lineage upstream: `/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),foo)/urn:li:query:queryId/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created_upstream,PROD),bar)` @@ -118,7 +165,6 @@ using adds, but generally the most useful use case for patch is to add elements Remove operations require the path specified to be present, or an error will be thrown, otherwise they operate as one would expect. The specified path will be removed from the aspect. - ### Value Value is the actual information that will be stored at a path. If the path references an object then this will include the JSON key value pairs for that object. diff --git a/docs/api/tutorials/custom-properties.md b/docs/api/tutorials/custom-properties.md index fe0d7e62dcde8..86b1b2c0c54da 100644 --- a/docs/api/tutorials/custom-properties.md +++ b/docs/api/tutorials/custom-properties.md @@ -74,7 +74,7 @@ The following code adds custom properties `cluster_name` and `retention_time` to ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py show_path_as_comment }} ``` @@ -128,7 +128,7 @@ The following code shows you how can add and remove custom properties in the sam ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_remove_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} ``` diff --git a/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py new file mode 100644 index 0000000000000..7231461fea322 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.add_custom_property("retention_time", "2 years") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py new file mode 100644 index 0000000000000..d0b9a866fde61 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from 
datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_term(GlossaryTermAssociationClass(make_term_urn("term-to-add-id"))) +patch_builder.remove_term(make_term_urn("term-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_owner_patch.py b/metadata-ingestion/examples/library/dataset_add_owner_patch.py new file mode 100644 index 0000000000000..8d3130c09c4bb --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_owner_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_group_urn, make_user_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Owners +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_owner( + OwnerClass(make_user_urn("user-to-add-id"), OwnershipTypeClass.TECHNICAL_OWNER) +) +patch_builder.remove_owner(make_group_urn("group-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_properties.py b/metadata-ingestion/examples/library/dataset_add_properties.py deleted file mode 100644 index b72aac5b82800..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_properties.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = "http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") -
.add_custom_property("retention_time", "2 years") - .build() - ): - emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py new file mode 100644 index 0000000000000..c1db9c91d13ec --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add + Remove Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.remove_custom_property("retention_time") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_remove_properties.py b/metadata-ingestion/examples/library/dataset_add_remove_properties.py deleted file mode 100644 index 7109c0264f971..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_remove_properties.py +++ /dev/null @@ -1,46 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = "http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") - .remove_custom_property("retention_time") - .build() - ): - emitter.emit(patch_mcp) - - -log.info( - f"Added cluster_name property, removed retention_time property from dataset {dataset_urn}" -) diff --git a/metadata-ingestion/examples/library/dataset_add_structured_properties.py b/metadata-ingestion/examples/library/dataset_add_structured_properties.py deleted file mode 100644 index fc2c379340592..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_structured_properties.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging - -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import 
DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - -# Create rest emitter -rest_emitter = DataHubRestEmitter(gms_server="http://localhost:8080") - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - - -for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_structured_property("io.acryl.dataManagement.replicationSLA", 12) - .build() -): - rest_emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py new file mode 100644 index 0000000000000..ef72ed58a4b82 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py @@ -0,0 +1,23 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add and Remove Structured Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_structured_property( + "urn:li:structuredProperty:retentionTimeInDays", 12 +) +patch_builder.remove_structured_property( + "urn:li:structuredProperty:customClassification" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_add_tag_patch.py new file mode 100644 index 0000000000000..0bc644d6865f6 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_tag_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_tag(TagAssociationClass(make_tag_urn("tag-to-add-id"))) +patch_builder.remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py new file mode 100644 index 0000000000000..0b4e5e39bf627 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py @@ -0,0 +1,62 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import ( + DatasetLineageTypeClass, + FineGrainedLineageClass, + FineGrainedLineageUpstreamTypeClass, + 
UpstreamClass, +) +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) +upstream_to_remove_urn = make_dataset_urn( + platform="s3", name="fct_users_old", env="PROD" +) +upstream_to_add_urn = make_dataset_urn(platform="s3", name="fct_users_new", env="PROD") + +# Create Dataset Patch to Add & Remove Upstream Lineage Edges +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.remove_upstream_lineage(upstream_to_remove_urn) +patch_builder.add_upstream_lineage( + UpstreamClass(upstream_to_add_urn, DatasetLineageTypeClass.TRANSFORMED) +) + +# ...And also include schema field lineage +upstream_field_to_add_urn = make_schema_field_urn(upstream_to_add_urn, "profile_id") +downstream_field_to_add_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.add_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_add_urn], + [downstream_field_to_add_urn], + ) +) + +upstream_field_to_remove_urn = make_schema_field_urn( + upstream_to_remove_urn, "profile_id" +) +downstream_field_to_remove_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.remove_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_remove_urn], + [downstream_field_to_remove_urn], + ) +) + +patch_mcps = patch_builder.build() + + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py new file mode 100644 index 0000000000000..3f8da2c143c92 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py @@ -0,0 +1,26 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_term( + GlossaryTermAssociationClass(make_term_urn("term-to-add-id")) +) +patch_builder.for_field("profile_id").remove_term( + "urn:li:glossaryTerm:term-to-remove-id" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py new file mode 100644 index 0000000000000..3075cac5320ae --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.ingestion.graph.client import 
DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Tag for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_tag( + TagAssociationClass(make_tag_urn("tag-to-add-id")) +) +patch_builder.for_field("profile_id").remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp)
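One closing note on ergonomics: the `DatasetPatchBuilder` methods return the builder itself (the removed `dataset_add_properties.py` example relied on this), so the new step-by-step examples can equivalently be written in a fluent style. A minimal sketch combining that chaining with the new `DataHubGraph` client:

```python
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig
from datahub.specific.dataset import DatasetPatchBuilder

# Create DataHub Client
datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080"))

# Create Dataset URN
dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD")

# Chain the same add/remove calls used in dataset_add_remove_custom_properties_patch.py
for patch_mcp in (
    DatasetPatchBuilder(dataset_urn)
    .add_custom_property("cluster_name", "datahubproject.acryl.io")
    .remove_custom_property("retention_time")
    .build()
):
    datahub_client.emit(patch_mcp)
```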