diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index 5421a932dacce..c0a8d31bca4c0 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -907,6 +907,24 @@ Then define your class to return a list of custom properties, for example: add_properties_resolver_class: "." ``` +## Replace ExternalUrl +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|---------|---------------|---------------------------------------------| +| `input_pattern` | ✅ | string | | String or pattern to replace | +| `replacement` | ✅ | string | | Replacement string | + + +Matches a full or partial string (regular expression) in the `externalUrl` of the dataset properties and replaces every match with the replacement string. + +```yaml +transformers: + - type: "replace_external_url" + config: + input_pattern: '\b\w*hub\b' + replacement: "sub" +``` + ## Simple Add Dataset domains ### Config Details | Field | Required | Type | Default | Description | diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 51a086fff77e4..674450999ad73 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -687,6 +687,7 @@ "add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:AddDatasetDataProduct", "simple_add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:SimpleAddDatasetDataProduct", "pattern_add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:PatternAddDatasetDataProduct", + "replace_external_url = datahub.ingestion.transformer.replace_external_url:ReplaceExternalUrl" ], "datahub.ingestion.sink.plugins": [ "file = datahub.ingestion.sink.file:FileSink", diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py 
import copy
import re
from typing import Any, Dict, Optional, cast

from datahub.configuration.common import ConfigModel
from datahub.emitter.mce_builder import Aspect
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.transformer.dataset_transformer import (
    DatasetPropertiesTransformer,
)
from datahub.metadata.schema_classes import DatasetPropertiesClass


class ReplaceExternalUrlConfig(ConfigModel):
    """Configuration for the ``replace_external_url`` transformer."""

    # Regular expression searched for anywhere in the externalUrl; a string
    # without regex metacharacters simply matches itself.
    input_pattern: str
    # Text substituted for every match of ``input_pattern``.
    replacement: str


class ReplaceExternalUrl(DatasetPropertiesTransformer):
    """Transformer that rewrites the ``externalUrl`` of dataset properties.

    Every match of the configured ``input_pattern`` regular expression in the
    URL is replaced with ``replacement``. A missing aspect, or one whose
    ``externalUrl`` is unset or empty, is returned unchanged.
    """

    ctx: PipelineContext
    config: ReplaceExternalUrlConfig

    def __init__(
        self,
        config: ReplaceExternalUrlConfig,
        ctx: PipelineContext,
        **resolver_args: Dict[str, Any],
    ):
        """Store the configuration and pre-compile the search pattern.

        Raises:
            re.error: if ``config.input_pattern`` is not a valid regular
                expression.
        """
        super().__init__()
        self.ctx = ctx
        self.config = config
        self.resolver_args = resolver_args
        # Compile once here instead of on every aspect: it avoids repeated
        # work and makes a malformed pattern fail when the transformer is
        # built rather than part-way through an ingestion run.
        self._pattern = re.compile(config.input_pattern)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "ReplaceExternalUrl":
        """Build the transformer from a raw (recipe) configuration dict."""
        config = ReplaceExternalUrlConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def transform_aspect(
        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
    ) -> Optional[Aspect]:
        """Return the aspect with its ``externalUrl`` rewritten.

        The incoming aspect is never mutated: when a URL is present, the
        substitution is applied to a deep copy, which is returned. When the
        aspect is ``None`` or has no (or an empty) ``externalUrl``, the input
        is returned as-is.
        """
        in_dataset_properties_aspect: DatasetPropertiesClass = cast(
            DatasetPropertiesClass, aspect
        )

        # getattr with a default also covers aspect=None.
        external_url = getattr(in_dataset_properties_aspect, "externalUrl", None)
        if not external_url:
            return cast(Aspect, in_dataset_properties_aspect)

        out_dataset_properties_aspect: DatasetPropertiesClass = copy.deepcopy(
            in_dataset_properties_aspect
        )
        out_dataset_properties_aspect.externalUrl = self._pattern.sub(
            self.config.replacement, external_url
        )
        return cast(Aspect, out_dataset_properties_aspect)
def _test_replace_external_url(mock_datahub_graph, config, expected_url):
    """Run ReplaceExternalUrl over a fixed sample URL and check the result.

    Builds a pipeline context backed by the mocked graph, feeds one
    DatasetPropertiesClass aspect through the transformer with ``config``,
    and asserts that the first output record carries ``expected_url``.
    """
    context: PipelineContext = PipelineContext(run_id="test_replace_external_url")
    context.graph = mock_datahub_graph(DatahubClientConfig)

    results = run_dataset_transformer_pipeline(
        transformer_type=ReplaceExternalUrl,
        aspect=models.DatasetPropertiesClass(
            externalUrl="https://github.com/datahub/looker-demo/blob/master/foo.view.lkml",
            customProperties=EXISTING_PROPERTIES.copy(),
        ),
        config=config,
        pipeline_context=context,
    )

    # NOTE(review): two outputs are expected — presumably the transformed
    # record plus an end-of-stream marker; confirm against
    # run_dataset_transformer_pipeline.
    assert len(results) == 2
    first_record = results[0].record
    assert first_record
    assert first_record.aspect
    assert first_record.aspect.externalUrl == expected_url


def test_replace_external_url_word_replace(
    mock_datahub_graph,
):
    """A literal word in the URL is swapped for the replacement."""
    _test_replace_external_url(
        mock_datahub_graph,
        {"input_pattern": "datahub", "replacement": "starhub"},
        "https://github.com/starhub/looker-demo/blob/master/foo.view.lkml",
    )


def test_replace_external_regex_replace_1(
    mock_datahub_graph,
):
    """A greedy regex spanning several path segments is replaced as one match."""
    _test_replace_external_url(
        mock_datahub_graph,
        {"input_pattern": r"datahub/.*/", "replacement": "starhub/test/"},
        "https://github.com/starhub/test/foo.view.lkml",
    )


def test_replace_external_regex_replace_2(
    mock_datahub_graph,
):
    """Every word ending in 'hub' is replaced, not only the first one."""
    _test_replace_external_url(
        mock_datahub_graph,
        {"input_pattern": r"\b\w*hub\b", "replacement": "test"},
        "https://test.com/test/looker-demo/blob/master/foo.view.lkml",
    )