From a18c72083d763b08282b67146881d4f918b257de Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Dec 2023 13:50:39 -0500 Subject: [PATCH] feat(ingest): handle multiline string coercion (#9484) --- docs-website/download_historical_versions.py | 4 +- docs/developers.md | 6 +-- .../src/datahub/configuration/git.py | 12 +---- .../validate_multiline_string.py | 31 ++++++++++++ .../ingestion/source/bigquery_v2/lineage.py | 2 +- .../ingestion/source/looker/lookml_source.py | 7 ++- .../source_config/usage/bigquery_usage.py | 3 ++ .../src/datahub/utilities/logging_manager.py | 1 + .../unit/config/test_pydantic_validators.py | 50 +++++++++++++++---- 9 files changed, 86 insertions(+), 30 deletions(-) create mode 100644 metadata-ingestion/src/datahub/configuration/validate_multiline_string.py diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py index 53ee9cf1e63ef..7493210ffa2a5 100644 --- a/docs-website/download_historical_versions.py +++ b/docs-website/download_historical_versions.py @@ -37,9 +37,9 @@ def fetch_urls( except Exception as e: if attempt < max_retries: print(f"Attempt {attempt + 1}/{max_retries}: {e}") - time.sleep(retry_delay) + time.sleep(retry_delay * 2**attempt) else: - print(f"Max retries reached. Unable to fetch data.") + print("Max retries reached. Unable to fetch data.") raise diff --git a/docs/developers.md b/docs/developers.md index 60d31f5e4523f..fe007a56ddc68 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -17,10 +17,8 @@ title: "Local Development" On macOS, these can be installed using [Homebrew](https://brew.sh/). 
```shell -# Install Java 8 and 11 -brew tap homebrew/cask-versions -brew install java11 -brew install --cask zulu8 +# Install Java +brew install openjdk@17 # Install Python brew install python@3.10 # you may need to add this to your PATH diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index a5f88744661a4..3c76c8da0d571 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -1,4 +1,3 @@ -import os import pathlib from typing import Any, Dict, Optional, Union @@ -6,6 +5,7 @@ from datahub.configuration.common import ConfigModel from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.configuration.validate_multiline_string import pydantic_multiline_string _GITHUB_PREFIX = "https://github.com/" _GITLAB_PREFIX = "https://gitlab.com/" @@ -92,15 +92,7 @@ class GitInfo(GitReference): description="The url to call `git clone` on. 
We infer this for github and gitlab repos, but it is required for other hosts.", ) - @validator("deploy_key_file") - def deploy_key_file_should_be_readable( - cls, v: Optional[FilePath] - ) -> Optional[FilePath]: - if v is not None: - # pydantic does existence checks, we just need to check if we can read it - if not os.access(v, os.R_OK): - raise ValueError(f"Unable to read deploy key file {v}") - return v + _fix_deploy_key_newlines = pydantic_multiline_string("deploy_key") @validator("deploy_key", pre=True, always=True) def deploy_key_filled_from_deploy_key_file( diff --git a/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py new file mode 100644 index 0000000000000..0baaf4f0264b9 --- /dev/null +++ b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py @@ -0,0 +1,31 @@ +from typing import Optional, Type, Union + +import pydantic + + +def pydantic_multiline_string(field: str) -> classmethod: + """If the field is present and contains an escaped newline, replace it with a real newline. + + This makes the assumption that the field value is never supposed to have a + r"\n" in it, and instead should only have newline characters. This is generally + a safe assumption for SSH keys and similar. + + The purpose of this helper is to make us more forgiving of small formatting issues + in recipes, without sacrificing correctness across the board. + """ + + def _validate_field( + cls: Type, v: Union[None, str, pydantic.SecretStr] + ) -> Optional[str]: + if v is not None: + if isinstance(v, pydantic.SecretStr): + v = v.get_secret_value() + v = v.replace(r"\n", "\n") + + return v + + # Hack: Pydantic maintains a unique list of validators by referring to its __name__. + # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264 + # This hack ensures that validators for multiple fields do not overwrite each other. 
+ _validate_field.__name__ = f"{_validate_field.__name__}_{field}" + return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index eddd08c92b808..b44b06feb95af 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -175,7 +175,7 @@ def make_lineage_edges_from_parsing_result( table_name = str( BigQueryTableRef.from_bigquery_table( BigqueryTableIdentifier.from_string_name( - DatasetUrn.create_from_string(table_urn).get_dataset_name() + DatasetUrn.from_string(table_urn).name ) ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index b76bef49a7e6f..33079f3fd9ac1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -2060,10 +2060,9 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 ) logger.debug("Failed to process explore", exc_info=e) - processed_view_files = processed_view_map.get(model.connection) - if processed_view_files is None: - processed_view_map[model.connection] = set() - processed_view_files = processed_view_map[model.connection] + processed_view_files = processed_view_map.setdefault( + model.connection, set() + ) project_name = self.get_project_name(model_name) logger.debug(f"Model: {model_name}; Includes: {model.resolved_includes}") diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py index 5eb9c83236e4f..13abe73cc4e09 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py @@ -11,6 +11,7 @@ from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.source_common import EnvConfigMixin from datahub.configuration.validate_field_removal import pydantic_removed_field +from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.ingestion.source.usage.usage_common import BaseUsageConfig from datahub.ingestion.source_config.bigquery import BigQueryBaseConfig @@ -44,6 +45,8 @@ class BigQueryCredential(ConfigModel): description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", ) + _fix_private_key_newlines = pydantic_multiline_string("private_key") + @pydantic.root_validator(skip_on_failure=True) def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: if values.get("client_x509_cert_url") is None: diff --git a/metadata-ingestion/src/datahub/utilities/logging_manager.py b/metadata-ingestion/src/datahub/utilities/logging_manager.py index a8eacb0a9938d..62aa1ca7ab791 100644 --- a/metadata-ingestion/src/datahub/utilities/logging_manager.py +++ b/metadata-ingestion/src/datahub/utilities/logging_manager.py @@ -199,6 +199,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[N for handler in handlers: root_logger.removeHandler(handler) for lib in DATAHUB_PACKAGES: + lib_logger = logging.getLogger(lib) lib_logger.removeHandler(handler) lib_logger.propagate = True diff --git a/metadata-ingestion/tests/unit/config/test_pydantic_validators.py b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py index 399245736805c..f687a2776f6e2 100644 --- a/metadata-ingestion/tests/unit/config/test_pydantic_validators.py +++ b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py @@ -1,12 +1,14 @@ from typing 
import Optional +import pydantic import pytest from pydantic import ValidationError -from datahub.configuration.common import ConfigModel +from datahub.configuration.common import ConfigModel, ConfigurationWarning from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.utilities.global_warning_util import ( clear_global_warnings, get_global_warnings, @@ -22,8 +24,9 @@ class TestModel(ConfigModel): v = TestModel.parse_obj({"b": "original"}) assert v.b == "original" - v = TestModel.parse_obj({"a": "renamed"}) - assert v.b == "renamed" + with pytest.warns(ConfigurationWarning, match="a is deprecated"): + v = TestModel.parse_obj({"a": "renamed"}) + assert v.b == "renamed" with pytest.raises(ValidationError): TestModel.parse_obj({"a": "foo", "b": "bar"}) @@ -44,9 +47,10 @@ class TestModel(ConfigModel): assert v.b == "original" assert v.b1 == "original" - v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"}) - assert v.b == "renamed" - assert v.b1 == "renamed" + with pytest.warns(ConfigurationWarning, match=r"a.* is deprecated"): + v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"}) + assert v.b == "renamed" + assert v.b1 == "renamed" with pytest.raises(ValidationError): TestModel.parse_obj({"a": "foo", "b": "bar", "b1": "ok"}) @@ -74,8 +78,9 @@ class TestModel(ConfigModel): v = TestModel.parse_obj({"b": "original"}) assert v.b == "original" - v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"}) - assert v.b == "original" + with pytest.warns(ConfigurationWarning, match=r"r\d was removed"): + v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"}) + assert v.b == "original" def test_field_deprecated(): @@ -92,7 +97,10 @@ class 
TestModel(ConfigModel): v = TestModel.parse_obj({"b": "original"}) assert v.b == "original" - v = TestModel.parse_obj({"b": "original", "d1": "deprecated", "d2": "deprecated"}) + with pytest.warns(ConfigurationWarning, match=r"d\d.+ deprecated"): + v = TestModel.parse_obj( + {"b": "original", "d1": "deprecated", "d2": "deprecated"} + ) assert v.b == "original" assert v.d1 == "deprecated" assert v.d2 == "deprecated" @@ -100,3 +108,27 @@ class TestModel(ConfigModel): assert any(["d2 is deprecated" in warning for warning in get_global_warnings()]) clear_global_warnings() + + +def test_multiline_string_fixer(): + class TestModel(ConfigModel): + s: str + m: Optional[pydantic.SecretStr] = None + + _validate_s = pydantic_multiline_string("s") + _validate_m = pydantic_multiline_string("m") + + v = TestModel.parse_obj({"s": "foo\nbar"}) + assert v.s == "foo\nbar" + + v = TestModel.parse_obj({"s": "foo\\nbar"}) + assert v.s == "foo\nbar" + + v = TestModel.parse_obj({"s": "normal", "m": "foo\\nbar"}) + assert v.s == "normal" + assert v.m + assert v.m.get_secret_value() == "foo\nbar" + + v = TestModel.parse_obj({"s": "normal", "m": pydantic.SecretStr("foo\\nbar")}) + assert v.m + assert v.m.get_secret_value() == "foo\nbar"