Commit

feat(ingest): handle multiline string coercion (datahub-project#9484)

hsheth2 authored Dec 21, 2023
1 parent b80d2f4 commit a18c720
Showing 9 changed files with 86 additions and 30 deletions.
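
In short, the coercion handled here: when a value such as an SSH deploy key or a service-account private key is pasted into a recipe on a single line, the literal two-character sequence `\n` is replaced with a real newline. A minimal sketch of the idea in plain Python (not the DataHub helper itself; the key material below is made up):

```python
# Sketch of the coercion: a single-line value containing literal "\n" escape
# sequences is normalized into a real multiline string.
raw = "-----BEGIN KEY-----\\nbase64data\\n-----END KEY-----"  # hypothetical key material
fixed = raw.replace("\\n", "\n")

assert "\\n" not in fixed            # no literal backslash-n left
assert len(fixed.splitlines()) == 3  # now a real three-line value
```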
4 changes: 2 additions & 2 deletions docs-website/download_historical_versions.py
@@ -37,9 +37,9 @@ def fetch_urls(
         except Exception as e:
             if attempt < max_retries:
                 print(f"Attempt {attempt + 1}/{max_retries}: {e}")
-                time.sleep(retry_delay)
+                time.sleep(retry_delay * 2**attempt)
             else:
-                print(f"Max retries reached. Unable to fetch data.")
+                print("Max retries reached. Unable to fetch data.")
                 raise


6 changes: 2 additions & 4 deletions docs/developers.md
@@ -17,10 +17,8 @@ title: "Local Development"
 On macOS, these can be installed using [Homebrew](https://brew.sh/).

 ```shell
-# Install Java 8 and 11
-brew tap homebrew/cask-versions
-brew install java11
-brew install --cask zulu8
+# Install Java
+brew install openjdk@17

 # Install Python
 brew install [email protected] # you may need to add this to your PATH
12 changes: 2 additions & 10 deletions metadata-ingestion/src/datahub/configuration/git.py
@@ -1,11 +1,11 @@
-import os
 import pathlib
 from typing import Any, Dict, Optional, Union

 from pydantic import Field, FilePath, SecretStr, validator

 from datahub.configuration.common import ConfigModel
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string

 _GITHUB_PREFIX = "https://github.com/"
 _GITLAB_PREFIX = "https://gitlab.com/"
@@ -92,15 +92,7 @@ class GitInfo(GitReference):
         description="The url to call `git clone` on. We infer this for github and gitlab repos, but it is required for other hosts.",
     )

-    @validator("deploy_key_file")
-    def deploy_key_file_should_be_readable(
-        cls, v: Optional[FilePath]
-    ) -> Optional[FilePath]:
-        if v is not None:
-            # pydantic does existence checks, we just need to check if we can read it
-            if not os.access(v, os.R_OK):
-                raise ValueError(f"Unable to read deploy key file {v}")
-        return v
+    _fix_deploy_key_newlines = pydantic_multiline_string("deploy_key")

     @validator("deploy_key", pre=True, always=True)
     def deploy_key_filled_from_deploy_key_file(
31 changes: 31 additions & 0 deletions metadata-ingestion/src/datahub/configuration/validate_multiline_string.py
@@ -0,0 +1,31 @@
+from typing import Optional, Type, Union
+
+import pydantic
+
+
+def pydantic_multiline_string(field: str) -> classmethod:
+    """If the field is present and contains an escaped newline, replace it with a real newline.
+
+    This makes the assumption that the field value is never supposed to have a
+    r"\n" in it, and instead should only have newline characters. This is generally
+    a safe assumption for SSH keys and similar.
+
+    The purpose of this helper is to make us more forgiving of small formatting issues
+    in recipes, without sacrificing correctness across the board.
+    """
+
+    def _validate_field(
+        cls: Type, v: Union[None, str, pydantic.SecretStr]
+    ) -> Optional[str]:
+        if v is not None:
+            if isinstance(v, pydantic.SecretStr):
+                v = v.get_secret_value()
+            v = v.replace(r"\n", "\n")
+
+        return v
+
+    # Hack: pydantic maintains a unique list of validators by referring to their __name__.
+    # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
+    # This ensures that validators generated for multiple fields do not overwrite each other.
+    _validate_field.__name__ = f"{_validate_field.__name__}_{field}"
+    return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field)
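
For reference, a rough usage sketch of the new helper, mirroring how the source configs in this commit attach it; the `ExampleConfig` model and its field names are hypothetical. Because the inner validator's `__name__` is suffixed with the field name, the helper can be attached to several fields of the same model without the validators overwriting each other:

```python
import pydantic

from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_multiline_string import pydantic_multiline_string


class ExampleConfig(ConfigModel):  # hypothetical model, not part of this commit
    deploy_key: pydantic.SecretStr
    notes: str = ""

    # One validator per field; the __name__ suffix keeps them from colliding.
    _fix_deploy_key = pydantic_multiline_string("deploy_key")
    _fix_notes = pydantic_multiline_string("notes")


cfg = ExampleConfig.parse_obj({"deploy_key": "line1\\nline2", "notes": "a\\nb"})
assert cfg.deploy_key.get_secret_value() == "line1\nline2"
assert cfg.notes == "a\nb"
```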
@@ -175,7 +175,7 @@ def make_lineage_edges_from_parsing_result(
         table_name = str(
             BigQueryTableRef.from_bigquery_table(
                 BigqueryTableIdentifier.from_string_name(
-                    DatasetUrn.create_from_string(table_urn).get_dataset_name()
+                    DatasetUrn.from_string(table_urn).name
                 )
             )
         )
@@ -2060,10 +2060,9 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
                 )
                 logger.debug("Failed to process explore", exc_info=e)

-            processed_view_files = processed_view_map.get(model.connection)
-            if processed_view_files is None:
-                processed_view_map[model.connection] = set()
-                processed_view_files = processed_view_map[model.connection]
+            processed_view_files = processed_view_map.setdefault(
+                model.connection, set()
+            )

             project_name = self.get_project_name(model_name)
             logger.debug(f"Model: {model_name}; Includes: {model.resolved_includes}")
@@ -11,6 +11,7 @@
 from datahub.configuration.common import AllowDenyPattern, ConfigurationError
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.configuration.validate_field_removal import pydantic_removed_field
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.ingestion.source_config.bigquery import BigQueryBaseConfig

@@ -44,6 +45,8 @@ class BigQueryCredential(ConfigModel):
         description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email",
     )

+    _fix_private_key_newlines = pydantic_multiline_string("private_key")
+
     @pydantic.root_validator(skip_on_failure=True)
     def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         if values.get("client_x509_cert_url") is None:
@@ -199,6 +199,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[N
     for handler in handlers:
         root_logger.removeHandler(handler)
         for lib in DATAHUB_PACKAGES:
+            lib_logger = logging.getLogger(lib)
             lib_logger.removeHandler(handler)
             lib_logger.propagate = True

50 changes: 41 additions & 9 deletions metadata-ingestion/tests/unit/config/test_pydantic_validators.py
@@ -1,12 +1,14 @@
 from typing import Optional

+import pydantic
 import pytest
 from pydantic import ValidationError

-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, ConfigurationWarning
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
     get_global_warnings,
@@ -22,8 +24,9 @@ class TestModel(ConfigModel):
     v = TestModel.parse_obj({"b": "original"})
     assert v.b == "original"

-    v = TestModel.parse_obj({"a": "renamed"})
-    assert v.b == "renamed"
+    with pytest.warns(ConfigurationWarning, match="a is deprecated"):
+        v = TestModel.parse_obj({"a": "renamed"})
+        assert v.b == "renamed"

     with pytest.raises(ValidationError):
         TestModel.parse_obj({"a": "foo", "b": "bar"})
@@ -44,9 +47,10 @@ class TestModel(ConfigModel):
     assert v.b == "original"
     assert v.b1 == "original"

-    v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"})
-    assert v.b == "renamed"
-    assert v.b1 == "renamed"
+    with pytest.warns(ConfigurationWarning, match=r"a.* is deprecated"):
+        v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"})
+        assert v.b == "renamed"
+        assert v.b1 == "renamed"

     with pytest.raises(ValidationError):
         TestModel.parse_obj({"a": "foo", "b": "bar", "b1": "ok"})
@@ -74,8 +78,9 @@ class TestModel(ConfigModel):
     v = TestModel.parse_obj({"b": "original"})
     assert v.b == "original"

-    v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"})
-    assert v.b == "original"
+    with pytest.warns(ConfigurationWarning, match=r"r\d was removed"):
+        v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"})
+        assert v.b == "original"


 def test_field_deprecated():
@@ -92,11 +97,38 @@ class TestModel(ConfigModel):
     v = TestModel.parse_obj({"b": "original"})
     assert v.b == "original"

-    v = TestModel.parse_obj({"b": "original", "d1": "deprecated", "d2": "deprecated"})
+    with pytest.warns(ConfigurationWarning, match=r"d\d.+ deprecated"):
+        v = TestModel.parse_obj(
+            {"b": "original", "d1": "deprecated", "d2": "deprecated"}
+        )
     assert v.b == "original"
     assert v.d1 == "deprecated"
     assert v.d2 == "deprecated"
     assert any(["d1 is deprecated" in warning for warning in get_global_warnings()])
     assert any(["d2 is deprecated" in warning for warning in get_global_warnings()])

     clear_global_warnings()
+
+
+def test_multiline_string_fixer():
+    class TestModel(ConfigModel):
+        s: str
+        m: Optional[pydantic.SecretStr] = None
+
+        _validate_s = pydantic_multiline_string("s")
+        _validate_m = pydantic_multiline_string("m")
+
+    v = TestModel.parse_obj({"s": "foo\nbar"})
+    assert v.s == "foo\nbar"
+
+    v = TestModel.parse_obj({"s": "foo\\nbar"})
+    assert v.s == "foo\nbar"
+
+    v = TestModel.parse_obj({"s": "normal", "m": "foo\\nbar"})
+    assert v.s == "normal"
+    assert v.m
+    assert v.m.get_secret_value() == "foo\nbar"
+
+    v = TestModel.parse_obj({"s": "normal", "m": pydantic.SecretStr("foo\\nbar")})
+    assert v.m
+    assert v.m.get_secret_value() == "foo\nbar"