Skip to content

Commit

Permalink
Spark: replicate implicit 'include all' in profiling consistently wit…
Browse files Browse the repository at this point in the history
…h other datasources (#2147)
  • Loading branch information
m1n0 authored Aug 15, 2024
1 parent ad9db79 commit 124e475
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 0 deletions.
27 changes: 27 additions & 0 deletions soda/core/tests/data_source/test_profile_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,3 +475,30 @@ def test_profile_columns_capitalized(data_source_fixture: DataSourceFixture):
assert len(column_profiles) == 2
assert column_profiles[0]["columnName"] == column_casify("ITEMS_SOLD")
assert column_profiles[1]["columnName"] == column_casify("CST_Size")


@pytest.mark.parametrize(
    "soda_cl_str",
    [
        pytest.param(
            """
                profile columns:
                    columns:
                        - exclude %.id
            """,
        )
    ],
)
def test_profile_columns_implicit_include(data_source_fixture: DataSourceFixture, soda_cl_str):
    """Check that an exclude-only ``profile columns`` config still profiles tables.

    The SodaCL snippet contains a single ``exclude`` pattern and no ``include``
    pattern, so tables can only show up in the profiling result if the data
    source treats the missing include list as an implicit "include all".
    """
    # Only the side effect of ensuring the tables is needed here: the SodaCL
    # snippet has no table-name placeholder, so the returned names are unused.
    data_source_fixture.ensure_test_table(customers_profiling)
    data_source_fixture.ensure_test_table(customers_profiling_capitalized)

    scan = data_source_fixture.create_test_scan()
    mock_soda_cloud = scan.enable_mock_soda_cloud()
    # The snippet is passed verbatim; it contains no format placeholders.
    scan.add_sodacl_yaml_str(soda_cl_str)
    scan.execute(allow_error_warning=True)
    scan_results = mock_soda_cloud.pop_scan_result()

    profiled_tables = list(scan_results["profiling"])

    # Two test tables were ensured above, so at least two must be profiled.
    # This is a lower bound rather than an exact count.
    assert len(profiled_tables) >= 2
9 changes: 9 additions & 0 deletions soda/spark/soda/data_sources/spark_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,15 @@ def get_tables_columns_metadata(
) -> dict[str, str] | None:
if (not include_patterns) and (not exclude_patterns):
return []

# Unlike other datasources, Spark does not apply an implicit "include all" when no include patterns are provided,
# because of the nature of its metadata query. If no include patterns are provided, we simulate "include all"
# for the sake of consistency with other datasources.
if not include_patterns:
include_patterns = [{"table_name_pattern": "%", "column_name_pattern": "%"}]
if exclude_patterns is None:
exclude_patterns = []

included_table_names: list[str] = self.get_included_table_names(
query_name, include_patterns, exclude_patterns, table_names_only=table_names_only
)
Expand Down

0 comments on commit 124e475

Please sign in to comment.