forked from datahub-project/datahub
-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(tableau): ability to force extraction of table/column level lina…
…ge from SQL queries (datahub-project#9838)
- Loading branch information
Showing
9 changed files
with
508 additions
and
56 deletions.
There are no files selected for viewing
278 changes: 226 additions & 52 deletions
278
metadata-ingestion/src/datahub/ingestion/source/tableau.py
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
23 changes: 23 additions & 0 deletions
23
metadata-ingestion/src/datahub/sql_parsing/sql_parsing_result_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from typing import Dict, Set | ||
|
||
from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, Urn | ||
|
||
|
||
def transform_parsing_result_to_in_tables_schemas( | ||
parsing_result: SqlParsingResult, | ||
) -> Dict[Urn, Set[str]]: | ||
table_urn_to_schema_map: Dict[str, Set[str]] = ( | ||
{it: set() for it in parsing_result.in_tables} | ||
if parsing_result.in_tables | ||
else {} | ||
) | ||
|
||
if parsing_result.column_lineage: | ||
for cli in parsing_result.column_lineage: | ||
for upstream in cli.upstreams: | ||
if upstream.table in table_urn_to_schema_map: | ||
table_urn_to_schema_map[upstream.table].add(upstream.column) | ||
else: | ||
table_urn_to_schema_map[upstream.table] = {upstream.column} | ||
|
||
return table_urn_to_schema_map |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
67 changes: 67 additions & 0 deletions
67
metadata-ingestion/tests/unit/sql_parsing/test_sql_parsing_result_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from datahub.sql_parsing.sql_parsing_result_utils import ( | ||
transform_parsing_result_to_in_tables_schemas, | ||
) | ||
from datahub.sql_parsing.sqlglot_lineage import ( | ||
ColumnLineageInfo, | ||
ColumnRef, | ||
DownstreamColumnRef, | ||
SqlParsingResult, | ||
) | ||
|
||
|
||
def test_transform_parsing_result_to_in_tables_schemas__empty_parsing_result(): | ||
parsing_result = SqlParsingResult(in_tables=[], out_tables=[], column_lineage=None) | ||
|
||
in_tables_schema = transform_parsing_result_to_in_tables_schemas(parsing_result) | ||
assert not in_tables_schema | ||
|
||
|
||
def test_transform_parsing_result_to_in_tables_schemas__in_tables_only(): | ||
parsing_result = SqlParsingResult( | ||
in_tables=["table_urn1", "table_urn2", "table_urn3"], | ||
out_tables=[], | ||
column_lineage=None, | ||
) | ||
|
||
in_tables_schema = transform_parsing_result_to_in_tables_schemas(parsing_result) | ||
assert in_tables_schema == { | ||
"table_urn1": set(), | ||
"table_urn2": set(), | ||
"table_urn3": set(), | ||
} | ||
|
||
|
||
def test_transform_parsing_result_to_in_tables_schemas__in_tables_and_column_linage(): | ||
parsing_result = SqlParsingResult( | ||
in_tables=["table_urn1", "table_urn2", "table_urn3"], | ||
out_tables=[], | ||
column_lineage=[ | ||
ColumnLineageInfo( | ||
downstream=DownstreamColumnRef(column="out_col1"), | ||
upstreams=[ | ||
ColumnRef(table="table_urn1", column="col11"), | ||
], | ||
), | ||
ColumnLineageInfo( | ||
downstream=DownstreamColumnRef(column="out_col2"), | ||
upstreams=[ | ||
ColumnRef(table="table_urn2", column="col21"), | ||
ColumnRef(table="table_urn2", column="col22"), | ||
], | ||
), | ||
ColumnLineageInfo( | ||
downstream=DownstreamColumnRef(column="out_col3"), | ||
upstreams=[ | ||
ColumnRef(table="table_urn1", column="col12"), | ||
ColumnRef(table="table_urn2", column="col23"), | ||
], | ||
), | ||
], | ||
) | ||
|
||
in_tables_schema = transform_parsing_result_to_in_tables_schemas(parsing_result) | ||
assert in_tables_schema == { | ||
"table_urn1": {"col11", "col12"}, | ||
"table_urn2": {"col21", "col22", "col23"}, | ||
"table_urn3": set(), | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
import pytest | ||
|
||
from datahub.ingestion.source.tableau import TableauSource | ||
|
||
|
||
def test_tableau_source_unescapes_lt(): | ||
res = TableauSource._clean_tableau_query_parameters( | ||
"select * from t where c1 << 135" | ||
) | ||
|
||
assert res == "select * from t where c1 < 135" | ||
|
||
|
||
def test_tableau_source_unescapes_gt(): | ||
res = TableauSource._clean_tableau_query_parameters( | ||
"select * from t where c1 >> 135" | ||
) | ||
|
||
assert res == "select * from t where c1 > 135" | ||
|
||
|
||
def test_tableau_source_unescapes_gte(): | ||
res = TableauSource._clean_tableau_query_parameters( | ||
"select * from t where c1 >>= 135" | ||
) | ||
|
||
assert res == "select * from t where c1 >= 135" | ||
|
||
|
||
def test_tableau_source_unescapeslgte(): | ||
res = TableauSource._clean_tableau_query_parameters( | ||
"select * from t where c1 <<= 135" | ||
) | ||
|
||
assert res == "select * from t where c1 <= 135" | ||
|
||
|
||
def test_tableau_source_doesnt_touch_not_escaped(): | ||
res = TableauSource._clean_tableau_query_parameters( | ||
"select * from t where c1 < 135 and c2 > 15" | ||
) | ||
|
||
assert res == "select * from t where c1 < 135 and c2 > 15" | ||
|
||
|
||
TABLEAU_PARAMS = [ | ||
"<Parameters.MyParam>", | ||
"<Parameters.MyParam_1>", | ||
"<Parameters.My Param _ 1>", | ||
"<Parameters.My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>", | ||
"<[Parameters].MyParam>", | ||
"<[Parameters].MyParam_1>", | ||
"<[Parameters].My Param _ 1>", | ||
"<[Parameters].My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>", | ||
"<Parameters.[MyParam]>", | ||
"<Parameters.[MyParam_1]>", | ||
"<Parameters.[My Param _ 1]>", | ||
"<Parameters.[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<]>", | ||
"<[Parameters].[MyParam]>", | ||
"<[Parameters].[MyParam_1]>", | ||
"<[Parameters].[My Param _ 1]>", | ||
"<[Parameters].[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<]>", | ||
"<Parameters.[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>]>", | ||
"<[Parameters].[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>]>", | ||
] | ||
|
||
|
||
@pytest.mark.parametrize("p", TABLEAU_PARAMS) | ||
def test_tableau_source_cleanups_tableau_parameters_in_equi_predicates(p): | ||
assert ( | ||
TableauSource._clean_tableau_query_parameters( | ||
f"select * from t where c1 = {p} and c2 = {p} and c3 = 7" | ||
) | ||
== "select * from t where c1 = 1 and c2 = 1 and c3 = 7" | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("p", TABLEAU_PARAMS) | ||
def test_tableau_source_cleanups_tableau_parameters_in_lt_gt_predicates(p): | ||
assert ( | ||
TableauSource._clean_tableau_query_parameters( | ||
f"select * from t where c1 << {p} and c2<<{p} and c3 >> {p} and c4>>{p} or {p} >> c1 and {p}>>c2 and {p} << c3 and {p}<<c4" | ||
) | ||
== "select * from t where c1 < 1 and c2<1 and c3 > 1 and c4>1 or 1 > c1 and 1>c2 and 1 < c3 and 1<c4" | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("p", TABLEAU_PARAMS) | ||
def test_tableau_source_cleanups_tableau_parameters_in_lte_gte_predicates(p): | ||
assert ( | ||
TableauSource._clean_tableau_query_parameters( | ||
f"select * from t where c1 <<= {p} and c2<<={p} and c3 >>= {p} and c4>>={p} or {p} >>= c1 and {p}>>=c2 and {p} <<= c3 and {p}<<=c4" | ||
) | ||
== "select * from t where c1 <= 1 and c2<=1 and c3 >= 1 and c4>=1 or 1 >= c1 and 1>=c2 and 1 <= c3 and 1<=c4" | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("p", TABLEAU_PARAMS) | ||
def test_tableau_source_cleanups_tableau_parameters_in_join_predicate(p): | ||
assert ( | ||
TableauSource._clean_tableau_query_parameters( | ||
f"select * from t1 inner join t2 on t1.id = t2.id and t2.c21 = {p} and t1.c11 = 123 + {p}" | ||
) | ||
== "select * from t1 inner join t2 on t1.id = t2.id and t2.c21 = 1 and t1.c11 = 123 + 1" | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("p", TABLEAU_PARAMS) | ||
def test_tableau_source_cleanups_tableau_parameters_in_complex_expressions(p): | ||
assert ( | ||
TableauSource._clean_tableau_query_parameters( | ||
f"select myudf1(c1, {p}, c2) / myudf2({p}) > ({p} + 3 * {p} * c5) * {p} - c4" | ||
) | ||
== "select myudf1(c1, 1, c2) / myudf2(1) > (1 + 3 * 1 * c5) * 1 - c4" | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("p", TABLEAU_PARAMS) | ||
def test_tableau_source_cleanups_tableau_parameters_in_udfs(p): | ||
assert ( | ||
TableauSource._clean_tableau_query_parameters(f"select myudf({p}) from t") | ||
== "select myudf(1) from t" | ||
) |