Merge branch 'datahub-project:master' into master
anshbansal authored Dec 13, 2024
2 parents 5cc3303 + eee49b3 commit 04efac4
Showing 58 changed files with 1,997 additions and 285 deletions.
@@ -24,11 +24,11 @@ const StructuredPropValues = ({ schemaFieldEntity, propColumn }: Props) => {
     const entityRegistry = useEntityRegistry();
 
     const property = schemaFieldEntity.structuredProperties?.properties?.find(
-        (prop) => prop.structuredProperty.urn === propColumn?.entity.urn,
+        (prop) => prop.structuredProperty.urn === propColumn?.entity?.urn,
     );
     const propRow = property ? mapStructuredPropertyToPropertyRow(property) : undefined;
     const values = propRow?.values;
-    const isRichText = propRow?.dataType?.info.type === StdDataType.RichText;
+    const isRichText = propRow?.dataType?.info?.type === StdDataType.RichText;
 
     const hasMoreValues = values && values.length > 2;
     const displayedValues = hasMoreValues ? values.slice(0, 1) : values;
@@ -41,8 +41,8 @@ const StructuredPropertyBadge = ({ structuredProperties }: Props) => {
 
     if (!badgeStructuredProperty) return null;
 
-    const propertyValue = propRow?.values[0].value;
-    const relatedDescription = propRow?.structuredProperty.definition.allowedValues?.find(
+    const propertyValue = propRow?.values[0]?.value;
+    const relatedDescription = propRow?.structuredProperty?.definition?.allowedValues?.find(
         (v) => getStructuredPropertyValue(v.value) === propertyValue,
     )?.description;
 
@@ -56,7 +56,7 @@ const StructuredPropertyBadge = ({ structuredProperties }: Props) => {
                     <Text color="gray" size="sm" weight="bold">
                         Value
                     </Text>
-                    <Text color="gray">{propRow?.values[0].value}</Text>
+                    <Text color="gray">{propRow?.values[0]?.value}</Text>
                 </ValueContainer>
                 {relatedDescription && (
                     <ValueContainer>
@@ -79,7 +79,7 @@ const StructuredPropertyBadge = ({ structuredProperties }: Props) => {
                 >
                     <BadgeContainer>
                         <Pill
-                            label={propRow?.values[0].value?.toString() || ''}
+                            label={propRow?.values[0]?.value?.toString() || ''}
                             size="sm"
                             colorScheme="violet"
                             clickable={false}
@@ -87,7 +87,7 @@ const SidebarStructuredPropsSection = ({ properties }: Props) => {
                 property,
                 currentProperties,
             );
-            const isRichText = propertyRow?.dataType?.info.type === StdDataType.RichText;
+            const isRichText = propertyRow?.dataType?.info?.type === StdDataType.RichText;
             const values = propertyRow?.values;
             const hasMultipleValues = values && values.length > 1;
             const propertyName = getDisplayName(property.entity as StructuredPropertyEntity);
@@ -57,7 +57,7 @@ function Form({ formUrn }: Props) {
     const title = formAssociation?.form?.info?.name;
     const associatedUrn = formAssociation?.associatedUrn;
     const description = formAssociation?.form?.info?.description;
-    const owners = formAssociation?.form.ownership?.owners;
+    const owners = formAssociation?.form?.ownership?.owners;
 
     return (
         <TabWrapper>
@@ -99,7 +99,7 @@ export default function EditStructuredPropertyModal({
 
     return (
         <Modal
-            title={`${isAddMode ? 'Add property' : 'Edit property'} ${structuredProperty?.definition.displayName}`}
+            title={`${isAddMode ? 'Add property' : 'Edit property'} ${structuredProperty?.definition?.displayName}`}
            onCancel={closeModal}
            open={isOpen}
            width={650}
@@ -39,7 +39,7 @@ const StructuredProperties = () => {
 
     const searchAcrossEntities = data?.searchAcrossEntities;
     const noOfProperties = searchAcrossEntities?.searchResults?.length;
-    const badgeProperty = searchAcrossEntities?.searchResults.find(
+    const badgeProperty = searchAcrossEntities?.searchResults?.find(
         (prop) => (prop.entity as StructuredPropertyEntity).settings?.showAsAssetBadge,
     )?.entity;
 
@@ -32,7 +32,7 @@ const ViewAdvancedOptions = ({ propEntity }: Props) => {
             {propEntity && (
                 <RowContainer>
                     <StyledLabel>Qualified Name</StyledLabel>
-                    <Text color="gray"> {propEntity?.definition.qualifiedName}</Text>
+                    <Text color="gray"> {propEntity?.definition?.qualifiedName}</Text>
                 </RowContainer>
             )}
         </Collapse.Panel>
@@ -40,9 +40,9 @@ const ViewStructuredPropsDrawer = ({
 
     const selectedPropEntity = selectedProperty && (selectedProperty?.entity as StructuredPropertyEntity);
 
-    const allowedValues = selectedPropEntity?.definition.allowedValues;
+    const allowedValues = selectedPropEntity?.definition?.allowedValues;
 
-    const allowedTypes = selectedPropEntity?.definition.typeQualifier?.allowedTypes;
+    const allowedTypes = selectedPropEntity?.definition?.typeQualifier?.allowedTypes;
 
     const propType = getValueTypeLabel(
         selectedPropEntity.definition.valueType.urn,
@@ -156,7 +156,7 @@ export const ExecutionDetailsModal = ({ urn, open, onClose }: Props) => {
         (status && <Typography.Text type="secondary">{getExecutionRequestSummaryText(status)}</Typography.Text>) ||
         undefined;
 
-    const recipeJson = data?.executionRequest?.input.arguments?.find((arg) => arg.key === 'recipe')?.value;
+    const recipeJson = data?.executionRequest?.input?.arguments?.find((arg) => arg.key === 'recipe')?.value;
     let recipeYaml: string;
     try {
         recipeYaml = recipeJson && YAML.stringify(JSON.parse(recipeJson), 8, 2).trim();
6 changes: 3 additions & 3 deletions datahub-web-react/yarn.lock
@@ -8921,9 +8921,9 @@ nanoevents@^5.1.13:
   integrity sha512-JFAeG9fp0QZnRoESHjkbVFbZ9BkOXkkagUVwZVo/pkSX+Fq1VKlY+5og/8X9CYc6C7vje/CV+bwJ5M2X0+IY9Q==
 
 nanoid@^3.3.6:
-  version "3.3.6"
-  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.6.tgz#443380c856d6e9f9824267d960b4236ad583ea4c"
-  integrity sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==
+  version "3.3.8"
+  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.8.tgz#b1be3030bee36aaff18bacb375e5cce521684baf"
+  integrity sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==
 
 natural-compare-lite@^1.4.0:
   version "1.4.0"
8 changes: 6 additions & 2 deletions docs-website/generateDocsDir.ts
@@ -284,6 +284,10 @@ function markdown_add_slug(
 //   );
 // }
 
+function trim_anchor_link(url: string): string {
+  return url.replace(/#.+$/, "");
+}
+
 function new_url(original: string, filepath: string): string {
   if (original.toLowerCase().startsWith(HOSTED_SITE_URL)) {
     // For absolute links to the hosted docs site, we transform them into local ones.
@@ -313,7 +317,7 @@ function new_url(original: string, filepath: string): string {
   }
 
   // Now we assume this is a local reference.
-  const suffix = path.extname(original);
+  const suffix = path.extname(trim_anchor_link(original));
   if (
     suffix == "" ||
     [
@@ -335,7 +339,7 @@ function new_url(original: string, filepath: string): string {
   // A reference to a file or directory in the Github repo.
   const relation = path.dirname(filepath);
   const updated_path = path.normalize(`${relation}/${original}`);
-  const check_path = updated_path.replace(/#.+$/, "");
+  const check_path = trim_anchor_link(updated_path);
   if (
     !fs.existsSync(`../${check_path}`) &&
     actually_in_sidebar(filepath) &&
1 change: 1 addition & 0 deletions docs-website/sidebars.js
@@ -541,6 +541,7 @@ module.exports = {
     },
 
     "docs/platform-instances",
+    "docs/lineage/sql_parsing",
     "metadata-ingestion/docs/dev_guides/stateful",
     "metadata-ingestion/docs/dev_guides/classification",
     "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source",
6 changes: 3 additions & 3 deletions docs-website/yarn.lock
@@ -8262,9 +8262,9 @@ multicast-dns@^7.2.5:
   thunky "^1.0.2"
 
 nanoid@^3.3.7:
-  version "3.3.7"
-  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.7.tgz#d0c301a691bc8d54efa0a2226ccf3fe2fd656bd8"
-  integrity sha512-eSRppjcPIatRIMC1U6UngP8XFcz8MQWGQdt1MTBQ7NaAmvXDfvNxbvWV3x2y6CdEUciCSsDHDQZbhYaB8QEo2g==
+  version "3.3.8"
+  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.8.tgz#b1be3030bee36aaff18bacb375e5cce521684baf"
+  integrity sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==
 
 napi-build-utils@^1.0.1:
   version "1.0.2"
2 changes: 1 addition & 1 deletion docs/lineage/airflow.md
@@ -164,7 +164,7 @@ Only the v2 plugin supports automatic lineage extraction. If you're using the v1
 To automatically extract lineage information, the v2 plugin builds on top of Airflow's built-in [OpenLineage extractors](https://openlineage.io/docs/integrations/airflow/default-extractors).
 As such, we support a superset of the default operators that Airflow/OpenLineage supports.
 
-The SQL-related extractors have been updated to use [DataHub's SQL lineage parser](https://blog.datahubproject.io/extracting-column-level-lineage-from-sql-779b8ce17567), which is more robust than the built-in one and uses DataHub's metadata information to generate column-level lineage.
+The SQL-related extractors have been updated to use [DataHub's SQL lineage parser](./sql_parsing.md), which is more robust than the built-in one and uses DataHub's metadata information to generate column-level lineage.
 
 Supported operators:
 
63 changes: 63 additions & 0 deletions docs/lineage/sql_parsing.md
@@ -0,0 +1,63 @@
---
title: SQL Parsing
---

# The DataHub SQL Parser

Many data platforms are built on top of SQL, which means deeply understanding SQL queries is critical for understanding column-level lineage, usage, and more.

DataHub's SQL parser is built on top of [sqlglot](https://github.com/tobymao/sqlglot) and adds a number of additional features to improve the accuracy of SQL parsing.

In our benchmarks, the DataHub SQL parser generates lineage with 97-99% accuracy and outperforms other SQL parsers by a wide margin.

We've published a blog post on some of the technical details of the parser: [Extracting Column Lineage from SQL Queries](https://blog.datahubproject.io/extracting-column-level-lineage-from-sql-779b8ce17567).

## Built-in SQL Parsing Support

If you're using a tool that DataHub already [integrates with](https://datahubproject.io/integrations), check the documentation for that specific integration.
Most of our integrations, including Snowflake, BigQuery, Redshift, dbt, Looker, PowerBI, and Airflow, among others, use the SQL parser to generate column-level lineage and usage statistics.

If you’re using a different database system for which we don’t support column-level lineage out of the box, but you do have a database query log available, the [SQL queries](../generated/ingestion/sources/sql-queries.md) connector can generate column-level lineage and table/column usage statistics from the query log.

## SDK Support

Our SDK provides a [`DataHubGraph.parse_sql_lineage()`](../../python-sdk/clients.md#datahub.ingestion.graph.client.DataHubGraph.parse_sql_lineage) method for programmatically parsing SQL queries.

The resulting object contains a `sql_parsing_result.debug_info.confidence_score` field, which is a 0-1 value indicating the confidence of the parser.
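
As a quick illustration, here's a minimal sketch of driving the parser through the SDK. The server address and query are placeholders, and the extra result fields shown (`in_tables`, `out_tables`, `column_lineage`) reflect the current `SqlParsingResult` shape rather than a stable contract:

```python
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Connect to a DataHub instance so the parser can consult registered table
# schemas (used for `SELECT *` expansion and column-level lineage).
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

result = graph.parse_sql_lineage(
    """
    CREATE TABLE sales.summary AS
    SELECT region, SUM(amount) AS total_amount
    FROM sales.orders
    GROUP BY region
    """,
    platform="snowflake",
    default_db="ANALYTICS",
)

print(result.in_tables)       # upstream dataset URNs (sales.orders)
print(result.out_tables)      # downstream dataset URNs (sales.summary)
print(result.column_lineage)  # per-column upstream-to-downstream mappings
print(result.debug_info.confidence_score)  # 0-1 parser confidence
```

Because the parser consults the schemas registered in DataHub, running it against a live instance produces richer column-level lineage than offline parsing alone.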

There are also a number of utilities in the `datahub.sql_parsing` module. The `SqlParsingAggregator` is particularly useful, as it can also resolve lineage across temp tables and table renames/swaps.
Note that these utilities are not officially part of the DataHub SDK and hence do not have the same level of stability and support as the rest of the SDK.

## Capabilities

### Supported

- Table-level lineage for `SELECT`, `CREATE`, `INSERT`, `UPDATE`, `DELETE`, and `MERGE` statements
- Column-level lineage for `SELECT` (including `SELECT INTO`), `CREATE VIEW`, `CREATE TABLE AS SELECT` (CTAS), `INSERT`, and `UPDATE` statements
- Subqueries
- CTEs
- `UNION ALL` constructs - will merge lineage across the clauses of the `UNION` (see the sketch after this list)
- `SELECT *` and similar expressions will automatically be expanded with the table schemas registered in DataHub. This includes support for platform instances.
- Automatic handling for systems where table and column names are case insensitive. Generally requires that `convert_urns_to_lowercase` is enabled when the corresponding table schemas were ingested into DataHub.
  - Specifically, we'll do fuzzy matching against the table names and schemas to resolve the correct URNs. We do not support having multiple tables/columns that only differ in casing.
- For BigQuery, sharded table suffixes will automatically be normalized. For example, `proj.dataset.table_20230616` will be normalized to `proj.dataset.table_yyyymmdd`. This matches the behavior of our BigQuery ingestion connector, and hence will result in lineage linking up correctly.
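
For instance, parsing a `UNION ALL` query (a sketch reusing the hypothetical connection from the SDK example above; table names are placeholders) yields lineage merged from every clause:

```python
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

union_result = graph.parse_sql_lineage(
    "SELECT id, amount FROM sales.us_orders"
    " UNION ALL"
    " SELECT id, amount FROM sales.eu_orders",
    platform="snowflake",
    default_db="ANALYTICS",
)

# Both upstream tables appear in the merged lineage, and each output
# column draws from its counterpart in every clause of the UNION.
print(union_result.in_tables)
print(union_result.column_lineage)
```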

### Not supported

- Scalar `UDFs` - We will generate lineage pointing at the columns that are inputs to the UDF, but will not be able to understand the UDF itself.
- Tabular `UDFs`
- `json_extract` and similar functions
- `UNNEST` - We will do a best-effort job, but cannot reliably generate column-level lineage in the presence of `UNNEST` constructs.
- Structs - We will do a best-effort attempt to resolve struct subfields, but it is not guaranteed. This will only impact column-level lineage.
- Snowflake's multi-table inserts
- Multi-statement SQL / SQL scripting

### Limitations

- We only support the 20+ SQL dialects supported by the underlying [sqlglot](https://github.com/tobymao/sqlglot) library.
- There are a few SQL syntaxes that we don't support yet, but intend to support in the future.
  - `INSERT INTO (col1_new, col2_new) SELECT col1_old, col2_old FROM ...`. We only support `INSERT INTO` statements that either (1) don't specify a column list, or (2) specify a column list that matches the columns in the `SELECT` clause.
  - `MERGE INTO` statements - We don't generate column-level lineage for these.
- In cases where the table schema information in DataHub is outdated or otherwise incorrect, we may not be able to generate accurate column-level lineage.
- We trip over BigQuery queries that use the `_partitiontime` and `_partitiondate` pseudo-columns with a table name prefix (e.g. `my_table._partitiontime` fails). Unqualified references like `_partitiontime` and `_partitiondate` work fine.
- We do not consider columns referenced in `WHERE`, `GROUP BY`, `ORDER BY`, etc. clauses to be part of lineage. For example, `SELECT col1, col2 FROM upstream_table WHERE col3 = 3` will not generate any lineage related to `col3`.
@@ -0,0 +1,35 @@
package com.linkedin.metadata.entity;

import com.linkedin.metadata.query.filter.Filter;
import com.linkedin.metadata.search.ScrollResult;
import java.util.Iterator;
import java.util.List;
import javax.annotation.Nonnull;
import lombok.Builder;

/**
 * Fetches pages of structured properties which have been applied to an entity urn with a specified
 * filter
 */
@Builder
public class GenericScrollIterator implements Iterator<ScrollResult> {
  @Nonnull private final Filter filter;
  @Nonnull private final List<String> entities;
  @Nonnull private final SearchRetriever searchRetriever;
  private int count;
  @Builder.Default private String scrollId = null;
  @Builder.Default private boolean started = false;

  @Override
  public boolean hasNext() {
    return !started || scrollId != null;
  }

  @Override
  public ScrollResult next() {
    started = true;
    ScrollResult result = searchRetriever.scroll(entities, filter, scrollId, count);
    scrollId = result.getScrollId();
    return result;
  }
}
2 changes: 2 additions & 0 deletions li-utils/src/main/java/com/linkedin/metadata/Constants.java
@@ -108,6 +108,8 @@ public class Constants {
   // Common
   public static final String OWNERSHIP_ASPECT_NAME = "ownership";
 
+  public static final String TIMESTAMP_MILLIS = "timestampMillis";
+
   public static final String INSTITUTIONAL_MEMORY_ASPECT_NAME = "institutionalMemory";
   public static final String DATA_PLATFORM_INSTANCE_ASPECT_NAME = "dataPlatformInstance";
   public static final String BROWSE_PATHS_ASPECT_NAME = "browsePaths";
4 changes: 2 additions & 2 deletions metadata-ingestion/scripts/docgen.py
@@ -918,7 +918,7 @@ def generate(
 <img width="80%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/lineage/lineage-tab.png" />
 </p>
 
-By default, The UI shows the latest version of the lineage. The time picker can be used to filter out edges within the latest version to exclude those that were last updated outside of the time window. Selecting time windows in the patch will not show you historical lineages. It will only filter the view of the latest version of the lineage.
+By default, the UI shows the latest version of the lineage. The time picker can be used to filter out edges within the latest version to exclude those that were last updated outside of the time window. Selecting time windows in the patch will not show you historical lineages. It will only filter the view of the latest version of the lineage.
 
 <p align="center">
 <img width="80%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/lineage/lineage-view.png" />
@@ -969,7 +969,7 @@ def generate(
 ## Lineage Support
 
 DataHub supports **[automatic table- and column-level lineage detection](#automatic-lineage-extraction-support)** from BigQuery, Snowflake, dbt, Looker, PowerBI, and 20+ modern data tools.
 
-For data tools with limited native lineage tracking, **DataHub's SQL Parser** detects lineage with 97-99% accuracy, ensuring teams will have high quality lineage graphs across all corners of their data stack.
+For data tools with limited native lineage tracking, [**DataHub's SQL Parser**](../../lineage/sql_parsing.md) detects lineage with 97-99% accuracy, ensuring teams will have high quality lineage graphs across all corners of their data stack.
 
 ### Types of Lineage Connections
@@ -119,7 +119,6 @@ def is_dataset_pattern_allowed(
     ) -> bool:
         if not dataset_type or not dataset_name:
             return True
-        dataset_params = dataset_name.split(".")
         if dataset_type.lower() not in (
             SnowflakeObjectDomain.TABLE,
             SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ def is_dataset_pattern_allowed(
         if _is_sys_table(dataset_name):
             return False
 
+        dataset_params = _split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
@@ -1383,8 +1383,7 @@ def _query_urn(cls, query_id: QueryId) -> str:
         return QueryUrn(query_id).urn()
 
     @classmethod
-    def _composite_query_id(cls, composed_of_queries: Iterable[QueryId]) -> str:
-        composed_of_queries = list(composed_of_queries)
+    def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
         combined = json.dumps(composed_of_queries)
         return f"composite_{generate_hash(combined)}"
 
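A note on the `Iterable` to `List` change above: `json.dumps` cannot serialize arbitrary iterables such as generators, which is why the removed line had to materialize the input first. A minimal standalone illustration of the failure mode (plain Python, not DataHub code):

```python
import json

# Lists serialize directly.
json.dumps(["q1", "q2"])

# Generators do not; json.dumps raises TypeError for them.
try:
    json.dumps(q for q in ["q1", "q2"])
except TypeError as e:
    print(e)  # Object of type generator is not JSON serializable
```

Typing the parameter as `List[QueryId]` pushes that materialization to the caller and makes the JSON-serializability requirement explicit in the signature.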
10 changes: 8 additions & 2 deletions metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py
@@ -121,7 +121,7 @@ def _expression_to_string(
     # Remove /* */ comments.
     re.compile(r"/\*.*?\*/", re.DOTALL): "",
     # Remove -- comments.
-    re.compile(r"--.*$"): "",
+    re.compile(r"--.*$", re.MULTILINE): "",
     # Replace all runs of whitespace with a single space.
     re.compile(r"\s+"): " ",
     # Remove leading and trailing whitespace and trailing semicolons.
@@ -131,10 +131,16 @@ def _expression_to_string(
     # Replace anything that looks like a string with a placeholder.
     re.compile(r"'[^']*'"): "?",
     # Replace sequences of IN/VALUES with a single placeholder.
-    re.compile(r"\b(IN|VALUES)\s*\(\?(?:, \?)*\)", re.IGNORECASE): r"\1 (?)",
+    # The r" ?" makes it more robust to uneven spacing.
+    re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
     # Normalize parenthesis spacing.
     re.compile(r"\( "): "(",
     re.compile(r" \)"): ")",
+    # Fix up spaces before commas in column lists.
+    # e.g. "col1 , col2" -> "col1, col2"
+    # e.g. "col1,col2" -> "col1, col2"
+    re.compile(r"\b ,"): ",",
+    re.compile(r"\b,\b"): ", ",
 }
 _TABLE_NAME_NORMALIZATION_RULES = {
     # Replace UUID-like strings with a placeholder (both - and _ variants).
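To make the effect of these rules concrete, here is a self-contained sketch (a simplified mirror of the table above, not the actual DataHub implementation). With `re.MULTILINE`, `--` comments are stripped from every line of a multi-line query instead of only the last one, so differently spaced, quoted, and commented variants collapse to the same generalized form:

```python
import re

# Simplified copy of the normalization table: patterns are applied in
# insertion order to reduce a query to a generalized, fingerprint-ready form.
_RULES = {
    re.compile(r"/\*.*?\*/", re.DOTALL): "",  # strip /* */ comments
    re.compile(r"--.*$", re.MULTILINE): "",  # strip -- comments on every line
    re.compile(r"\s+"): " ",  # collapse whitespace runs
    re.compile(r"'[^']*'"): "?",  # replace string literals
    re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
}


def generalize(query: str) -> str:
    for pattern, replacement in _RULES.items():
        query = pattern.sub(replacement, query)
    return query.strip().rstrip(";")


# Differently spaced, quoted, and commented variants collapse to one form:
a = generalize("SELECT a FROM t -- lookup\nWHERE b IN ('x', 'y')")
b = generalize("SELECT a FROM t WHERE b IN ('z');")
assert a == b == "SELECT a FROM t WHERE b IN (?)"
```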