diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index a1bfc4d39a370..e910449ed5870 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -7,6 +7,7 @@ on: paths: - "metadata-ingestion/**" - "metadata-models/**" + - "docs/**" - "docs-website/**" push: branches: @@ -14,6 +15,7 @@ on: paths: - "metadata-ingestion/**" - "metadata-models/**" + - "docs/**" - "docs-website/**" # release: # types: [published, edited] diff --git a/.github/workflows/pr-labeler.yml b/.github/workflows/pr-labeler.yml index fc6bdb856816f..4fdb4a5b92e4d 100644 --- a/.github/workflows/pr-labeler.yml +++ b/.github/workflows/pr-labeler.yml @@ -43,7 +43,9 @@ jobs: "gaurav2733", "dushayntAW", "AvaniSiddhapuraAPT", - "akarsh991" + "akarsh991", + "shubhamjagtap639", + "mayurinehate" ]'), github.actor ) diff --git a/build.gradle b/build.gradle index 833dbaeb21d94..9c18ca62fb3c1 100644 --- a/build.gradle +++ b/build.gradle @@ -107,7 +107,7 @@ project.ext.externalDependency = [ 'avro': 'org.apache.avro:avro:1.11.3', 'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3', 'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.17', - 'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:1.1.9', + 'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:2.0.3', 'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.13', 'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:1.0.2', 'awsRds':'software.amazon.awssdk:rds:2.18.24', diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 782f9a05dfb25..8ee07db0fde30 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -69,9 +69,7 @@ dependencies { runtimeOnly externalDependency.mysqlConnector runtimeOnly externalDependency.postgresql - implementation(externalDependency.awsMskIamAuth) { - exclude group: 'software.amazon.awssdk', module: 'third-party-jackson-core' - } + implementation externalDependency.awsMskIamAuth annotationProcessor externalDependency.lombok annotationProcessor externalDependency.picocli diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx index 26228e8c44515..2d93f3cc73470 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx @@ -106,6 +106,8 @@ type Props = { shouldRefetch?: boolean; resetShouldRefetch?: () => void; applyView?: boolean; + onLineageClick?: () => void; + isLineageTab?: boolean; }; export const EmbeddedListSearch = ({ @@ -134,6 +136,8 @@ export const EmbeddedListSearch = ({ shouldRefetch, resetShouldRefetch, applyView = false, + onLineageClick, + isLineageTab = false, }: Props) => { const { shouldRefetchEmbeddedListSearch, setShouldRefetchEmbeddedListSearch } = useEntityContext(); // Adjust query based on props @@ -143,7 +147,6 @@ export const EmbeddedListSearch = ({ unionType, filters, }; - const finalFilters = (fixedFilters && mergeFilterSets(fixedFilters, baseFilters)) || generateOrFilters(unionType, filters); @@ -191,6 +194,12 @@ export const EmbeddedListSearch = ({ fetchPolicy: 'cache-first', }); + const [serverError, setServerError] = useState(undefined); + + useEffect(() => { + setServerError(error); + }, [error]); + useEffect(() => { if (shouldRefetch && resetShouldRefetch) { 
refetch({ @@ -282,9 +291,18 @@ export const EmbeddedListSearch = ({ }); } + const isServerOverloadError = [503, 500, 504].includes(serverError?.networkError?.response?.status); + + const onClickLessHops = () => { + setServerError(undefined); + onChangeFilters(defaultFilters); + }; + + const ErrorMessage = () => ; + return ( - {error && } + {!isLineageTab ? error && : serverError && !isServerOverloadError && } onChangeQuery(addFixedQuery(q, fixedQuery as string, emptySearchQuery as string))} placeholderText={placeholderText} @@ -303,6 +321,10 @@ export const EmbeddedListSearch = ({ /> void; entityAction?: React.FC; applyView?: boolean; + isServerOverloadError?: any; + onClickLessHops?: () => void; + onLineageClick?: () => void; + isLineageTab?: boolean; } export const EmbeddedListSearchResults = ({ @@ -104,6 +122,10 @@ export const EmbeddedListSearchResults = ({ setNumResultsPerPage, entityAction, applyView, + isServerOverloadError, + onClickLessHops, + onLineageClick, + isLineageTab = false, }: Props) => { const pageStart = searchResponse?.start || 0; const pageSize = searchResponse?.count || 0; @@ -131,7 +153,19 @@ export const EmbeddedListSearchResults = ({ } /> )} - {!loading && ( + {isLineageTab && !loading && isServerOverloadError && ( + + Data is too large. Please use + + visualize lineage + + or see less hops by clicking + + here + + + )} + {!loading && !isServerOverloadError && ( void; applyView?: boolean; + onLineageClick?: () => void; + isLineageTab?: boolean; }; export const EmbeddedListSearchSection = ({ @@ -69,6 +71,8 @@ export const EmbeddedListSearchSection = ({ shouldRefetch, resetShouldRefetch, applyView, + onLineageClick, + isLineageTab }: Props) => { const history = useHistory(); const location = useLocation(); @@ -155,6 +159,8 @@ export const EmbeddedListSearchSection = ({ shouldRefetch={shouldRefetch} resetShouldRefetch={resetShouldRefetch} applyView={applyView} + onLineageClick={onLineageClick} + isLineageTab={isLineageTab} /> ); }; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Lineage/ImpactAnalysis.tsx b/datahub-web-react/src/app/entity/shared/tabs/Lineage/ImpactAnalysis.tsx index ce5a1598a00ec..4f1c5bb98807d 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Lineage/ImpactAnalysis.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Lineage/ImpactAnalysis.tsx @@ -13,6 +13,8 @@ type Props = { skipCache?: boolean; setSkipCache?: (skipCache: boolean) => void; resetShouldRefetch?: () => void; + onLineageClick?: () => void; + isLineageTab?: boolean; }; export const ImpactAnalysis = ({ @@ -24,6 +26,8 @@ export const ImpactAnalysis = ({ skipCache, setSkipCache, resetShouldRefetch, + onLineageClick, + isLineageTab }: Props) => { const finalStartTimeMillis = startTimeMillis || undefined; const finalEndTimeMillis = endTimeMillis || undefined; @@ -49,6 +53,8 @@ export const ImpactAnalysis = ({ defaultFilters={[{ field: 'degree', values: ['1'] }]} shouldRefetch={shouldRefetch} resetShouldRefetch={resetShouldRefetch} + onLineageClick={onLineageClick} + isLineageTab={isLineageTab} /> ); }; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Lineage/LineageTab.tsx b/datahub-web-react/src/app/entity/shared/tabs/Lineage/LineageTab.tsx index bbc86d49404a6..a5debe43d4f0f 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Lineage/LineageTab.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Lineage/LineageTab.tsx @@ -181,6 +181,8 @@ export const LineageTab = ({ ', }, { - href: "https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/", - label: 
"0.10.5", + type: 'html', + value: '', + }, + { + value: ` + 0.11.0 + + + `, + type: "html", + }, + { + value: ` + 0.10.5 + + + `, + type: "html", }, ], }, diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 13bda5d735f3e..2b60906b794a2 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -173,6 +173,7 @@ module.exports = { }, { "Managed DataHub Release History": [ + "docs/managed-datahub/release-notes/v_0_2_16", "docs/managed-datahub/release-notes/v_0_2_15", "docs/managed-datahub/release-notes/v_0_2_14", "docs/managed-datahub/release-notes/v_0_2_13", diff --git a/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss b/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss index 61739d5b6922c..69558d986ada9 100644 --- a/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss +++ b/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss @@ -1,11 +1,15 @@ +@media (min-width: 997px) and (max-width: 1465px) { + .feature { + min-height: 20rem !important; + max-height: 30rem !important; + } +} .feature { flex-direction: row; - padding: 1.75rem; color: var(--ifm-hero-text-color); margin: 0rem 2rem 1rem 0rem; - min-height: 14rem; - max-height: 15rem; - overflow: hidden; + min-height: 15rem; + max-height: 15rem; text-decoration: none !important; img { @@ -36,3 +40,4 @@ border-color: var(--ifm-color-primary); } } + diff --git a/docs-website/src/pages/docs/_components/FeatureCard/index.jsx b/docs-website/src/pages/docs/_components/FeatureCard/index.jsx index 407e8eb401987..8fb24493e50e9 100644 --- a/docs-website/src/pages/docs/_components/FeatureCard/index.jsx +++ b/docs-website/src/pages/docs/_components/FeatureCard/index.jsx @@ -8,7 +8,7 @@ const FeatureCard = ({icon, title, description, to}) => { return (
-
+
{icon} {title} → {description} diff --git a/docs-website/src/pages/docs/_components/QuickstartCard/index.jsx b/docs-website/src/pages/docs/_components/QuickstartCard/index.jsx index b4e3895fa40e7..d23901506dcce 100644 --- a/docs-website/src/pages/docs/_components/QuickstartCard/index.jsx +++ b/docs-website/src/pages/docs/_components/QuickstartCard/index.jsx @@ -9,9 +9,13 @@ const QuickstartCard = ({ icon, title, to, color, fontColor }) => { return (
- -
- {title} → +
+ +
+
+ {title} → +
+
diff --git a/docs-website/src/pages/docs/_components/QuickstartCard/quickstartcard.module.scss b/docs-website/src/pages/docs/_components/QuickstartCard/quickstartcard.module.scss index fd35a4b777c99..70515919060e6 100644 --- a/docs-website/src/pages/docs/_components/QuickstartCard/quickstartcard.module.scss +++ b/docs-website/src/pages/docs/_components/QuickstartCard/quickstartcard.module.scss @@ -2,47 +2,34 @@ flex-direction: row; height: 10rem; flex-shrink: 0; - padding: 3rem; color: var(--ifm-text-color); margin: 0rem 2rem 1rem 0rem; min-height: calc(100% - 1rem); text-decoration: none !important; - img { width: 3rem; height: 3rem; - margin: auto 1rem; + margin: auto; } svg { width: 1.5rem; height: 1.5rem; margin-right: 0.75rem; } - strong, - span { - display: block; - margin-bottom: 0.25rem; - } - strong { - font-weight: 600; - padding: auto 0; - } - span { - font-size: 0.875rem; - line-height: 1.25em; - } &:hover { border-color: var(--ifm-color-primary); } - .quickstart-text { - margin: auto 0; - } - } -.quickstart-text { - margin: auto 0; +.card_content { + display: flex; + margin: 0 auto; +} + +.card_title { + padding-left: 1rem; + font-weight: 600; } \ No newline at end of file diff --git a/docs-website/src/pages/docs/_components/QuickstartCards/quickstartcards.module.scss b/docs-website/src/pages/docs/_components/QuickstartCards/quickstartcards.module.scss index 4fbbc4583d662..833ec97b15ca3 100644 --- a/docs-website/src/pages/docs/_components/QuickstartCards/quickstartcards.module.scss +++ b/docs-website/src/pages/docs/_components/QuickstartCards/quickstartcards.module.scss @@ -15,12 +15,9 @@ height: 1.5rem; margin-right: 0.75rem; } - strong, - span { + strong { display: block; margin-bottom: 0.25rem; - } - strong { font-weight: 600; } diff --git a/docs-website/src/pages/docs/index.js b/docs-website/src/pages/docs/index.js index 11f1b3344a3d8..3f123e7b488ba 100644 --- a/docs-website/src/pages/docs/index.js +++ b/docs-website/src/pages/docs/index.js @@ -1,288 +1,8 @@ -import React from "react"; -import Layout from "@theme/Layout"; -import useDocusaurusContext from "@docusaurus/useDocusaurusContext"; -import SearchBar from "./_components/SearchBar"; -import QuickLinkCards from "./_components/QuickLinkCards"; -import GuideList from "./_components/GuideList"; +import React from 'react'; +import { Redirect } from '@docusaurus/router'; -import { - FolderTwoTone, - BookTwoTone, - TagsTwoTone, - ApiTwoTone, - SearchOutlined, - CompassTwoTone, - NodeExpandOutlined, - CheckCircleTwoTone, - SafetyCertificateTwoTone, - LockTwoTone, - SlackOutlined, - HistoryOutlined, - InteractionOutlined, - GlobalOutlined, - FileTextOutlined, -} from "@ant-design/icons"; +const Home = () => { + return ; +}; -//quickLinkCards -import { - ThunderboltTwoTone, - DeploymentUnitOutlined, - SyncOutlined, - CodeTwoTone, - QuestionCircleTwoTone, - SlidersTwoTone, - HeartTwoTone, -} from "@ant-design/icons"; - -const deploymentGuideContent = [ - { - title: "Managed DataHub", - platformIcon: "acryl", - to: "docs/managed-datahub/managed-datahub-overview", - }, - { - title: "Docker", - platformIcon: "docker", - to: "docs/docker", - }, - // { - // title: "AWS ECS", - // platformIcon: "amazon-ecs", - // to: "docs/deploy/aws", - // }, - { - title: "AWS", - platformIcon: "amazon-eks", - to: "docs/deploy/aws", - }, - { - title: "GCP", - platformIcon: "google-cloud", - to: "docs/deploy/gcp", - }, -]; - -const ingestionGuideContent = [ - { - title: "Snowflake", - platformIcon: "snowflake", - to: 
"docs/generated/ingestion/sources/snowflake", - }, - { - title: "Looker", - platformIcon: "looker", - to: "docs/generated/ingestion/sources/looker", - }, - { - title: "Redshift", - platformIcon: "redshift", - to: "docs/generated/ingestion/sources/redshift", - }, - { - title: "Hive", - platformIcon: "hive", - to: "docs/generated/ingestion/sources/hive", - }, - { - title: "BigQuery", - platformIcon: "bigquery", - to: "docs/generated/ingestion/sources/bigquery", - }, - { - title: "dbt", - platformIcon: "dbt", - to: "docs/generated/ingestion/sources/dbt", - }, - { - title: "Athena", - platformIcon: "athena", - to: "docs/generated/ingestion/sources/athena", - }, - { - title: "PostgreSQL", - platformIcon: "postgres", - to: "docs/generated/ingestion/sources/postgres", - }, -]; - -const featureGuideContent = [ - { title: "Domains", icon: , to: "docs/domains" }, - { - title: "Glossary Terms", - icon: , - to: "docs/glossary/business-glossary", - }, - { title: "Tags", icon: , to: "docs/tags" }, - { - title: "Ingestion", - icon: , - to: "docs/ui-ingestion", - }, - { title: "Search", icon: , to: "docs/how/search" }, - // { title: "Browse", icon: , to: "/docs/quickstart" }, - { - title: "Lineage Impact Analysis", - icon: , - to: "docs/act-on-metadata/impact-analysis", - }, - { - title: "Metadata Tests", - icon: , - to: "docs/tests/metadata-tests", - }, - { - title: "Approval Flows", - icon: , - to: "docs/managed-datahub/approval-workflows", - }, - { - title: "Personal Access Tokens", - icon: , - to: "docs/authentication/personal-access-tokens", - }, - { - title: "Slack Notifications", - icon: , - to: "docs/managed-datahub/saas-slack-setup", - }, - { - title: "Schema History", - icon: , - to: "docs/schema-history", - }, -]; - -const quickLinkContent = [ - { - title: "Get Started", - icon: , - description: "Details on how to get DataHub up and running", - to: "/docs/quickstart", - }, - { - title: "Ingest Metadata", - icon: , - description: "Details on how to get Metadata loaded into DataHub", - to: "/docs/metadata-ingestion", - }, - { - title: "API", - icon: , - description: "Details on how to utilize Metadata programmatically", - to: "docs/api/datahub-apis", - }, - { - title: "Act on Metadata", - icon: , - description: "Step-by-step guides for acting on Metadata Events", - to: "docs/act-on-metadata", - }, - { - title: "Developer Guides", - icon: , - description: "Interact with DataHub programmatically", - to: "/docs/api/datahub-apis", - }, - { - title: "Feature Guides", - icon: , - description: "Step-by-step guides for making the most of DataHub", - to: "/docs/how/search", - }, - { - title: "Deployment Guides", - icon: , - description: "Step-by-step guides for deploying DataHub to production", - to: "/docs/deploy/aws", - }, - { - title: "Join the Community", - icon: , - description: "Collaborate, learn, and grow with us", - to: "/docs/slack", - }, -]; - -const gitLinkContent = [ - { - title: "datahub", - icon: , - to: "https://github.com/datahub-project/datahub", - }, - { - title: "datahub-actions", - icon: , - to: "https://github.com/acryldata/datahub-actions", - }, - { - title: "datahub-helm", - icon: , - to: "https://github.com/acryldata/datahub-helm", - }, - { - title: "meta-world", - icon: , - to: "https://github.com/acryldata/meta-world", - }, - { - title: "business-glossary-sync-action", - icon: , - to: "https://github.com/acryldata/business-glossary-sync-action", - }, - { - title: "dbt-impact-action", - icon: , - to: "https://github.com/acryldata/dbt-impact-action", - }, -]; - -function Docs() { 
- const context = useDocusaurusContext(); - const { siteConfig = {} } = context; - - return ( - -
-
-
-
-

Documentation

-

- Guides and tutorials for everything DataHub. -

- -
-
- - - - - -
-
-
- ); -} - -export default Docs; +export default Home; \ No newline at end of file diff --git a/docs/api/datahub-apis.md b/docs/api/datahub-apis.md index e9942cb19a4c9..252c96cab56c3 100644 --- a/docs/api/datahub-apis.md +++ b/docs/api/datahub-apis.md @@ -59,33 +59,61 @@ Get started with our Rest.li API DataHub supports several APIs, each with its own unique usage and format. Here's an overview of what each API can do. -> Last Updated : Apr 8 2023 - -| Feature | GraphQL | Python SDK | OpenAPI | -| ------------------------------------------------------- | ---------------------------------------------------------------------------- | ---------------------------------------------------------------------------- | ------- | -| Create a dataset | 🚫 | ✅ [[Guide]](/docs/api/tutorials/datasets.md) | ✅ | -| Delete a dataset (Soft delete) | ✅ [[Guide]](/docs/api/tutorials/datasets.md#delete-dataset) | ✅ [[Guide]](/docs/api/tutorials/datasets.md#delete-dataset) | ✅ | -| Delete a dataset (Hard delele) | 🚫 | ✅ [[Guide]](/docs/api/tutorials/datasets.md#delete-dataset) | ✅ | -| Search a dataset | ✅ | ✅ | ✅ | -| Create a tag | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ | -| Read a tag | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ | -| Add tags to a dataset | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ | -| Add tags to a column of a dataset | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ | -| Remove tags from a dataset | ✅ [[Guide]](/docs/api/tutorials/tags.md) | ✅ [[Guide]](/docs/api/tutorials/tags.md#add-tags) | ✅ | -| Create glossary terms | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ | -| Read terms from a dataset | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ | -| Add terms to a column of a dataset | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ | -| Add terms to a dataset | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ [[Guide]](/docs/api/tutorials/terms.md) | ✅ | -| Create domains | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ | -| Read domains | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ | -| Add domains to a dataset | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ | -| Remove domains from a dataset | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ [[Guide]](/docs/api/tutorials/domains.md) | ✅ | -| Crate users and groups | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ | -| Read owners of a dataset | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ | -| Add owner to a dataset | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ | -| Remove owner from a dataset | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ [[Guide]](/docs/api/tutorials/owners.md) | ✅ | -| Add lineage | ✅ [[Guide]](/docs/api/tutorials/lineage.md) | ✅ [[Guide]](/docs/api/tutorials/lineage.md) | ✅ | -| Add column level(Fine Grained) lineage | 🚫 | ✅ | ✅ | -| Add documentation(description) to a column of a dataset | ✅ [[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-column) | ✅ 
[[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-column) | ✅ | -| Add documentation(description) to a dataset | ✅ [[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-dataset) | ✅ [[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-dataset) | ✅ | -| Add / Remove / Replace custom properties on a dataset | 🚫 [[Guide]](/docs/api/tutorials/custom-properties.md) | ✅ [[Guide]](/docs/api/tutorials/custom-properties.md) | ✅ | +> Last Updated : Feb 16 2024 + +| Feature | GraphQL | Python SDK | OpenAPI | +|------------------------------------|------------------------------------------------------------------------------|------------------------------------------------------------------------------|---------| +| Create a Dataset | 🚫 | ✅ [[Guide]](/docs/api/tutorials/datasets.md) | ✅ | +| Delete a Dataset (Soft Delete) | ✅ [[Guide]](/docs/api/tutorials/datasets.md#delete-dataset) | ✅ [[Guide]](/docs/api/tutorials/datasets.md#delete-dataset) | ✅ | +| Delete a Dataset (Hard Delete) | 🚫 | ✅ [[Guide]](/docs/api/tutorials/datasets.md#delete-dataset) | ✅ | +| Search a Dataset | ✅ | ✅ | ✅ | +| Read a Dataset Deprecation | ✅ | ✅ | ✅ | +| Read Dataset Entities (V2) | ✅ | ✅ | ✅ | +| Create a Tag | ✅ [[Guide]](/docs/api/tutorials/tags.md#create-tags) | ✅ [[Guide]](/docs/api/tutorials/tags.md#create-tags) | ✅ | +| Read a Tag | ✅ [[Guide]](/docs/api/tutorials/tags.md#read-tags) | ✅ [[Guide]](/docs/api/tutorials/tags.md#read-tags) | ✅ | +| Add Tags to a Dataset | ✅ [[Guide]](/docs/api/tutorials/tags.md#add-tags-to-a-dataset) | ✅ [[Guide]](/docs/api/tutorials/tags.md#add-tags-to-a-dataset) | ✅ | +| Add Tags to a Column of a Dataset | ✅ [[Guide]](/docs/api/tutorials/tags.md#add-tags-to-a-column-of-a-dataset) | ✅ [[Guide]](/docs/api/tutorials/tags.md#add-tags-to-a-column-of-a-dataset) | ✅ | +| Remove Tags from a Dataset | ✅ [[Guide]](/docs/api/tutorials/tags.md#remove-tags) | ✅ [[Guide]](/docs/api/tutorials/tags.md#add-tags#remove-tags) | ✅ | +| Create Glossary Terms | ✅ [[Guide]](/docs/api/tutorials/terms.md#create-terms) | ✅ [[Guide]](/docs/api/tutorials/terms.md#create-terms) | ✅ | +| Read Terms from a Dataset | ✅ [[Guide]](/docs/api/tutorials/terms.md#read-terms) | ✅ [[Guide]](/docs/api/tutorials/terms.md#read-terms) | ✅ | +| Add Terms to a Column of a Dataset | ✅ [[Guide]](/docs/api/tutorials/terms.md#add-terms-to-a-column-of-a-dataset) | ✅ [[Guide]](/docs/api/tutorials/terms.md#add-terms-to-a-column-of-a-dataset) | ✅ | +| Add Terms to a Dataset | ✅ [[Guide]](/docs/api/tutorials/terms.md#add-terms-to-a-dataset) | ✅ [[Guide]](/docs/api/tutorials/terms.md#add-terms-to-a-dataset) | ✅ | +| Create Domains | ✅ [[Guide]](/docs/api/tutorials/domains.md#create-domain) | ✅ [[Guide]](/docs/api/tutorials/domains.md#create-domain) | ✅ | +| Read Domains | ✅ [[Guide]](/docs/api/tutorials/domains.md#read-domains) | ✅ [[Guide]](/docs/api/tutorials/domains.md#read-domains) | ✅ | +| Add Domains to a Dataset | ✅ [[Guide]](/docs/api/tutorials/domains.md#add-domains) | ✅ [[Guide]](/docs/api/tutorials/domains.md#add-domains) | ✅ | +| Remove Domains from a Dataset | ✅ [[Guide]](/docs/api/tutorials/domains.md#remove-domains) | ✅ [[Guide]](/docs/api/tutorials/domains.md#remove-domains) | ✅ | +| Create / Upsert Users | ✅ [[Guide]](/docs/api/tutorials/owners.md#upsert-users) | ✅ [[Guide]](/docs/api/tutorials/owners.md#upsert-users) | ✅ | +| Create / Upsert Group | ✅ [[Guide]](/docs/api/tutorials/owners.md#upsert-group) | ✅ [[Guide]](/docs/api/tutorials/owners.md#upsert-group) | ✅ | +| Read Owners of 
a Dataset | ✅ [[Guide]](/docs/api/tutorials/owners.md#read-owners) | ✅ [[Guide]](/docs/api/tutorials/owners.md#read-owners) | ✅ | +| Add Owner to a Dataset | ✅ [[Guide]](/docs/api/tutorials/owners.md#add-owners) | ✅ [[Guide]](/docs/api/tutorials/owners.md#add-owners) | ✅ | +| Remove Owner from a Dataset | ✅ [[Guide]](/docs/api/tutorials/owners.md#remove-owners) | ✅ [[Guide]](/docs/api/tutorials/owners.md#remove-owners) | ✅ | +| Add Lineage | ✅ [[Guide]](/docs/api/tutorials/lineage.md) | ✅ [[Guide]](/docs/api/tutorials/lineage.md#add-lineage) | ✅ | +| Add Column Level (Fine Grained) Lineage | 🚫 | ✅ [[Guide]](/docs/api/tutorials/lineage.md#add-column-level-lineage) | ✅ | +| Add Documentation (Description) to a Column of a Dataset | ✅ [[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-column) | ✅ [[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-column) | ✅ | +| Add Documentation (Description) to a Dataset | ✅ [[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-dataset) | ✅ [[Guide]](/docs/api/tutorials/descriptions.md#add-description-on-dataset) | ✅ | +| Add / Remove / Replace Custom Properties on a Dataset | 🚫 | ✅ [[Guide]](/docs/api/tutorials/custom-properties.md) | ✅ | +| Add ML Feature to ML Feature Table | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#add-mlfeature-to-mlfeaturetable) | ✅ | +| Add ML Feature to MLModel | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#add-mlfeature-to-mlmodel) | ✅ | +| Add ML Group to MLFeatureTable | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#add-mlgroup-to-mlfeaturetable) | ✅ | +| Create MLFeature | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#create-mlfeature) | ✅ | +| Create MLFeatureTable | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#create-mlfeaturetable) | ✅ | +| Create MLModel | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#create-mlmodel) | ✅ | +| Create MLModelGroup | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#create-mlmodelgroup) | ✅ | +| Create MLPrimaryKey | 🚫 | ✅ [[Guide]](/docs/api/tutorials/ml.md#create-mlprimarykey) | ✅ | +| Read MLFeature | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlfeature) | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlfeature) | ✅ | +| Read MLFeatureTable | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlfeaturetable) | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlfeaturetable) | ✅ | +| Read MLModel | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlmodel) | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlmodel) | ✅ | +| Read MLModelGroup | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlmodelgroup) | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlmodelgroup) | ✅ | +| Read MLPrimaryKey | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlprimarykey) | ✅ [[Guide]](/docs/api/tutorials/ml.md#read-mlprimarykey) | ✅ | +| Create Data Product | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/create_dataproduct.py) | ✅ | +| Create Lineage Between Chart and Dashboard | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_chart_dashboard.py) | ✅ | +| Create Lineage Between Dataset and Chart | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_dataset_chart.py) | ✅ | +| Create Lineage Between Dataset and DataJob | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_dataset_job_dataset.py) | ✅ 
| +| Create Finegrained Lineage as DataJob for Dataset | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_emitter_datajob_finegrained.py) | ✅ | +| Create Finegrained Lineage for Dataset | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained.py) | ✅ | +| Create Dataset Lineage with Kafka | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_emitter_kafka.py) | ✅ | +| Create Dataset Lineage with MCPW & Rest Emitter | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_emitter_mcpw_rest.py) | ✅ | +| Create Dataset Lineage with Rest Emitter | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_emitter_rest.py) | ✅ | +| Create DataJob with Dataflow | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_job_dataflow.py) [[Simple]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py) [[Verbose]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_verbose.py) | ✅ | +| Create Programmatic Pipeline | 🚫 | ✅ [[Code]](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/library/programatic_pipeline.py) | ✅ | \ No newline at end of file diff --git a/docs/features.md b/docs/features.md index 9ce85d83ee54a..cac822aba06c5 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1,3 +1,9 @@ +--- +hide_title: true +slug: / +--- + + import QuickstartCards from '@site/src/pages/docs/_components/QuickstartCards'; import FeatureCardSection from '@site/src/pages/docs/_components/FeatureCardSection'; diff --git a/docs/managed-datahub/release-notes/v_0_2_16.md b/docs/managed-datahub/release-notes/v_0_2_16.md new file mode 100644 index 0000000000000..29d7aa762ec00 --- /dev/null +++ b/docs/managed-datahub/release-notes/v_0_2_16.md @@ -0,0 +1,16 @@ +# v0.2.16 +--- + +Release Availability Date +--- +18-Mar-2024 + +Recommended CLI/SDK +--- +- `v0.13.1` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.13.1 + +If you are using an older CLI/SDK version, please upgrade it. This applies to all CLI/SDK usage, whether through your terminal, GitHub Actions, Airflow, the Python SDK, the Java SDK, etc. We strongly recommend upgrading, as we keep pushing fixes to the CLI and it helps us support you better. + +## Release Changelog +--- +- Since `v0.2.15` these changes from OSS DataHub https://github.com/datahub-project/datahub/compare/92850ac55625f3fbee6cdd8699970b43c18a6f58...55bc955304c4c192c04a0393a47355a295f5770a have been pulled in. 
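For quick reference alongside the feature matrix above, the following is a minimal, hypothetical sketch of the Python SDK path for "Add Tags to a Dataset" (it is not part of this diff). The server URL, dataset coordinates, and tag name are placeholder assumptions; the linked tags tutorial remains the authoritative guide.

```python
# Illustrative sketch only: attach a tag to a dataset via the DataHub Python SDK.
# The GMS URL, dataset identity, and tag name below are assumptions for the example.
from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
dataset_urn = make_dataset_urn(platform="hive", name="SampleHiveDataset", env="PROD")

# Note: emitting GlobalTags this way upserts the whole aspect, replacing any existing tags.
tags_aspect = GlobalTagsClass(tags=[TagAssociationClass(tag=make_tag_urn("pii"))])
emitter.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=tags_aspect))
```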
diff --git a/metadata-ingestion/docs/dev_guides/classification.md b/metadata-ingestion/docs/dev_guides/classification.md index 04318d06bca71..be7e1e5013318 100644 --- a/metadata-ingestion/docs/dev_guides/classification.md +++ b/metadata-ingestion/docs/dev_guides/classification.md @@ -28,25 +28,25 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d ### Config Details -| Field | Required | Type | Description | Default | -| ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| confidence_level_threshold | | number | | 0.68 | -| strip_exclusion_formatting | | bool | A flag that determines whether the exclusion list uses exact matching or format stripping (case-insensitivity, punctuation removal, and special character removal). | True | -| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None | -| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. | -| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | | -| info_types_config.`key`.exclude_name | | list[string] | Optional list of names to exclude from classification. | None | -| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | | -| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] | -| info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | | -| info_types_config.`key`.description.regex | | Array of string | List of regex patterns the column description follows for the info type | ['.*'] | -| info_types_config.`key`.datatype | | DataTypeFactorConfig (see below for fields) | | | -| info_types_config.`key`.datatype.type | | Array of string | List of data types for the info type | ['.*'] | -| info_types_config.`key`.values | | ValuesFactorConfig (see below for fields) | | | -| info_types_config.`key`.values.prediction_type | ❓ (required if info_types_config.`key`.values is set) | string | | None | -| info_types_config.`key`.values.regex | | Array of string | List of regex patterns the column value follows for the info type | None | -| info_types_config.`key`.values.library | | Array of string | Library used for prediction | None | -| minimum_values_threshold | | number | Minimum number of non-null column values required to process `values` prediction factor. 
| 50 | +| Field | Required | Type | Description | Default | +| ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| confidence_level_threshold | | number | | 0.68 | +| strip_exclusion_formatting | | bool | A flag that determines whether the exclusion list uses exact matching or format stripping (case-insensitivity, punctuation removal, and special character removal). | True | +| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None | +| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. | +| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | | +| info_types_config.`key`.exclude_name | | list[string] | Optional list of names to exclude from classification. | None | +| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | | +| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] | +| info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | | +| info_types_config.`key`.description.regex | | Array of string | List of regex patterns the column description follows for the info type | ['.*'] | +| info_types_config.`key`.datatype | | DataTypeFactorConfig (see below for fields) | | | +| info_types_config.`key`.datatype.type | | Array of string | List of data types for the info type | ['.*'] | +| info_types_config.`key`.values | | ValuesFactorConfig (see below for fields) | | | +| info_types_config.`key`.values.prediction_type | ❓ (required if info_types_config.`key`.values is set) | string | | None | +| info_types_config.`key`.values.regex | | Array of string | List of regex patterns the column value follows for the info type | None | +| info_types_config.`key`.values.library | | Array of string | Library used for prediction | None | +| minimum_values_threshold | | number | Minimum number of non-null column values required to process `values` prediction factor. 
| 50 | | | ### Supported infotypes - `Email_Address` @@ -63,12 +63,20 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d - `IP_Address_v6` - `US_Driving_License_Number` - `Swift_Code` +- Regex based Custom InfoTypes -### Supported sources +## Supported sources -* snowflake +- All SQL sources -#### Example +## Future Work + +- Classification for nested columns (struct, array type) + + +## Examples + +### Basic ```yml source: @@ -94,7 +102,7 @@ source: - type: datahub ``` -#### Example with Advanced Configuration: Customizing configuration for supported info types +### Advanced Configuration: Customizing configuration for supported info types ```yml source: @@ -399,7 +407,7 @@ source: ``` -#### Example with Advanced Configuration: Specifying custom info type +### Advanced Configuration: Specifying Custom InfoType ```yml source: @@ -438,3 +446,10 @@ source: regex: - "(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\\d+" library: [] +``` + +## Additional Resources + +### DataHub Blog + +* [PII Classification just got easier with DataHub](https://blog.datahubproject.io/pii-classification-just-got-easier-with-datahub-6bab2b63abcb) \ No newline at end of file diff --git a/metadata-ingestion/examples/recipes/file_to_datahub-jobs-golden.dhub.yaml b/metadata-ingestion/examples/recipes/file_to_datahub-jobs-golden.dhub.yaml new file mode 100644 index 0000000000000..bdad337b607de --- /dev/null +++ b/metadata-ingestion/examples/recipes/file_to_datahub-jobs-golden.dhub.yaml @@ -0,0 +1,11 @@ +--- +# see https://datahubproject.io/docs/generated/ingestion/sources/file for complete documentation +source: + type: "file" + config: + filename: ./examples/test_examples/via_node_test_example_fivetran.json +# see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation +sink: + type: "datahub-rest" + config: + server: "http://localhost:8080" diff --git a/metadata-ingestion/examples/test_examples/via_node_test_example_fivetran.json b/metadata-ingestion/examples/test_examples/via_node_test_example_fivetran.json new file mode 100644 index 0000000000000..886ad2e2005ca --- /dev/null +++ b/metadata-ingestion/examples/test_examples/via_node_test_example_fivetran.json @@ -0,0 +1,731 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "name": "postgres" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + 
"changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "paused": "False", + "sync_frequency": "1440", + "destination_id": "'interval_unconstitutional'" + }, + "name": "postgres", + "type": { + "string": "COMMAND" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [{ + "fieldPath": "id", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER(asdecimal=False)", + "recursive": false, + "isPartOfKey": true + }] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [{ + "fieldPath": "id", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER(asdecimal=False)", + "recursive": false, + "isPartOfKey": true + }] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD),field_bar)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } + }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD),field_foo_2)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD),field_bar)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, + +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:Shubham Jagtap", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "4c9a03d6-eded-4422-a46a-163266e58243", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1695191853000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": 
"dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,test.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:bigquery,test.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1695191853000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1695191885000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1696343730000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + 
"entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,test.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:bigquery,test.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343730000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343732000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SKIPPED", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "63c2fc85-600b-455f-9ba0-f576522465be", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1696343755000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,test.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:bigquery,test.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343755000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343790000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 5873b7ac25c09..5570893b7d1df 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -99,7 +99,7 @@ sqlglot_lib = { # Using an Acryl fork of sqlglot. 
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 - "acryl-sqlglot==22.3.1.dev3", + "acryl-sqlglot==22.4.1.dev4", } classification_lib = { @@ -302,7 +302,8 @@ | { *sqlglot_lib, "google-cloud-datacatalog-lineage==0.2.2", - }, + } + | classification_lib, "clickhouse": sql_common | clickhouse_common, "clickhouse-usage": sql_common | usage_common | clickhouse_common, "datahub-lineage-file": set(), @@ -370,6 +371,8 @@ | redshift_common | usage_common | sqlglot_lib + | classification_lib + | {"db-dtypes"} # Pandas extension data types | {"cachetools"}, "s3": {*s3_base, *data_lake_profiling}, "gcs": {*s3_base, *data_lake_profiling}, diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index 419ae5668292d..6c5db13608414 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -1,4 +1,7 @@ +import dataclasses +import json import logging +import pathlib import pprint import shutil import tempfile @@ -17,6 +20,7 @@ from datahub.ingestion.source.source_registry import source_registry from datahub.ingestion.transformer.transform_registry import transform_registry from datahub.telemetry import telemetry +from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList logger = logging.getLogger(__name__) @@ -339,3 +343,28 @@ def test_path_spec(config: str, input: str, path_spec_key: str) -> None: f"Failed to validate pattern {pattern_dicts} in path {path_spec_key}" ) raise e + + +@check.command() +@click.argument("query-log-file", type=click.Path(exists=True, dir_okay=False)) +@click.option("--output", type=click.Path()) +def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None: + """Convert a sqlite db generated by the SqlParsingAggregator into a JSON.""" + + from datahub.sql_parsing.sql_parsing_aggregator import LoggedQuery + + assert dataclasses.is_dataclass(LoggedQuery) + + shared_connection = ConnectionWrapper(pathlib.Path(query_log_file)) + query_log = FileBackedList[LoggedQuery]( + shared_connection=shared_connection, tablename="stored_queries" + ) + logger.info(f"Extracting {len(query_log)} queries from {query_log_file}") + queries = [dataclasses.asdict(query) for query in query_log] + + if output: + with open(output, "w") as f: + json.dump(queries, f, indent=2, default=str) + logger.info(f"Extracted {len(queries)} queries to {output}") + else: + click.echo(json.dumps(queries, indent=2)) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 906a431666e17..d299f1009d51a 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -57,6 +57,7 @@ class SourceCapability(Enum): TAGS = "Extract Tags" SCHEMA_METADATA = "Schema Metadata" CONTAINERS = "Asset Containers" + CLASSIFICATION = "Classification" @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py index c6c95e76d196f..c0de827b21131 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py @@ -1,16 +1,20 @@ import concurrent.futures import logging from dataclasses import dataclass, field +from functools import partial from math import ceil -from typing import Dict, Iterable, List, 
Optional +from typing import Callable, Dict, Iterable, List, Optional, Union from datahub_classify.helper_classes import ColumnInfo, Metadata from pydantic import Field from datahub.configuration.common import ConfigModel, ConfigurationError from datahub.emitter.mce_builder import get_sys_time, make_term_urn, make_user_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.glossary.classifier import ClassificationConfig, Classifier from datahub.ingestion.glossary.classifier_registry import classifier_registry +from datahub.ingestion.source.sql.data_reader import DataReader from datahub.metadata.com.linkedin.pegasus2avro.common import ( AuditStamp, GlossaryTermAssociation, @@ -25,9 +29,12 @@ @dataclass class ClassificationReportMixin: + + num_tables_fetch_sample_values_failed: int = 0 + num_tables_classification_attempted: int = 0 num_tables_classification_failed: int = 0 - num_tables_classified: int = 0 + num_tables_classification_found: int = 0 info_types_detected: LossyDict[str, LossyList[str]] = field( default_factory=LossyDict @@ -99,8 +106,22 @@ def classify_schema_fields( self, dataset_name: str, schema_metadata: SchemaMetadata, - sample_data: Dict[str, list], + sample_data: Union[Dict[str, list], Callable[[], Dict[str, list]]], ) -> None: + + if not isinstance(sample_data, Dict): + try: + # TODO: In future, the sample_data fetcher can be called lazily only if classification + # requires values as a prediction factor + sample_data = sample_data() + except Exception as e: + self.report.num_tables_fetch_sample_values_failed += 1 + logger.warning( + f"Failed to get sample values for dataset {dataset_name}. Make sure SELECT permission is granted on the dataset.", + ) + sample_data = dict() + logger.debug("Error", exc_info=e) + column_infos = self.get_columns_to_classify( dataset_name, schema_metadata, sample_data ) @@ -137,7 +158,7 @@ def classify_schema_fields( ) if field_terms: - self.report.num_tables_classified += 1 + self.report.num_tables_classification_found += 1 self.populate_terms_in_schema_metadata(schema_metadata, field_terms) def update_field_terms( @@ -234,8 +255,11 @@ def get_columns_to_classify( ) continue - # TODO: Let's auto-skip passing sample_data for complex(array/struct) columns - # for initial rollout + # As a result of the custom field path specification, e.g. [version=2.0].[type=struct].[type=struct].service, + # sample values for a nested field (an array, union, or struct) are not read or passed to the classifier correctly. + # TODO: Fix this behavior for nested fields. This would probably involve: + # 1. Preprocessing field path spec v2 back to the native field representation (without [*] constructs). + # 2. Preprocessing the retrieved structured sample data to pass sample values correctly for nested fields.
column_infos.append( ColumnInfo( @@ -256,3 +280,47 @@ def get_columns_to_classify( ) return column_infos + + +def classification_workunit_processor( + table_wu_generator: Iterable[MetadataWorkUnit], + classification_handler: ClassificationHandler, + data_reader: Optional[DataReader], + table_id: List[str], + data_reader_kwargs: dict = {}, +) -> Iterable[MetadataWorkUnit]: + table_name = ".".join(table_id) + if not classification_handler.is_classification_enabled_for_table(table_name): + yield from table_wu_generator + for wu in table_wu_generator: + maybe_schema_metadata = wu.get_aspect_of_type(SchemaMetadata) + if maybe_schema_metadata: + try: + classification_handler.classify_schema_fields( + table_name, + maybe_schema_metadata, + ( + partial( + data_reader.get_sample_data_for_table, + table_id, + classification_handler.config.classification.sample_size + * 1.2, + **data_reader_kwargs, + ) + if data_reader + else dict() + ), + ) + yield MetadataChangeProposalWrapper( + aspect=maybe_schema_metadata, entityUrn=wu.get_urn() + ).as_workunit( + is_primary_source=wu.is_primary_source, + ) + except Exception as e: + logger.debug( + f"Failed to classify table columns for {table_name} due to error -> {e}", + exc_info=e, + ) + yield wu + else: + yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index bcc0aa50ed22e..8452399bddf5d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -35,11 +35,16 @@ TestConnectionReport, ) from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.glossary.classification_mixin import ( + ClassificationHandler, + classification_workunit_processor, +) from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigqueryTableIdentifier, BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_data_reader import BigQueryDataReader from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( unquote_and_decode_unicode_escape_seq, ) @@ -167,6 +172,11 @@ def cleanup(config: BigQueryV2Config) -> None: "Optionally enabled via `stateful_ingestion.remove_stale_metadata`", supported=True, ) +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types BIGQUERY_FIELD_TYPE_MAPPINGS: Dict[ @@ -214,6 +224,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): super(BigqueryV2Source, self).__init__(config, ctx) self.config: BigQueryV2Config = config self.report: BigQueryV2Report = BigQueryV2Report() + self.classification_handler = ClassificationHandler(self.config, self.report) self.platform: str = "bigquery" BigqueryTableIdentifier._BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX = ( @@ -227,6 +238,12 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): ) self.sql_parser_schema_resolver = self._init_schema_resolver() + self.data_reader: Optional[BigQueryDataReader] = None + if self.classification_handler.is_classification_enabled(): + self.data_reader = BigQueryDataReader.create( + self.config.get_bigquery_client() + ) + redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] 
= None @@ -713,6 +730,7 @@ def _process_schema( ) columns = None + if ( self.config.include_tables or self.config.include_views @@ -732,12 +750,27 @@ def _process_schema( for table in db_tables[dataset_name]: table_columns = columns.get(table.name, []) if columns else [] - yield from self._process_table( + table_wu_generator = self._process_table( table=table, columns=table_columns, project_id=project_id, dataset_name=dataset_name, ) + yield from classification_workunit_processor( + table_wu_generator, + self.classification_handler, + self.data_reader, + [project_id, dataset_name, table.name], + data_reader_kwargs=dict( + sample_size_percent=( + self.config.classification.sample_size + * 1.2 + / table.rows_count + if table.rows_count + else None + ) + ), + ) elif self.store_table_refs: # Need table_refs to calculate lineage and usage for table_item in self.bigquery_data_dictionary.list_tables( @@ -1071,14 +1104,16 @@ def gen_dataset_workunits( ) yield self.gen_schema_metadata( - dataset_urn, table, columns, str(datahub_dataset_name) + dataset_urn, table, columns, datahub_dataset_name ) dataset_properties = DatasetProperties( name=datahub_dataset_name.get_table_display_name(), - description=unquote_and_decode_unicode_escape_seq(table.comment) - if table.comment - else "", + description=( + unquote_and_decode_unicode_escape_seq(table.comment) + if table.comment + else "" + ), qualifiedName=str(datahub_dataset_name), created=( TimeStamp(time=int(table.created.timestamp() * 1000)) @@ -1238,10 +1273,10 @@ def gen_schema_metadata( dataset_urn: str, table: Union[BigqueryTable, BigqueryView, BigqueryTableSnapshot], columns: List[BigqueryColumn], - dataset_name: str, + dataset_name: BigqueryTableIdentifier, ) -> MetadataWorkUnit: schema_metadata = SchemaMetadata( - schemaName=dataset_name, + schemaName=str(dataset_name), platform=make_data_platform_urn(self.platform), version=0, hash="", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 2f4978d49e687..28f0be2c38033 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -10,6 +10,9 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.validate_field_removal import pydantic_removed_field +from datahub.ingestion.glossary.classification_mixin import ( + ClassificationSourceConfigMixin, +) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulLineageConfigMixin, @@ -64,9 +67,9 @@ def __init__(self, **data: Any): ) os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path - def get_bigquery_client(config) -> bigquery.Client: - client_options = config.extra_client_options - return bigquery.Client(config.project_on_behalf, **client_options) + def get_bigquery_client(self) -> bigquery.Client: + client_options = self.extra_client_options + return bigquery.Client(self.project_on_behalf, **client_options) def make_gcp_logging_client( self, project_id: Optional[str] = None @@ -96,6 +99,7 @@ class BigQueryV2Config( StatefulUsageConfigMixin, StatefulLineageConfigMixin, StatefulProfilingConfigMixin, + ClassificationSourceConfigMixin, ): project_id_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py new file mode 100644 index 0000000000000..37dfd14ce125e --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py @@ -0,0 +1,72 @@ +import logging +from collections import defaultdict +from typing import Dict, List, Optional + +from google.cloud import bigquery + +from datahub.ingestion.source.sql.data_reader import DataReader +from datahub.utilities.perf_timer import PerfTimer + +logger = logging.Logger(__name__) + + +class BigQueryDataReader(DataReader): + @staticmethod + def create( + client: bigquery.Client, + ) -> "BigQueryDataReader": + return BigQueryDataReader(client) + + def __init__( + self, + client: bigquery.Client, + ) -> None: + self.client = client + + def get_sample_data_for_table( + self, + table_id: List[str], + sample_size: int, + *, + sample_size_percent: Optional[float] = None, + filter: Optional[str] = None, + ) -> Dict[str, list]: + """ + table_id should be in the form [project, dataset, schema] + """ + + assert len(table_id) == 3 + project = table_id[0] + dataset = table_id[1] + table_name = table_id[2] + + column_values: Dict[str, list] = defaultdict(list) + if sample_size_percent is None: + return column_values + # Ideally we always know the actual row count. + # The alternative to perform limit query scans entire BQ table + # and is never a recommended option due to cost factor, unless + # additional filter clause (e.g. where condition on partition) is available. + + logger.debug( + f"Collecting sample values for table {project}.{dataset}.{table_name}" + ) + with PerfTimer() as timer: + sample_pc = sample_size_percent * 100 + # TODO: handle for sharded+compulsory partitioned tables + sql = ( + f"SELECT * FROM `{project}.{dataset}.{table_name}` " + + f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)" + ) + # Ref: https://cloud.google.com/bigquery/docs/samples/bigquery-query-results-dataframe + df = self.client.query_and_wait(sql).to_dataframe() + time_taken = timer.elapsed_seconds() + logger.debug( + f"Finished collecting sample values for table {project}.{dataset}.{table_name};" + f"{df.shape[0]} rows; took {time_taken:.3f} seconds" + ) + + return df.to_dict(orient="list") + + def close(self) -> None: + self.client.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index ad7b86219e7c1..54eca61dfe1c9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -7,6 +7,7 @@ import pydantic from datahub.ingestion.api.report import Report +from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport @@ -42,7 +43,12 @@ class BigQueryProcessingPerfReport(Report): @dataclass -class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): +class BigQueryV2Report( + ProfilingSqlReport, + IngestionStageReport, + BaseTimeWindowReport, + ClassificationReportMixin, +): num_total_lineage_entries: TopKDict[str, int] = 
field(default_factory=TopKDict) num_skipped_lineage_entries_missing_data: TopKDict[str, int] = field( default_factory=int_top_k_dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index 4083eb6db77c1..dbaf28fabc9d4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -91,7 +91,7 @@ def generate_partition_profiler_query( ) else: logger.warning( - f"Partitioned table {table.name} without partiton column" + f"Partitioned table {table.name} without partition column" ) self.report.profiling_skipped_invalid_partition_ids[ f"{project}.{schema}.{table.name}" diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 6a642e6566132..27b44bafc4b4e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -9,6 +9,9 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import DatasetLineageProviderConfigBase from datahub.configuration.validate_field_removal import pydantic_removed_field +from datahub.ingestion.glossary.classification_mixin import ( + ClassificationSourceConfigMixin, +) from datahub.ingestion.source.data_lake_common.path_spec import PathSpec from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -70,6 +73,7 @@ class RedshiftConfig( RedshiftUsageConfig, StatefulLineageConfigMixin, StatefulProfilingConfigMixin, + ClassificationSourceConfigMixin, ): database: str = Field(default="dev", description="database") diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index 1a38f11a52449..1c7d275c34867 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -455,49 +455,70 @@ def list_insert_create_queries_sql( db_name: str, start_time: datetime, end_time: datetime ) -> str: return """ - select - distinct cluster, - target_schema, - target_table, - username, - query as query_id, - LISTAGG(CASE WHEN LEN(RTRIM(querytxt)) = 0 THEN querytxt ELSE RTRIM(querytxt) END) WITHIN GROUP (ORDER BY sequence) as ddl, - ANY_VALUE(pid) as session_id, - starttime as timestamp - from - ( + with query_txt as + ( select - distinct tbl as target_table_id, - sti.schema as target_schema, - sti.table as target_table, - sti.database as cluster, - usename as username, - text as querytxt, - sq.query, - sequence, - si.starttime as starttime, - pid + query, + pid, + LISTAGG(case + when LEN(RTRIM(text)) = 0 then text + else RTRIM(text) + end) within group ( + order by + sequence) as ddl from - stl_insert as si - join SVV_TABLE_INFO sti on - sti.table_id = tbl - left join svl_user_info sui on - si.userid = sui.usesysid - left join STL_QUERYTEXT sq on - si.query = sq.query - left join stl_load_commits slc on - slc.query = si.query - where + ( + select + query, + pid, + text, + sequence + from + STL_QUERYTEXT + where + sequence < 320 + order by + sequence + ) + group by + query, + pid + ) + select + distinct tbl as target_table_id, + sti.schema as target_schema, + sti.table as target_table, + 
sti.database as cluster, + usename as username, + ddl, + sq.query as query_id, + min(si.starttime) as starttime, + ANY_VALUE(pid) as session_id + from + stl_insert as si + left join SVV_TABLE_INFO sti on + sti.table_id = tbl + left join svl_user_info sui on + si.userid = sui.usesysid + left join query_txt sq on + si.query = sq.query + left join stl_load_commits slc on + slc.query = si.query + where sui.usename <> 'rdsdb' - and slc.query IS NULL and cluster = '{db_name}' + and slc.query IS NULL and si.starttime >= '{start_time}' and si.starttime < '{end_time}' - and sequence < 320 - ) as target_tables - group by cluster, query_id, target_schema, target_table, username, starttime - order by cluster, query_id, target_schema, target_table, starttime asc - """.format( + group by + target_table_id, + target_schema, + target_table, + cluster, + username, + ddl, + sq.query + """.format( # We need the original database name for filtering db_name=db_name, start_time=start_time.strftime(redshift_datetime_format), @@ -551,7 +572,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\n','\\n'), '(CREATE(?:[\\n\\s\\t]+(?:temp|temporary))?(?:[\\n\\s\\t]+)table(?:[\\n\\s\\t]+)[^\\n\\s\\t()-]+)', 0, 1, 'ipe'),'[\\n\\s\\t]+',' ',1,'p') as create_command, query_text, row_number() over ( - partition by TRIM(query_text) + partition by session_id, TRIM(query_text) order by start_time desc ) rn from @@ -615,7 +636,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: ) where - rn = 1; + rn = 1 """ # Add this join to the sql query for more metrics on completed queries @@ -936,6 +957,8 @@ def list_copy_commands_sql( # also similar happens if for example table name contains special characters quoted with " i.e. "test-table1" # it is also worth noting that "query_type" field from SYS_QUERY_HISTORY could be probably used to improve many # of complicated queries in this file + # However, note that we can't really use this query fully everywhere, despite it being simpler, because + # the SYS_QUERY_TEXT.text field is truncated to 4000 characters and strips out linebreaks. 
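Context for the query_txt CTE introduced above: STL_QUERYTEXT stores each statement as a series of 200-character chunks keyed by (query, pid) and ordered by the sequence column, so the DDL has to be stitched back together; the CASE/RTRIM only trims chunks that are not entirely blank. A minimal Python sketch of the same reassembly logic, using hypothetical rows shaped like STL_QUERYTEXT output:

from collections import defaultdict

# Hypothetical STL_QUERYTEXT-style rows: one statement split into ordered chunks.
rows = [
    {"query": 42, "pid": 7, "sequence": 0, "text": "INSERT INTO public.company SELECT * FR"},
    {"query": 42, "pid": 7, "sequence": 1, "text": "OM staging.company"},
]

def reassemble(rows):
    chunks = defaultdict(list)
    for row in sorted(rows, key=lambda r: (r["query"], r["pid"], r["sequence"])):
        text = row["text"]
        # Mirror the SQL CASE: keep fully-blank chunks as-is, otherwise strip trailing padding.
        chunks[(row["query"], row["pid"])].append(text if len(text.rstrip()) == 0 else text.rstrip())
    return {key: "".join(parts) for key, parts in chunks.items()}

print(reassemble(rows))  # {(42, 7): 'INSERT INTO public.company SELECT * FROM staging.company'}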
@staticmethod def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: start_time_str: str = start_time.strftime(redshift_datetime_format) @@ -955,7 +978,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: query_text, REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\n','\\n'), '(CREATE(?:[\\n\\s\\t]+(?:temp|temporary))?(?:[\\n\\s\\t]+)table(?:[\\n\\s\\t]+)[^\\n\\s\\t()-]+)', 0, 1, 'ipe'),'[\\n\\s\\t]+',' ',1,'p') AS create_command, ROW_NUMBER() OVER ( - PARTITION BY query_text + PARTITION BY session_id, query_text ORDER BY start_time DESC ) rn FROM @@ -990,6 +1013,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: ) WHERE rn = 1 + ORDER BY start_time ASC ; """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index ef290518acd08..b893c0833954d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -35,6 +35,10 @@ ) from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.glossary.classification_mixin import ( + ClassificationHandler, + classification_workunit_processor, +) from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -43,6 +47,7 @@ from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2 from datahub.ingestion.source.redshift.profile import RedshiftProfiler +from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader from datahub.ingestion.source.redshift.redshift_schema import ( RedshiftColumn, RedshiftDataDictionary, @@ -52,6 +57,7 @@ ) from datahub.ingestion.source.redshift.report import RedshiftReport from datahub.ingestion.source.redshift.usage import RedshiftUsageExtractor +from datahub.ingestion.source.sql.data_reader import DataReader from datahub.ingestion.source.sql.sql_common import SqlWorkUnit from datahub.ingestion.source.sql.sql_types import resolve_postgres_modified_type from datahub.ingestion.source.sql.sql_utils import ( @@ -127,6 +133,11 @@ "Enabled by default, can be disabled via configuration `include_usage_statistics`", ) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class RedshiftSource(StatefulIngestionSourceBase, TestableSource): """ This plugin extracts the following: @@ -313,6 +324,7 @@ def __init__(self, config: RedshiftConfig, ctx: PipelineContext): self.catalog_metadata: Dict = {} self.config: RedshiftConfig = config self.report: RedshiftReport = RedshiftReport() + self.classification_handler = ClassificationHandler(self.config, self.report) self.platform = "redshift" self.domain_registry = None if self.config.domain: @@ -490,6 +502,15 @@ def process_schemas(self, connection, database): self.db_schemas[database][schema.name] = schema yield from self.process_schema(connection, database, schema) + def make_data_reader( + self, + connection: redshift_connector.Connection, + ) -> Optional[DataReader]: + if self.classification_handler.is_classification_enabled(): + return RedshiftDataReader.create(connection) + + return None + def 
process_schema( self, connection: redshift_connector.Connection, @@ -529,6 +550,7 @@ def process_schema( ) if self.config.include_tables: + data_reader = self.make_data_reader(connection) logger.info(f"Process tables in schema {database}.{schema.name}") if ( self.db_tables[schema.database] @@ -536,7 +558,15 @@ def process_schema( ): for table in self.db_tables[schema.database][schema.name]: table.columns = schema_columns[schema.name].get(table.name, []) - yield from self._process_table(table, database=database) + table_wu_generator = self._process_table( + table, database=database + ) + yield from classification_workunit_processor( + table_wu_generator, + self.classification_handler, + data_reader, + [schema.database, schema.name, table.name], + ) self.report.table_processed[report_key] = ( self.report.table_processed.get( f"{database}.{schema.name}", 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_data_reader.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_data_reader.py new file mode 100644 index 0000000000000..5b92cf5c45688 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_data_reader.py @@ -0,0 +1,48 @@ +import logging +from typing import Any, Dict, List + +import redshift_connector + +from datahub.ingestion.source.sql.data_reader import DataReader +from datahub.utilities.perf_timer import PerfTimer + +logger = logging.Logger(__name__) + + +class RedshiftDataReader(DataReader): + @staticmethod + def create(conn: redshift_connector.Connection) -> "RedshiftDataReader": + return RedshiftDataReader(conn) + + def __init__(self, conn: redshift_connector.Connection) -> None: + # The lifecycle of this connection is managed externally + self.conn = conn + + def get_sample_data_for_table( + self, table_id: List[str], sample_size: int, **kwargs: Any + ) -> Dict[str, list]: + """ + For redshift, table_id should be in form (db_name, schema_name, table_name) + """ + assert len(table_id) == 3 + db_name = table_id[0] + schema_name = table_id[1] + table_name = table_id[2] + + logger.debug( + f"Collecting sample values for table {db_name}.{schema_name}.{table_name}" + ) + with PerfTimer() as timer, self.conn.cursor() as cursor: + sql = f"select * from {db_name}.{schema_name}.{table_name} limit {sample_size};" + cursor.execute(sql) + df = cursor.fetch_dataframe() + # Fetch the result set from the cursor and deliver it as the Pandas DataFrame. 
+ time_taken = timer.elapsed_seconds() + logger.debug( + f"Finished collecting sample values for table {db_name}.{schema_name}.{table_name};" + f"{df.shape[0]} rows; took {time_taken:.3f} seconds" + ) + return df.to_dict(orient="list") + + def close(self) -> None: + pass diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py index 586771c4fb7b5..f3a8dfb8f3e85 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py @@ -107,6 +107,12 @@ class AlterTableRow: start_time: datetime +def _stringy(x: Optional[int]) -> Optional[str]: + if x is None: + return None + return str(x) + + # this is a class to be a proxy to query Redshift class RedshiftDataDictionary: def __init__(self, is_serverless): @@ -419,9 +425,8 @@ def get_lineage_rows( else None ), session_id=( - str(row[field_names.index("session_id")]) + _stringy(row[field_names.index("session_id")]) if "session_id" in field_names - and row[field_names.index("session_id")] else None ), ) @@ -441,9 +446,13 @@ def get_temporary_rows( rows = cursor.fetchmany() while rows: for row in rows: + # Skipping roews with no session_id + session_id = _stringy(row[field_names.index("session_id")]) + if session_id is None: + continue yield TempTableRow( transaction_id=row[field_names.index("transaction_id")], - session_id=row[field_names.index("session_id")], + session_id=session_id, # See https://docs.aws.amazon.com/redshift/latest/dg/r_STL_QUERYTEXT.html # for why we need to replace the \n with a newline. query_text=row[field_names.index("query_text")].replace( @@ -468,9 +477,12 @@ def get_alter_table_commands( rows = cursor.fetchmany() while rows: for row in rows: + session_id = _stringy(row[field_names.index("session_id")]) + if session_id is None: + continue yield AlterTableRow( transaction_id=row[field_names.index("transaction_id")], - session_id=row[field_names.index("session_id")], + session_id=session_id, query_text=row[field_names.index("query_text")], start_time=row[field_names.index("start_time")], ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py index 6c2a12498f2c0..e2a035091d0ad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py @@ -2,6 +2,7 @@ from datetime import datetime from typing import Dict, Optional +from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport @@ -11,7 +12,12 @@ @dataclass -class RedshiftReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): +class RedshiftReport( + ProfilingSqlReport, + IngestionStageReport, + BaseTimeWindowReport, + ClassificationReportMixin, +): num_usage_workunits_emitted: Optional[int] = None num_operational_stats_workunits_emitted: Optional[int] = None upstream_lineage: LossyDict = field(default_factory=LossyDict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py new file 
mode 100644 index 0000000000000..afb8cca707160 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py @@ -0,0 +1,57 @@ +import logging +from typing import Any, Callable, Dict, List + +import pandas as pd +from snowflake.connector import SnowflakeConnection + +from datahub.ingestion.source.sql.data_reader import DataReader +from datahub.utilities.perf_timer import PerfTimer + +logger = logging.Logger(__name__) + + +class SnowflakeDataReader(DataReader): + @staticmethod + def create( + conn: SnowflakeConnection, col_name_preprocessor: Callable[[str], str] + ) -> "SnowflakeDataReader": + return SnowflakeDataReader(conn, col_name_preprocessor) + + def __init__( + self, conn: SnowflakeConnection, col_name_preprocessor: Callable[[str], str] + ) -> None: + # The lifecycle of this connection is managed externally + self.conn = conn + self.col_name_preprocessor = col_name_preprocessor + + def get_sample_data_for_table( + self, table_id: List[str], sample_size: int, **kwargs: Any + ) -> Dict[str, list]: + """ + For snowflake, table_id should be in form (db_name, schema_name, table_name) + """ + + assert len(table_id) == 3 + db_name = table_id[0] + schema_name = table_id[1] + table_name = table_id[2] + + logger.debug( + f"Collecting sample values for table {db_name}.{schema_name}.{table_name}" + ) + with PerfTimer() as timer, self.conn.cursor() as cursor: + sql = f'select * from "{db_name}"."{schema_name}"."{table_name}" sample ({sample_size} rows);' + cursor.execute(sql) + dat = cursor.fetchall() + # Fetch the result set from the cursor and deliver it as the Pandas DataFrame. + df = pd.DataFrame(dat, columns=[col.name for col in cursor.description]) + df.columns = [self.col_name_preprocessor(col) for col in df.columns] + time_taken = timer.elapsed_seconds() + logger.debug( + f"Finished collecting sample values for table {db_name}.{schema_name}.{table_name};" + f"{df.shape[0]} rows; took {time_taken:.3f} seconds" + ) + return df.to_dict(orient="list") + + def close(self) -> None: + pass diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index 9526bdec4b05d..292c57494632c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -5,7 +5,6 @@ from functools import lru_cache from typing import Dict, List, Optional -import pandas as pd from snowflake.connector import SnowflakeConnection from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain @@ -84,7 +83,6 @@ class SnowflakeTable(BaseTable): foreign_keys: List[SnowflakeFK] = field(default_factory=list) tags: Optional[List[SnowflakeTag]] = None column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict) - sample_data: Optional[pd.DataFrame] = None @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 591bdffed5819..318cec8482996 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -7,7 +7,6 @@ from functools import partial from typing import Callable, Dict, Iterable, List, Optional, Union -import pandas as pd from snowflake.connector import SnowflakeConnection from 
datahub.configuration.pattern_utils import is_schema_allowed @@ -37,7 +36,10 @@ TestConnectionReport, ) from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.glossary.classification_mixin import ClassificationHandler +from datahub.ingestion.glossary.classification_mixin import ( + ClassificationHandler, + classification_workunit_processor, +) from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -52,6 +54,7 @@ SnowflakeV2Config, TagOption, ) +from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -134,7 +137,6 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator -from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.registries.domain_registry import DomainRegistry logger: logging.Logger = logging.getLogger(__name__) @@ -212,6 +214,11 @@ "Optionally enabled via `extract_tags`", supported=True, ) +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class SnowflakeV2Source( SnowflakeQueryMixin, SnowflakeConnectionMixin, @@ -305,10 +312,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): config, self.report, self.profiling_state_handler ) - if self.config.classification.enabled: - self.classification_handler = ClassificationHandler( - self.config, self.report - ) + self.classification_handler = ClassificationHandler(self.config, self.report) # Caches tables for a single database. Consider moving to disk or S3 when possible. self.db_tables: Dict[str, List[SnowflakeTable]] = {} @@ -422,6 +426,9 @@ def query(query): _report[SourceCapability.DATA_PROFILING] = CapabilityReport( capable=True ) + _report[SourceCapability.CLASSIFICATION] = CapabilityReport( + capable=True + ) if privilege.object_name.startswith("SNOWFLAKE.ACCOUNT_USAGE."): # if access to "snowflake" shared database, access to all account_usage views is automatically granted @@ -459,6 +466,7 @@ def query(query): SourceCapability.SCHEMA_METADATA: "Either no tables exist or current role does not have permissions to access them", SourceCapability.DESCRIPTIONS: "Either no tables exist or current role does not have permissions to access them", SourceCapability.DATA_PROFILING: "Either no tables exist or current role does not have permissions to access them", + SourceCapability.CLASSIFICATION: "Either no tables exist or current role does not have permissions to access them", SourceCapability.CONTAINERS: "Current role does not have permissions to use any database", SourceCapability.LINEAGE_COARSE: "Current role does not have permissions to snowflake account usage views", SourceCapability.LINEAGE_FINE: "Current role does not have permissions to snowflake account usage views", @@ -472,6 +480,7 @@ def query(query): SourceCapability.SCHEMA_METADATA, SourceCapability.DESCRIPTIONS, SourceCapability.DATA_PROFILING, + SourceCapability.CLASSIFICATION, SourceCapability.LINEAGE_COARSE, SourceCapability.LINEAGE_FINE, SourceCapability.USAGE_STATS, @@ -775,8 +784,17 @@ def _process_schema( self.db_tables[schema_name] = tables if self.config.include_technical_schema: + data_reader = self.make_data_reader() for table in tables: - yield from self._process_table(table, schema_name, db_name) + table_wu_generator = self._process_table( + table, 
schema_name, db_name + ) + yield from classification_workunit_processor( + table_wu_generator, + self.classification_handler, + data_reader, + [db_name, schema_name, table.name], + ) if self.config.include_views: views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name) @@ -876,6 +894,14 @@ def fetch_tables_for_schema( ) return [] + def make_data_reader(self) -> Optional[SnowflakeDataReader]: + if self.classification_handler.is_classification_enabled() and self.connection: + return SnowflakeDataReader.create( + self.connection, self.snowflake_identifier + ) + + return None + def _process_table( self, table: SnowflakeTable, @@ -890,12 +916,6 @@ def _process_table( self.fetch_foreign_keys_for_table(table, schema_name, db_name, table_identifier) - dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - - self.fetch_sample_data_for_classification( - table, schema_name, db_name, dataset_name - ) - if self.config.extract_tags != TagOption.skip: table.tags = self.tag_extractor.get_tags_on_object( table_name=table.name, @@ -914,36 +934,6 @@ def _process_table( yield from self.gen_dataset_workunits(table, schema_name, db_name) - def fetch_sample_data_for_classification( - self, table: SnowflakeTable, schema_name: str, db_name: str, dataset_name: str - ) -> None: - if ( - table.columns - and self.config.classification.enabled - and self.classification_handler.is_classification_enabled_for_table( - dataset_name - ) - ): - try: - table.sample_data = self.get_sample_values_for_table( - table.name, schema_name, db_name - ) - except Exception as e: - logger.debug( - f"Failed to get sample values for dataset {dataset_name} due to error {e}", - exc_info=e, - ) - if isinstance(e, SnowflakePermissionError): - self.report_warning( - "Failed to get sample values for dataset. Please grant SELECT permissions on dataset.", - dataset_name, - ) - else: - self.report_warning( - "Failed to get sample values for dataset", - dataset_name, - ) - def fetch_foreign_keys_for_table( self, table: SnowflakeTable, @@ -1073,9 +1063,7 @@ def gen_dataset_workunits( ).as_workunit() schema_metadata = self.gen_schema_metadata(table, schema_name, db_name) - # TODO: classification is only run for snowflake tables. - # Should we run classification for snowflake views as well? 
- self.classify_snowflake_table(table, dataset_name, schema_metadata) + yield MetadataChangeProposalWrapper( entityUrn=dataset_urn, aspect=schema_metadata ).as_workunit() @@ -1296,47 +1284,6 @@ def build_foreign_keys( ) return foreign_keys - def classify_snowflake_table( - self, - table: Union[SnowflakeTable, SnowflakeView], - dataset_name: str, - schema_metadata: SchemaMetadata, - ) -> None: - if ( - isinstance(table, SnowflakeTable) - and self.config.classification.enabled - and self.classification_handler.is_classification_enabled_for_table( - dataset_name - ) - ): - if table.sample_data is not None: - table.sample_data.columns = [ - self.snowflake_identifier(col) for col in table.sample_data.columns - ] - - try: - self.classification_handler.classify_schema_fields( - dataset_name, - schema_metadata, - ( - table.sample_data.to_dict(orient="list") - if table.sample_data is not None - else {} - ), - ) - except Exception as e: - logger.debug( - f"Failed to classify table columns for {dataset_name} due to error -> {e}", - exc_info=e, - ) - self.report_warning( - "Failed to classify table columns", - dataset_name, - ) - finally: - # Cleaning up sample_data fetched for classification - table.sample_data = None - def get_report(self) -> SourceReport: return self.report @@ -1551,37 +1498,6 @@ def inspect_session_metadata(self) -> None: except Exception: self.report.edition = None - # Ideally we do not want null values in sample data for a column. - # However that would require separate query per column and - # that would be expensive, hence not done. To compensale for possibility - # of some null values in collected sample, we fetch extra (20% more) - # rows than configured sample_size. - def get_sample_values_for_table( - self, table_name: str, schema_name: str, db_name: str - ) -> pd.DataFrame: - # Create a cursor object. - logger.debug( - f"Collecting sample values for table {db_name}.{schema_name}.{table_name}" - ) - - actual_sample_size = self.config.classification.sample_size * 1.2 - with PerfTimer() as timer: - cur = self.get_connection().cursor() - # Execute a statement that will generate a result set. - sql = f'select * from "{db_name}"."{schema_name}"."{table_name}" sample ({actual_sample_size} rows);' - - cur.execute(sql) - # Fetch the result set from the cursor and deliver it as the Pandas DataFrame. 
- - dat = cur.fetchall() - df = pd.DataFrame(dat, columns=[col.name for col in cur.description]) - time_taken = timer.elapsed_seconds() - logger.debug( - f"Finished collecting sample values for table {db_name}.{schema_name}.{table_name};{df.shape[0]} rows; took {time_taken:.3f} seconds" - ) - - return df - # domain is either "view" or "table" def get_external_url_for_table( self, table_name: str, schema_name: str, db_name: str, domain: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index c3759875b2769..eed5b1cb6c9eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -291,6 +291,11 @@ def get_sql_alchemy_url(self): ) @capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables") @capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class AthenaSource(SQLAlchemySource): """ This plugin supports extracting the following metadata from Athena diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 84c1d3844a7b4..7d32b5a20df11 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -380,6 +380,11 @@ def get_columns(self, connection, table_name, schema=None, **kw): @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class ClickHouseSource(TwoTierSQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/data_reader.py b/metadata-ingestion/src/datahub/ingestion/source/sql/data_reader.py index 73730a9ea0ef7..75d3236a0a5ad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/data_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/data_reader.py @@ -1,29 +1,62 @@ import logging from abc import abstractmethod from collections import defaultdict -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import sqlalchemy as sa from sqlalchemy.engine import Connection, Engine from sqlalchemy.engine.reflection import Inspector -from sqlalchemy.engine.row import LegacyRow from datahub.ingestion.api.closeable import Closeable +from datahub.utilities.perf_timer import PerfTimer logger: logging.Logger = logging.getLogger(__name__) class DataReader(Closeable): - @abstractmethod def get_sample_data_for_column( - self, table_id: List[str], column_name: str, sample_size: int = 100 + self, table_id: List[str], column_name: str, sample_size: int ) -> list: - pass + raise NotImplementedError() @abstractmethod def get_sample_data_for_table( - self, table_id: List[str], sample_size: int = 100 + self, + table_id: List[str], + sample_size: int, + *, + sample_size_percent: Optional[float] = None, + filter: Optional[str] = None, ) -> Dict[str, list]: + """ + Fetches table values , approx sample_size rows + + Args: + table_id (List[str]): Table name identifier. 
One of + - [db_name, schema_name, table_name] or + - [schema_name, table_name] or + - [table_name] + sample_size (int): sample size + + Keyword Args: + sample_size_percent (float, between 0 and 1): For bigquery-like data platforms that provide only + percentage based sampling methods. If present, actual sample_size + may be ignored. + + filter (string): For bigquery-like data platforms that need a mandatory filter on the partition + column in some cases + + + Returns: + Dict[str, list]: dictionary of (column name -> list of column values) + """ + + # Ideally we do not want null values in sample data for a column. + # However that would require separate query per column and + # that would be expensive, hence not done. To compensate for possibility + # of some null values in collected sample, it's usually recommended to + # fetch extra (20% more) rows than configured sample_size. + pass @@ -36,8 +69,7 @@ def __init__( self, conn: Union[Engine, Connection], ) -> None: - # TODO: How can this use a connection pool instead ? - self.engine = conn.engine.connect() + self.connection = conn.engine.connect() def _table(self, table_id: List[str]) -> sa.Table: return sa.Table( @@ -46,91 +78,42 @@ def _table(self, table_id: List[str]) -> sa.Table: schema=table_id[-2] if len(table_id) > 1 else None, ) - def get_sample_data_for_column( - self, table_id: List[str], column_name: str, sample_size: int = 100 - ) -> list: - """ - Fetches non-null column values, upto count - Args: - table_id: Table name identifier. One of - - [db_name, schema_name, table_name] or - - [schema_name, table_name] or - - [table_name] - column: Column name - Returns: - list of column values - """ - - table = self._table(table_id) - query: Any - ignore_null_condition = sa.column(column_name).is_(None) - # limit doesn't compile properly for oracle so we will append rownum to query string later - if self.engine.dialect.name.lower() == "oracle": - raw_query = ( - sa.select([sa.column(column_name)]) - .select_from(table) - .where(sa.not_(ignore_null_condition)) - ) - - query = str( - raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}) - ) - query += "\nAND ROWNUM <= %d" % sample_size - else: - query = ( - sa.select([sa.column(column_name)]) - .select_from(table) - .where(sa.not_(ignore_null_condition)) - .limit(sample_size) - ) - query_results = self.engine.execute(query) - - return [x[column_name] for x in query_results.fetchall()] - def get_sample_data_for_table( - self, table_id: List[str], sample_size: int = 100 + self, table_id: List[str], sample_size: int, **kwargs: Any ) -> Dict[str, list]: - """ - Fetches table values, upto *1.2 count - Args: - table_id: Table name identifier. One of - - [db_name, schema_name, table_name] or - - [schema_name, table_name] or - - [table_name] - Returns: - dictionary of (column name -> list of column values) - """ - column_values: Dict[str, list] = defaultdict(list) - table = self._table(table_id) - # Ideally we do not want null values in sample data for a column. - # However that would require separate query per column and - # that would be expensiv. To compensate for possibility - # of some null values in collected sample, we fetch extra (20% more) - # rows than configured sample_size.
- sample_size = int(sample_size * 1.2) + logger.debug(f"Collecting sample values for table {'.'.join(table_id)}") - query: Any + with PerfTimer() as timer: + column_values: Dict[str, list] = defaultdict(list) + table = self._table(table_id) - # limit doesn't compile properly for oracle so we will append rownum to query string later - if self.engine.dialect.name.lower() == "oracle": - raw_query = sa.select([sa.text("*")]).select_from(table) + query: Any - query = str( - raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}) - ) - query += "\nAND ROWNUM <= %d" % sample_size - else: - query = sa.select([sa.text("*")]).select_from(table).limit(sample_size) - query_results = self.engine.execute(query) - - # Not ideal - creates a parallel structure in column_values. Can we use pandas here ? - for row in query_results.fetchall(): - if isinstance(row, LegacyRow): - for col, col_value in row.items(): - column_values[col].append(col_value) + # limit doesn't compile properly for oracle so we will append rownum to query string later + if self.connection.dialect.name.lower() == "oracle": + raw_query = sa.select([sa.text("*")]).select_from(table) + query = str( + raw_query.compile( + self.connection, compile_kwargs={"literal_binds": True} + ) + ) + query += "\nAND ROWNUM <= %d" % sample_size + else: + query = sa.select([sa.text("*")]).select_from(table).limit(sample_size) + query_results = self.connection.execute(query) + + # Not ideal - creates a parallel structure in column_values. Can we use pandas here ? + for row in query_results.fetchall(): + for col, col_value in row._mapping.items(): + column_values[col].append(col_value) + time_taken = timer.elapsed_seconds() + logger.debug( + f"Finished collecting sample values for table {'.'.join(table_id)};" + f"took {time_taken:.3f} seconds" + ) return column_values def close(self) -> None: - self.engine.close() + self.connection.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py index 3f20e0a0f18b6..fdec869baa583 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py @@ -61,6 +61,11 @@ def get_identifier(self, schema: str, table: str) -> str: @config_class(DruidConfig) @support_status(SupportStatus.INCUBATING) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class DruidSource(SQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hana.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hana.py index 5c9c8f063a1a9..40875809120de 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hana.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hana.py @@ -28,6 +28,11 @@ class HanaConfig(BasicSQLAlchemyConfig): @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class HanaSource(SQLAlchemySource): def __init__(self, config: HanaConfig, ctx: PipelineContext): super().__init__(config, ctx, "hana") diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 003732236ba80..2975bfe820d1b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -134,6 +134,11 @@ def clean_host_port(cls, v): @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class HiveSource(TwoTierSQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py index 9b482beba924f..f3e2cccb9e8d0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py @@ -66,6 +66,11 @@ def get_identifier(self, *, schema: str, table: str) -> str: @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class MySQLSource(TwoTierSQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index bcf0f26008ae3..cf7bdc982ee80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -560,6 +560,11 @@ def __getattr__(self, item: str) -> Any: @config_class(OracleConfig) @support_status(SupportStatus.INCUBATING) @capability(SourceCapability.DOMAINS, "Enabled by default") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class OracleSource(SQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py index 5d1e37fbb68a3..20976c91f7878 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py @@ -132,6 +132,11 @@ class PostgresConfig(BasePostgresConfig): @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class PostgresSource(SQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py index 9657fdab9e2e3..98e2f2ecfbd5a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py @@ -160,6 +160,11 @@ def get_sql_alchemy_url( 
@support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class PrestoOnHiveSource(SQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 9ec30d57b8f76..91736b24727c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -820,13 +820,15 @@ def _classify( dataset_name ) and data_reader + and schema_metadata.fields ): self.classification_handler.classify_schema_fields( dataset_name, schema_metadata, - data_reader.get_sample_data_for_table( - table_id=[schema, table], - sample_size=self.config.classification.sample_size, + partial( + data_reader.get_sample_data_for_table, + [schema, table], + int(self.config.classification.sample_size * 1.2), ), ) except Exception as e: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py index 345f5bd57b44c..78b0dcf9b7be8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py @@ -64,7 +64,7 @@ def get_sql_alchemy_url(self): @platform_name("SQLAlchemy", id="sqlalchemy") @config_class(SQLAlchemyGenericConfig) -@support_status(SupportStatus.CERTIFIED) +@support_status(SupportStatus.INCUBATING) @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") class SQLAlchemyGenericSource(SQLAlchemySource): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index 53b1ddfcde595..3d0bacba74a69 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -447,6 +447,11 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") @capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration") @capability(SourceCapability.USAGE_STATS, "Optionally enabled via configuration") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class TeradataSource(TwoTierSQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py index 7668cb01f84bc..1828c5101d4f3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py @@ -226,6 +226,11 @@ def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str: @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via 
`classification.enabled`", + supported=True, +) class TrinoSource(SQLAlchemySource): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index 32f1ba5b8d563..9800660a9ad54 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -120,6 +120,11 @@ def clean_host_port(cls, v): "Optionally enabled via `stateful_ingestion.remove_stale_metadata`", supported=True, ) +@capability( + SourceCapability.CLASSIFICATION, + "Optionally enabled via `classification.enabled`", + supported=True, +) class VerticaSource(SQLAlchemySource): def __init__(self, config: VerticaConfig, ctx: PipelineContext): # self.platform = platform diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index ec52e839212c5..5e2e510533af1 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -36,6 +36,9 @@ class SchemaResolverInterface(Protocol): def platform(self) -> str: ... + def includes_temp_tables(self) -> bool: + ... + def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: ... @@ -74,6 +77,9 @@ def __init__( def platform(self) -> str: return self._platform + def includes_temp_tables(self) -> bool: + return False + def get_urns(self) -> Set[str]: return set(k for k, v in self._schema_cache.items() if v is not None) @@ -246,6 +252,9 @@ def __init__( def platform(self) -> str: return self._base_resolver.platform + def includes_temp_tables(self) -> bool: + return True + def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: urn = self._base_resolver.get_urn_for_table( table, lower=self._base_resolver._prefers_urn_lower() diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 8edb131c23297..495f4abfce7d4 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -1,8 +1,10 @@ +import contextlib import dataclasses import enum import itertools import json import logging +import os import pathlib import tempfile import uuid @@ -15,6 +17,7 @@ from datahub.emitter.mce_builder import get_sys_time, make_ts_millis from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.sql_parsing_builder import compute_upstream_fields +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.report import Report from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.graph.client import DataHubGraph @@ -53,9 +56,6 @@ QueryId = str UrnStr = str -_DEFAULT_USER_URN = CorpUserUrn("_ingestion") -_MISSING_SESSION_ID = "__MISSING_SESSION_ID" - class QueryLogSetting(enum.Enum): DISABLED = "DISABLED" @@ -63,6 +63,23 @@ class QueryLogSetting(enum.Enum): STORE_FAILED = "STORE_FAILED" +_DEFAULT_USER_URN = CorpUserUrn("_ingestion") +_MISSING_SESSION_ID = "__MISSING_SESSION_ID" +_DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[ + os.getenv("DATAHUB_SQL_AGG_QUERY_LOG") or QueryLogSetting.DISABLED.name +] + + +@dataclasses.dataclass +class LoggedQuery: + query: str + session_id: Optional[str] + timestamp: Optional[datetime] + user: Optional[UrnStr] + default_db: Optional[str] + 
default_schema: Optional[str] + + @dataclasses.dataclass class ViewDefinition: view_definition: str @@ -87,6 +104,8 @@ class QueryMetadata: column_lineage: List[ColumnLineageInfo] confidence_score: float + used_temp_tables: bool = True + def make_created_audit_stamp(self) -> models.AuditStampClass: return models.AuditStampClass( time=make_ts_millis(self.latest_timestamp) or 0, @@ -149,6 +168,9 @@ class SqlAggregatorReport(Report): queries_with_temp_upstreams: LossyDict[QueryId, LossyList] = dataclasses.field( default_factory=LossyDict ) + queries_with_non_authoritative_session: LossyList[QueryId] = dataclasses.field( + default_factory=LossyList + ) # Lineage-related. schema_resolver_count: Optional[int] = None @@ -170,7 +192,7 @@ def compute_stats(self) -> None: return super().compute_stats() -class SqlParsingAggregator: +class SqlParsingAggregator(Closeable): def __init__( self, *, @@ -185,7 +207,7 @@ def __init__( usage_config: Optional[BaseUsageConfig] = None, is_temp_table: Optional[Callable[[UrnStr], bool]] = None, format_queries: bool = True, - query_log: QueryLogSetting = QueryLogSetting.DISABLED, + query_log: QueryLogSetting = _DEFAULT_QUERY_LOG_SETTING, ) -> None: self.platform = DataPlatformUrn(platform) self.platform_instance = platform_instance @@ -210,13 +232,18 @@ def __init__( self.format_queries = format_queries self.query_log = query_log + # The exit stack helps ensure that we close all the resources we open. + self._exit_stack = contextlib.ExitStack() + # Set up the schema resolver. self._schema_resolver: SchemaResolver if graph is None: - self._schema_resolver = SchemaResolver( - platform=self.platform.platform_name, - platform_instance=self.platform_instance, - env=self.env, + self._schema_resolver = self._exit_stack.enter_context( + SchemaResolver( + platform=self.platform.platform_name, + platform_instance=self.platform_instance, + env=self.env, + ) ) else: self._schema_resolver = None # type: ignore @@ -235,27 +262,33 @@ def __init__( # By providing a filename explicitly here, we also ensure that the file # is not automatically deleted on exit. - self._shared_connection = ConnectionWrapper(filename=query_log_path) + self._shared_connection = self._exit_stack.enter_context( + ConnectionWrapper(filename=query_log_path) + ) # Stores the logged queries. - self._logged_queries = FileBackedList[str]( + self._logged_queries = FileBackedList[LoggedQuery]( shared_connection=self._shared_connection, tablename="stored_queries" ) + self._exit_stack.push(self._logged_queries) # Map of query_id -> QueryMetadata self._query_map = FileBackedDict[QueryMetadata]( shared_connection=self._shared_connection, tablename="query_map" ) + self._exit_stack.push(self._query_map) # Map of downstream urn -> { query ids } self._lineage_map = FileBackedDict[OrderedSet[QueryId]]( shared_connection=self._shared_connection, tablename="lineage_map" ) + self._exit_stack.push(self._lineage_map) # Map of view urn -> view definition self._view_definitions = FileBackedDict[ViewDefinition]( shared_connection=self._shared_connection, tablename="view_definitions" ) + self._exit_stack.push(self._view_definitions) # Map of session ID -> {temp table name -> query id} # Needs to use the query_map to find the info about the query. 
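
Aside (illustrative only, not part of the change set above): the hunks in sql_parsing_aggregator.py introduce an env-driven default for the query log and make the aggregator a Closeable that drains an ExitStack of its SQLite-backed collections. The sketch below shows how calling code might rely on those two behaviours; everything beyond the names visible in the diff (QueryLogSetting, the DATAHUB_SQL_AGG_QUERY_LOG variable, SqlParsingAggregator and its `platform` argument) is an assumption made for brevity.

    # Illustrative sketch under the assumptions stated above.
    import contextlib
    import os

    # The default query-log setting is resolved at module import time, so the
    # variable must be exported before the import; when unset, it falls back
    # to DISABLED (per the diff).
    os.environ["DATAHUB_SQL_AGG_QUERY_LOG"] = "STORE_FAILED"

    from datahub.sql_parsing.sql_parsing_aggregator import (
        QueryLogSetting,
        SqlParsingAggregator,
    )

    # Enum lookup by name mirrors what the new default does internally.
    assert (
        QueryLogSetting[os.environ["DATAHUB_SQL_AGG_QUERY_LOG"]]
        is QueryLogSetting.STORE_FAILED
    )

    # close() now drains the internal ExitStack, releasing the shared SQLite
    # connection and the FileBackedDict/FileBackedList state, so wrapping the
    # aggregator in contextlib.closing is enough to guarantee cleanup.
    # Remaining constructor arguments are left at their defaults here (an
    # assumption for brevity).
    with contextlib.closing(SqlParsingAggregator(platform="snowflake")) as aggregator:
        pass  # feed queries via add_observed_query(...) and emit the results

A side effect of the same hunks is that stored log entries become structured LoggedQuery records (query text plus session ID, timestamp, user, and default db/schema) rather than bare query strings, which makes a persisted query log replayable and easier to debug.
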
@@ -263,16 +296,20 @@ def __init__( self._temp_lineage_map = FileBackedDict[Dict[UrnStr, QueryId]]( shared_connection=self._shared_connection, tablename="temp_lineage_map" ) + self._exit_stack.push(self._temp_lineage_map) # Map of query ID -> schema fields, only for query IDs that generate temp tables. self._inferred_temp_schemas = FileBackedDict[List[models.SchemaFieldClass]]( - shared_connection=self._shared_connection, tablename="inferred_temp_schemas" + shared_connection=self._shared_connection, + tablename="inferred_temp_schemas", ) + self._exit_stack.push(self._inferred_temp_schemas) # Map of table renames, from original UrnStr to new UrnStr. self._table_renames = FileBackedDict[UrnStr]( shared_connection=self._shared_connection, tablename="table_renames" ) + self._exit_stack.push(self._table_renames) # Usage aggregator. This will only be initialized if usage statistics are enabled. # TODO: Replace with FileBackedDict. @@ -281,6 +318,9 @@ def __init__( assert self.usage_config is not None self._usage_aggregator = UsageAggregator(config=self.usage_config) + def close(self) -> None: + self._exit_stack.close() + @property def _need_schemas(self) -> bool: return self.generate_lineage or self.generate_usage_statistics @@ -492,6 +532,7 @@ def add_observed_query( schema_resolver: SchemaResolverInterface = ( self._make_schema_resolver_for_session(session_id) ) + session_has_temp_tables = schema_resolver.includes_temp_tables() # Run the SQL parser. parsed = self._run_sql_parser( @@ -499,6 +540,9 @@ def add_observed_query( default_db=default_db, default_schema=default_schema, schema_resolver=schema_resolver, + session_id=session_id, + timestamp=query_timestamp, + user=user, ) if parsed.debug_info.error: self.report.observed_query_parse_failures.append( @@ -565,6 +609,7 @@ def add_observed_query( upstreams=parsed.in_tables, column_lineage=parsed.column_lineage or [], confidence_score=parsed.debug_info.confidence, + used_temp_tables=session_has_temp_tables, ) ) @@ -700,6 +745,9 @@ def _run_sql_parser( default_db: Optional[str], default_schema: Optional[str], schema_resolver: SchemaResolverInterface, + session_id: str = _MISSING_SESSION_ID, + timestamp: Optional[datetime] = None, + user: Optional[CorpUserUrn] = None, ) -> SqlParsingResult: parsed = sqlglot_lineage( query, @@ -712,7 +760,15 @@ def _run_sql_parser( if self.query_log == QueryLogSetting.STORE_ALL or ( self.query_log == QueryLogSetting.STORE_FAILED and parsed.debug_info.error ): - self._logged_queries.append(query) + query_log_entry = LoggedQuery( + query=query, + session_id=session_id if session_id != _MISSING_SESSION_ID else None, + timestamp=timestamp, + user=user.urn() if user else None, + default_db=default_db, + default_schema=default_schema, + ) + self._logged_queries.append(query_log_entry) # Also add some extra logging. if parsed.debug_info.error: @@ -734,10 +790,21 @@ def _add_to_query_map( # This assumes that queries come in order of increasing timestamps, # so the current query is more authoritative than the previous one. current.formatted_query_string = new.formatted_query_string - current.session_id = new.session_id current.latest_timestamp = new.latest_timestamp or current.latest_timestamp current.actor = new.actor or current.actor + if current.used_temp_tables and not new.used_temp_tables: + # If we see the same query again, but in a different session, + # it's possible that we didn't capture the temp tables in the newer session, + # but did in the older one. 
If that happens, we treat the older session's + # lineage as more authoritative. This isn't technically correct, but it's + # better than using the newer session's lineage, which is likely incorrect. + self.report.queries_with_non_authoritative_session.append( + query_fingerprint + ) + return + current.session_id = new.session_id + if not merge_lineage: # An invariant of the fingerprinting is that if two queries have the # same fingerprint, they must also have the same lineage. We overwrite @@ -1066,9 +1133,12 @@ def _recurse_into_query( # - Update the query text to combine the queries composite_query_id = self._composite_query_id(composed_of_queries) - self.report.queries_with_temp_upstreams.setdefault( - composite_query_id, LossyList() - ).extend(composed_of_queries) + composed_of_queries_truncated: LossyList[str] = LossyList() + for query_id in composed_of_queries: + composed_of_queries_truncated.append(query_id) + self.report.queries_with_temp_upstreams[ + composite_query_id + ] = composed_of_queries_truncated merged_query_text = ";\n\n".join( [ diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py index 54f6a6e984c00..91f5d6f914676 100644 --- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py +++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py @@ -62,9 +62,13 @@ def assert_metadata_files_equal( # We have to "normalize" the golden file by reading and writing it back out. # This will clean up nulls, double serialization, and other formatting issues. with tempfile.NamedTemporaryFile() as temp: - golden_metadata = read_metadata_file(pathlib.Path(golden_path)) - write_metadata_file(pathlib.Path(temp.name), golden_metadata) - golden = load_json_file(temp.name) + try: + golden_metadata = read_metadata_file(pathlib.Path(golden_path)) + write_metadata_file(pathlib.Path(temp.name), golden_metadata) + golden = load_json_file(temp.name) + except (ValueError, AssertionError) as e: + logger.info(f"Error reformatting golden file as MCP/MCEs: {e}") + golden = load_json_file(golden_path) diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order) if diff and update_golden: @@ -107,7 +111,7 @@ def diff_metadata_json( # if ignore_order is False, always use DeepDiff except CannotCompareMCPs as e: logger.info(f"{e}, falling back to MCE diff") - except AssertionError as e: + except (AssertionError, ValueError) as e: logger.warning(f"Reverting to old diff method: {e}") logger.debug("Error with new diff method", exc_info=True) diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index 821b69c968ee4..d264a3970fdde 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -126,6 +126,7 @@ def executemany( def close(self) -> None: for obj in self._dependent_objects: obj.close() + self._dependent_objects.clear() with self.conn_lock: self.conn.close() if self._temp_directory: @@ -440,7 +441,7 @@ def __del__(self) -> None: self.close() -class FileBackedList(Generic[_VT]): +class FileBackedList(Generic[_VT], Closeable): """An append-only, list-like object that stores its contents in a SQLite database.""" _len: int = field(default=0) @@ -456,7 +457,6 @@ def __init__( cache_max_size: Optional[int] = None, cache_eviction_batch_size: Optional[int] = None, ) -> None: - self._len = 0 
self._dict = FileBackedDict[_VT]( shared_connection=shared_connection, tablename=tablename, @@ -468,6 +468,12 @@ def __init__( or _DEFAULT_MEMORY_CACHE_EVICTION_BATCH_SIZE, ) + if shared_connection: + shared_connection._dependent_objects.append(self) + + # In case we're reusing an existing list, we need to run a query to get the length. + self._len = len(self._dict) + @property def tablename(self) -> str: return self._dict.tablename diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index da9589d2195ac..f8763d48d35ef 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -236,7 +236,62 @@ "tableSchema": "" } }, - "fields": [] + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Age" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Email_Address" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false + } + ] } }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 602401134dcd3..e79bbbe995aae 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -1,11 +1,20 @@ +import random +import string from typing import Any, Dict from unittest.mock import patch from freezegun import freeze_time from google.cloud.bigquery.table import TableListItem +from datahub.ingestion.glossary.classifier import ( + ClassificationConfig, + DynamicTypedClassifierConfig, +) +from datahub.ingestion.glossary.datahub_classifier import DataHubClassifierConfig from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source +from datahub.ingestion.source.bigquery_v2.bigquery_data_reader import BigQueryDataReader from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( + BigqueryColumn, BigqueryDataset, BigQuerySchemaApi, BigqueryTable, @@ -16,13 +25,29 @@ FROZEN_TIME = "2022-02-03 07:00:00" +def random_email(): + return ( + "".join( + [ + random.choice(string.ascii_lowercase) + for i in range(random.randint(10, 15)) + ] + ) + + "@xyz.com" + ) + + @freeze_time(FROZEN_TIME) @patch.object(BigQuerySchemaApi, "get_tables_for_dataset") @patch.object(BigqueryV2Source, "get_core_table_details") @patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") +@patch.object(BigQuerySchemaApi, "get_columns_for_dataset") +@patch.object(BigQueryDataReader, "get_sample_data_for_table") @patch("google.cloud.bigquery.Client") def test_bigquery_v2_ingest( client, + get_sample_data_for_table, + get_columns_for_dataset, get_datasets_for_project_id, get_core_table_details, 
get_tables_for_dataset, @@ -42,6 +67,34 @@ def test_bigquery_v2_ingest( ) table_name = "table-1" get_core_table_details.return_value = {table_name: table_list_item} + get_columns_for_dataset.return_value = { + table_name: [ + BigqueryColumn( + name="age", + ordinal_position=1, + is_nullable=False, + field_path="col_1", + data_type="INT", + comment="comment", + is_partition_column=False, + cluster_column_position=None, + ), + BigqueryColumn( + name="email", + ordinal_position=1, + is_nullable=False, + field_path="col_2", + data_type="STRING", + comment="comment", + is_partition_column=False, + cluster_column_position=None, + ), + ] + } + get_sample_data_for_table.return_value = { + "age": [random.randint(1, 80) for i in range(20)], + "email": [random_email() for i in range(20)], + } bigquery_table = BigqueryTable( name=table_name, @@ -58,6 +111,18 @@ def test_bigquery_v2_ingest( "include_usage_statistics": False, "include_table_lineage": False, "include_data_platform_instance": True, + "classification": ClassificationConfig( + enabled=True, + classifiers=[ + DynamicTypedClassifierConfig( + type="datahub", + config=DataHubClassifierConfig( + minimum_values_threshold=1, + ), + ) + ], + max_workers=1, + ).dict(), } pipeline_config_dict: Dict[str, Any] = { diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json index d59fce788c95e..3c5b0027ea8ad 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json @@ -12,7 +12,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -34,7 +35,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -63,19 +65,19 @@ }, "fields": [ { - "fieldPath": "2", + "fieldPath": "Sampling Date", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.DateType": {} } }, - "nativeDataType": "string", + "nativeDataType": "date", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "3", + "fieldPath": "Site ID", "nullable": false, "type": { "type": { @@ -87,7 +89,7 @@ "isPartOfKey": false }, { - "fieldPath": "Br \n(mg/L)", + "fieldPath": "Park ID", "nullable": false, "type": { "type": { @@ -99,7 +101,7 @@ "isPartOfKey": false }, { - "fieldPath": "Ca \n(mg/L)", + "fieldPath": "Lat (\u00b0N)", "nullable": false, "type": { "type": { @@ -111,7 +113,7 @@ "isPartOfKey": false }, { - "fieldPath": "Cl \n(mg/L)", + "fieldPath": "Long (\u00b0W)", "nullable": false, "type": { "type": { @@ -123,7 +125,7 @@ "isPartOfKey": false }, { - "fieldPath": "Cond (\u00b5S/cm)", + "fieldPath": "Water Temp (\u00b0C)", "nullable": false, "type": { "type": { @@ -135,31 +137,31 @@ "isPartOfKey": false }, { - "fieldPath": "DO (mg/L)", + "fieldPath": "Cond (\u00b5S/cm)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "DOC [mg/L C]", + "fieldPath": "pH", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + 
"com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "F \n(mg/L)", + "fieldPath": "DO (mg/L)", "nullable": false, "type": { "type": { @@ -171,19 +173,19 @@ "isPartOfKey": false }, { - "fieldPath": "K \n(mg/L)", + "fieldPath": "Secchi Depth (m)", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "Lat (\u00b0N)", + "fieldPath": "UV Absorbance, 254nm", "nullable": false, "type": { "type": { @@ -195,7 +197,7 @@ "isPartOfKey": false }, { - "fieldPath": "Long (\u00b0W)", + "fieldPath": "DOC [mg/L C]", "nullable": false, "type": { "type": { @@ -207,7 +209,7 @@ "isPartOfKey": false }, { - "fieldPath": "Mg \n(mg/L)", + "fieldPath": "SUVA, 254nm", "nullable": false, "type": { "type": { @@ -243,31 +245,31 @@ "isPartOfKey": false }, { - "fieldPath": "Na \n(mg/L)", + "fieldPath": "PO4-P \n(mg P/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "PO4-P \n(mg P/L)", + "fieldPath": "TDN \n(mg N/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "Park ID", + "fieldPath": "TDP \n(mg P/L)", "nullable": false, "type": { "type": { @@ -279,7 +281,7 @@ "isPartOfKey": false }, { - "fieldPath": "SO4-S \n(mg/L)", + "fieldPath": "Cl \n(mg/L)", "nullable": false, "type": { "type": { @@ -291,7 +293,7 @@ "isPartOfKey": false }, { - "fieldPath": "SUVA, 254nm", + "fieldPath": "SO4-S \n(mg/L)", "nullable": false, "type": { "type": { @@ -303,7 +305,7 @@ "isPartOfKey": false }, { - "fieldPath": "Sampling Date", + "fieldPath": "F \n(mg/L)", "nullable": false, "type": { "type": { @@ -315,7 +317,7 @@ "isPartOfKey": false }, { - "fieldPath": "Secchi Depth (m)", + "fieldPath": "Br \n(mg/L)", "nullable": false, "type": { "type": { @@ -327,19 +329,19 @@ "isPartOfKey": false }, { - "fieldPath": "Site ID", + "fieldPath": "Na \n(mg/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "TDN \n(mg N/L)", + "fieldPath": "K \n(mg/L)", "nullable": false, "type": { "type": { @@ -351,19 +353,19 @@ "isPartOfKey": false }, { - "fieldPath": "TDP \n(mg P/L)", + "fieldPath": "Ca \n(mg/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "UV Absorbance, 254nm", + "fieldPath": "Mg \n(mg/L)", "nullable": false, "type": { "type": { @@ -375,19 +377,19 @@ "isPartOfKey": false }, { - "fieldPath": "Water Temp (\u00b0C)", + "fieldPath": "d18O", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": 
"d18O", + "fieldPath": "dD", "nullable": false, "type": { "type": { @@ -399,7 +401,7 @@ "isPartOfKey": false }, { - "fieldPath": "dD", + "fieldPath": "field29", "nullable": false, "type": { "type": { @@ -411,7 +413,7 @@ "isPartOfKey": false }, { - "fieldPath": "field29", + "fieldPath": "2", "nullable": false, "type": { "type": { @@ -423,7 +425,7 @@ "isPartOfKey": false }, { - "fieldPath": "pH", + "fieldPath": "3", "nullable": false, "type": { "type": { @@ -439,7 +441,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -460,7 +463,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -481,7 +485,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -496,7 +501,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -512,7 +518,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -529,7 +536,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -549,7 +557,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -570,7 +579,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -585,7 +595,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -601,7 +612,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -618,7 +630,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -633,7 +646,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -657,7 +671,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +693,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -693,7 +709,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -709,7 +726,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -726,7 +744,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + 
"lastRunId": "no-run-id-provided" } }, { @@ -741,7 +760,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -769,7 +789,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -790,7 +811,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -805,7 +827,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -821,7 +844,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -838,7 +862,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -853,7 +878,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -885,7 +911,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -906,7 +933,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -921,7 +949,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -937,7 +966,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -954,7 +984,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -969,7 +1000,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1005,7 +1037,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1026,7 +1059,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1041,7 +1075,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1057,7 +1092,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1074,7 +1110,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1089,7 +1126,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ 
-1129,7 +1167,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1150,7 +1189,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1165,7 +1205,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1181,7 +1222,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1198,7 +1240,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1213,7 +1256,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1257,7 +1301,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1278,7 +1323,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1293,7 +1339,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1309,7 +1356,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1326,7 +1374,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1341,7 +1390,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1389,7 +1439,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1404,7 +1455,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2701,7 +2753,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2753,7 +2806,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2769,7 +2823,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2791,7 +2846,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2884,7 +2940,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2905,7 +2962,8 @@ 
}, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2920,7 +2978,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3270,7 +3329,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3322,7 +3382,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3338,7 +3399,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3360,7 +3422,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3453,7 +3516,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3474,7 +3538,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3489,7 +3554,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3839,7 +3905,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3891,7 +3958,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3913,7 +3981,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3929,7 +3998,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -3998,7 +4068,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4019,7 +4090,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4034,7 +4106,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4067,7 +4140,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4119,7 +4193,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4141,7 +4216,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4234,7 +4310,8 @@ }, "systemMetadata": 
{ "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4255,7 +4332,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4271,7 +4349,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4286,7 +4365,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4590,7 +4670,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4642,7 +4723,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4664,7 +4746,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4680,7 +4763,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4773,7 +4857,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4794,7 +4879,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4809,7 +4895,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -4972,7 +5059,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -5024,7 +5112,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -5046,7 +5135,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -5451,7 +5541,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -5472,7 +5563,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -5487,7 +5579,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7647,7 +7740,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7699,7 +7793,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7715,7 +7810,8 @@ }, "systemMetadata": { "lastObserved": 
1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7730,7 +7826,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7745,7 +7842,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7760,7 +7858,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7775,7 +7874,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7790,7 +7890,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7805,7 +7906,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -7820,7 +7922,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json index 58c225e1ec4c9..d7a9bca716fd6 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json @@ -12,7 +12,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -34,7 +35,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -63,19 +65,19 @@ }, "fields": [ { - "fieldPath": "2", + "fieldPath": "Sampling Date", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.DateType": {} } }, - "nativeDataType": "string", + "nativeDataType": "date", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "3", + "fieldPath": "Site ID", "nullable": false, "type": { "type": { @@ -87,7 +89,7 @@ "isPartOfKey": false }, { - "fieldPath": "Br \n(mg/L)", + "fieldPath": "Park ID", "nullable": false, "type": { "type": { @@ -99,7 +101,7 @@ "isPartOfKey": false }, { - "fieldPath": "Ca \n(mg/L)", + "fieldPath": "Lat (\u00b0N)", "nullable": false, "type": { "type": { @@ -111,7 +113,7 @@ "isPartOfKey": false }, { - "fieldPath": "Cl \n(mg/L)", + "fieldPath": "Long (\u00b0W)", "nullable": false, "type": { "type": { @@ -123,7 +125,7 @@ "isPartOfKey": false }, { - "fieldPath": "Cond (\u00b5S/cm)", + "fieldPath": "Water Temp (\u00b0C)", "nullable": false, "type": { "type": { @@ -135,31 +137,31 @@ "isPartOfKey": false }, { - "fieldPath": "DO (mg/L)", + "fieldPath": "Cond (\u00b5S/cm)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, 
"isPartOfKey": false }, { - "fieldPath": "DOC [mg/L C]", + "fieldPath": "pH", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "F \n(mg/L)", + "fieldPath": "DO (mg/L)", "nullable": false, "type": { "type": { @@ -171,19 +173,19 @@ "isPartOfKey": false }, { - "fieldPath": "K \n(mg/L)", + "fieldPath": "Secchi Depth (m)", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "Lat (\u00b0N)", + "fieldPath": "UV Absorbance, 254nm", "nullable": false, "type": { "type": { @@ -195,7 +197,7 @@ "isPartOfKey": false }, { - "fieldPath": "Long (\u00b0W)", + "fieldPath": "DOC [mg/L C]", "nullable": false, "type": { "type": { @@ -207,7 +209,7 @@ "isPartOfKey": false }, { - "fieldPath": "Mg \n(mg/L)", + "fieldPath": "SUVA, 254nm", "nullable": false, "type": { "type": { @@ -243,31 +245,31 @@ "isPartOfKey": false }, { - "fieldPath": "Na \n(mg/L)", + "fieldPath": "PO4-P \n(mg P/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "PO4-P \n(mg P/L)", + "fieldPath": "TDN \n(mg N/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "Park ID", + "fieldPath": "TDP \n(mg P/L)", "nullable": false, "type": { "type": { @@ -279,7 +281,7 @@ "isPartOfKey": false }, { - "fieldPath": "SO4-S \n(mg/L)", + "fieldPath": "Cl \n(mg/L)", "nullable": false, "type": { "type": { @@ -291,7 +293,7 @@ "isPartOfKey": false }, { - "fieldPath": "SUVA, 254nm", + "fieldPath": "SO4-S \n(mg/L)", "nullable": false, "type": { "type": { @@ -303,7 +305,7 @@ "isPartOfKey": false }, { - "fieldPath": "Sampling Date", + "fieldPath": "F \n(mg/L)", "nullable": false, "type": { "type": { @@ -315,7 +317,7 @@ "isPartOfKey": false }, { - "fieldPath": "Secchi Depth (m)", + "fieldPath": "Br \n(mg/L)", "nullable": false, "type": { "type": { @@ -327,19 +329,19 @@ "isPartOfKey": false }, { - "fieldPath": "Site ID", + "fieldPath": "Na \n(mg/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "TDN \n(mg N/L)", + "fieldPath": "K \n(mg/L)", "nullable": false, "type": { "type": { @@ -351,19 +353,19 @@ "isPartOfKey": false }, { - "fieldPath": "TDP \n(mg P/L)", + "fieldPath": "Ca \n(mg/L)", "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "number", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "UV Absorbance, 254nm", + "fieldPath": "Mg \n(mg/L)", "nullable": false, "type": { "type": { @@ -375,19 +377,19 @@ "isPartOfKey": false }, { - "fieldPath": "Water Temp (\u00b0C)", + "fieldPath": "d18O", "nullable": false, "type": { "type": { - "com.linkedin.schema.NumberType": {} + 
"com.linkedin.schema.StringType": {} } }, - "nativeDataType": "number", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "d18O", + "fieldPath": "dD", "nullable": false, "type": { "type": { @@ -399,7 +401,7 @@ "isPartOfKey": false }, { - "fieldPath": "dD", + "fieldPath": "field29", "nullable": false, "type": { "type": { @@ -411,7 +413,7 @@ "isPartOfKey": false }, { - "fieldPath": "field29", + "fieldPath": "2", "nullable": false, "type": { "type": { @@ -423,7 +425,7 @@ "isPartOfKey": false }, { - "fieldPath": "pH", + "fieldPath": "3", "nullable": false, "type": { "type": { @@ -439,7 +441,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -460,7 +463,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -481,7 +485,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -496,7 +501,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -512,7 +518,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -529,7 +536,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -549,7 +557,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -570,7 +579,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -585,7 +595,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -601,7 +612,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -618,7 +630,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -633,7 +646,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -657,7 +671,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +693,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -693,7 +709,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -709,7 +726,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": 
"no-run-id-provided" } }, { @@ -726,7 +744,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -741,7 +760,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -769,7 +789,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -790,7 +811,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -805,7 +827,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -821,7 +844,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -838,7 +862,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -853,7 +878,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -885,7 +911,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -900,7 +927,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -936,7 +964,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -958,7 +987,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -974,7 +1004,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1067,7 +1098,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1088,7 +1120,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1103,7 +1136,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1139,7 +1173,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1155,7 +1190,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1177,7 +1213,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1270,7 
+1307,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1291,7 +1329,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1306,7 +1345,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1342,7 +1382,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1364,7 +1405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1380,7 +1422,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1449,7 +1492,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1470,7 +1514,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1485,7 +1530,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1521,7 +1567,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1543,7 +1590,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1559,7 +1607,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1652,7 +1701,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1673,7 +1723,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1688,7 +1739,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1724,7 +1776,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1740,7 +1793,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1762,7 +1816,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1855,7 +1910,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1876,7 +1932,8 @@ }, 
"systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1891,7 +1948,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1927,7 +1985,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1949,7 +2008,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2354,7 +2414,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2375,7 +2436,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2391,7 +2453,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2406,7 +2469,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2442,7 +2506,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2457,7 +2522,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2472,7 +2538,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2487,7 +2554,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2502,7 +2570,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2517,7 +2586,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2532,7 +2602,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } }, { @@ -2547,7 +2618,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "multiple_files.json" + "runId": "multiple_files.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 88354ba74c417..81487d38eda7d 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -4,7 +4,6 @@ from typing import cast from unittest import mock -import pandas as pd import pytest from freezegun import freeze_time @@ -65,7 +64,7 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): golden_file = 
test_resources_dir / "snowflake_golden.json" with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch( - "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table" + "datahub.ingestion.source.snowflake.snowflake_data_reader.SnowflakeDataReader.get_sample_data_for_table" ) as mock_sample_values: sf_connection = mock.MagicMock() sf_cursor = mock.MagicMock() @@ -74,13 +73,11 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): sf_cursor.execute.side_effect = default_query_results - mock_sample_values.return_value = pd.DataFrame( - data={ - "col_1": [random.randint(1, 80) for i in range(20)], - "col_2": [random_email() for i in range(20)], - "col_3": [random_cloud_region() for i in range(20)], - } - ) + mock_sample_values.return_value = { + "col_1": [random.randint(1, 80) for i in range(20)], + "col_2": [random_email() for i in range(20)], + "col_3": [random_cloud_region() for i in range(20)], + } datahub_classifier_config = DataHubClassifierConfig( minimum_values_threshold=10, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py index 427b6e562ebd1..75a9df4f28051 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py @@ -91,7 +91,8 @@ def test_snowflake_classification_perf(num_workers, num_cols_per_table, num_tabl source_report = pipeline.source.get_report() assert isinstance(source_report, SnowflakeV2Report) assert ( - cast(SnowflakeV2Report, source_report).num_tables_classified == num_tables + cast(SnowflakeV2Report, source_report).num_tables_classification_found + == num_tables ) assert ( len( diff --git a/metadata-ingestion/tests/unit/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift_query_mocker.py index 631e6e7ceaf1f..ada76e624032b 100644 --- a/metadata-ingestion/tests/unit/redshift_query_mocker.py +++ b/metadata-ingestion/tests/unit/redshift_query_mocker.py @@ -63,7 +63,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: "\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[" "^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n " " query_text,\n row_number() over (\n partition " - "by TRIM(query_text)\n order by start_time desc\n ) rn\n " + "by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n " " from\n (\n select\n pid " "as session_id,\n xid as transaction_id,\n starttime " "as start_time,\n type,\n query_text,\n " @@ -94,7 +94,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: "TABLE volt_tt_'\n -- We need to filter out our query and it was not possible " "earlier when we did not have any comment in the query\n and query_text not ilike " "'%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl" - "-statementtext%'\n\n )\n where\n rn = 1;\n " + "-statementtext%'\n\n )\n where\n rn = 1\n " ): mock_temp_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage_query_log.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage_query_log.json new file mode 100644 index 0000000000000..e8e72bf25d303 --- /dev/null +++ 
b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage_query_log.json @@ -0,0 +1,10 @@ +[ + { + "query": "create table foo as select a, b from bar", + "session_id": null, + "timestamp": null, + "user": null, + "default_db": "dev", + "default_schema": "public" + } +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json new file mode 100644 index 0000000000000..c4d3bee43faa1 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json @@ -0,0 +1,122 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)", + "type": "TRANSFORMED", + "query": "urn:li:query:composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45" + }, + { + "auditStamp": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)", + "type": "TRANSFORMED", + "query": "urn:li:query:composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD),a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),a)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD),b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),b)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD),c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),c)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45" + } + ] + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE TABLE #temp2 AS\nSELECT\n b,\n c\nFROM upstream2;\n\nCREATE TABLE #temp1 AS\nSELECT\n a,\n 2 * b AS b\nFROM upstream1;\n\nCREATE TEMPORARY TABLE staging_foo AS\nSELECT\n up1.a,\n up1.b,\n up2.c\nFROM #temp1 AS up1\nLEFT JOIN #temp2 
AS up2\n ON up1.b = up2.b\nWHERE\n up1.b > 0;\n\nINSERT INTO prod_foo\nSELECT\n *\nFROM staging_foo", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + } + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json new file mode 100644 index 0000000000000..d2076aa1529d3 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json @@ -0,0 +1,231 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)", + "type": "TRANSFORMED", + "query": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17" + }, + { + "auditStamp": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_returns,PROD)", + "type": "TRANSFORMED", + "query": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17" + }, + { + "auditStamp": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_survey,PROD)", + "type": "TRANSFORMED", + "query": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17" + }, + { + "auditStamp": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD)", + "type": "TRANSFORMED", + "query": "urn:li:query:composite_638945c382e30206a8f8a57894d375e5f6f2a3562fe68480badf37e38e836d75" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_id)" + ], + "confidenceScore": 0.2, + "query": 
"urn:li:query:composite_638945c382e30206a8f8a57894d375e5f6f2a3562fe68480badf37e38e836d75" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_returns,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_id)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD),customer_email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_email)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD),return_date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_date)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_638945c382e30206a8f8a57894d375e5f6f2a3562fe68480badf37e38e836d75" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_returns,PROD),return_date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_date)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_survey,PROD),return_reason)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_reason)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17" + } + ] + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE TABLE #stage_online_returns AS\nSELECT\n online_ret.customer_id,\n customer.customer_email,\n online_ret.return_date,\n online_survey.return_reason\nFROM online_returns AS online_ret\nLEFT JOIN customer\n ON online_ret.customer_id = customer.customer_id\nLEFT JOIN online_survey\n ON online_ret.customer_id = online_survey.customer_id\n AND online_ret.return_id = online_survey.event_id;\n\nINSERT INTO all_returns (\n customer_id,\n customer_email,\n return_date,\n return_reason\n)\nSELECT\n customer_id,\n customer_email,\n return_date,\n return_reason\nFROM #stage_online_returns", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + } + } + } +}, +{ + "entityType": "query", + 
"entityUrn": "urn:li:query:composite_ad747ecae933492280d24dfa7f3a4ae3a3c67457e145803d05f7d8bd7efa7d17", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_returns,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_survey,PROD)" + } + ] + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:composite_638945c382e30206a8f8a57894d375e5f6f2a3562fe68480badf37e38e836d75", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE TABLE #stage_in_person_returns AS\nSELECT\n ipr.customer_id,\n customer.customer_email,\n ipr.return_date\nFROM in_person_returns AS ipr\nLEFT JOIN customer\n ON in_person_returns.customer_id = customer.customer_id;\n\nINSERT INTO all_returns (\n customer_id,\n customer_email,\n return_date\n)\nSELECT\n customer_id,\n customer_email,\n return_date\nFROM #stage_in_person_returns", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1707182625000, + "actor": "urn:li:corpuser:_ingestion" + } + } + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:composite_638945c382e30206a8f8a57894d375e5f6f2a3562fe68480badf37e38e836d75", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD)" + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index ab2fc0f005e76..826016d07317b 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -13,6 +13,7 @@ from datahub.sql_parsing.sql_parsing_common import QueryType from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, ColumnRef from tests.test_helpers import mce_helpers +from tests.test_helpers.click_helpers import run_datahub_cmd RESOURCE_DIR = pathlib.Path(__file__).parent / "aggregator_goldens" FROZEN_TIME = "2024-02-06 01:23:45" @@ -23,12 +24,13 @@ def _ts(ts: int) -> datetime: @freeze_time(FROZEN_TIME) -def test_basic_lineage(pytestconfig: pytest.Config) -> None: +def test_basic_lineage(pytestconfig: pytest.Config, tmp_path: pathlib.Path) -> None: aggregator = SqlParsingAggregator( platform="redshift", generate_lineage=True, generate_usage_statistics=False, generate_operations=False, + query_log=QueryLogSetting.STORE_ALL, ) aggregator.add_observed_query( @@ -45,6 +47,23 @@ def test_basic_lineage(pytestconfig: pytest.Config) -> None: golden_path=RESOURCE_DIR / "test_basic_lineage.json", ) + # This test also validates the query log storage functionality. 
+ aggregator.close() + query_log_db = aggregator.report.query_log_path + query_log_json = tmp_path / "query_log.json" + run_datahub_cmd( + [ + "check", + "extract-sql-agg-log", + str(query_log_db), + "--output", + str(query_log_json), + ] + ) + mce_helpers.check_golden_file( + pytestconfig, query_log_json, RESOURCE_DIR / "test_basic_lineage_query_log.json" + ) + @freeze_time(FROZEN_TIME) def test_overlapping_inserts(pytestconfig: pytest.Config) -> None: @@ -128,6 +147,127 @@ def test_temp_table(pytestconfig: pytest.Config) -> None: ) +@freeze_time(FROZEN_TIME) +def test_multistep_temp_table(pytestconfig: pytest.Config) -> None: + aggregator = SqlParsingAggregator( + platform="redshift", + generate_lineage=True, + generate_usage_statistics=False, + generate_operations=False, + ) + + aggregator.add_observed_query( + query="create table #temp1 as select a, 2*b as b from upstream1", + default_db="dev", + default_schema="public", + session_id="session1", + ) + aggregator.add_observed_query( + query="create table #temp2 as select b, c from upstream2", + default_db="dev", + default_schema="public", + session_id="session1", + ) + aggregator.add_observed_query( + query="create temp table staging_foo as select up1.a, up1.b, up2.c from #temp1 up1 left join #temp2 up2 on up1.b = up2.b where up1.b > 0", + default_db="dev", + default_schema="public", + session_id="session1", + ) + aggregator.add_observed_query( + query="insert into table prod_foo\nselect * from staging_foo", + default_db="dev", + default_schema="public", + session_id="session1", + ) + + mcps = list(aggregator.gen_metadata()) + + # Extra check to make sure that the report is populated correctly. + report = aggregator.report + assert len(report.queries_with_temp_upstreams) == 1 + assert ( + len( + report.queries_with_temp_upstreams[ + "composite_c89ee7c127c64a5d3a42ee875305087991891c80f42a25012910524bd2c77c45" + ] + ) + == 4 + ) + + mce_helpers.check_goldens_stream( + pytestconfig, + outputs=mcps, + golden_path=RESOURCE_DIR / "test_multistep_temp_table.json", + ) + + +@freeze_time(FROZEN_TIME) +def test_overlapping_inserts_from_temp_tables(pytestconfig: pytest.Config) -> None: + aggregator = SqlParsingAggregator( + platform="redshift", + generate_lineage=True, + generate_usage_statistics=False, + generate_operations=False, + ) + report = aggregator.report + + # The "all_returns" table is populated from "#stage_in_person_returns" and "#stage_online_returns". + # #stage_in_person_returns is populated from "in_person_returns" and "customer". + # #stage_online_returns is populated from "online_returns", "customer", and "online_survey". 
+ + aggregator.add_observed_query( + query="create table #stage_in_person_returns as select ipr.customer_id, customer.customer_email, ipr.return_date " + "from in_person_returns ipr " + "left join customer on in_person_returns.customer_id = customer.customer_id", + default_db="dev", + default_schema="public", + session_id="1234", + ) + + aggregator.add_observed_query( + query="create table #stage_online_returns as select online_ret.customer_id, customer.customer_email, online_ret.return_date, online_survey.return_reason " + "from online_returns online_ret " + "left join customer on online_ret.customer_id = customer.customer_id " + "left join online_survey on online_ret.customer_id = online_survey.customer_id and online_ret.return_id = online_survey.event_id", + default_db="dev", + default_schema="public", + session_id="2323", + ) + + aggregator.add_observed_query( + query="insert into all_returns (customer_id, customer_email, return_date) select customer_id, customer_email, return_date from #stage_in_person_returns", + default_db="dev", + default_schema="public", + session_id="1234", + ) + + aggregator.add_observed_query( + query="insert into all_returns (customer_id, customer_email, return_date, return_reason) select customer_id, customer_email, return_date, return_reason from #stage_online_returns", + default_db="dev", + default_schema="public", + session_id="2323", + ) + + # We only have one create temp table, but the same insert command from multiple sessions. + # This should get ignored. + assert len(report.queries_with_non_authoritative_session) == 0 + aggregator.add_observed_query( + query="insert into all_returns (customer_id, customer_email, return_date, return_reason) select customer_id, customer_email, return_date, return_reason from #stage_online_returns", + default_db="dev", + default_schema="public", + session_id="5435", + ) + assert len(report.queries_with_non_authoritative_session) == 1 + + mcps = list(aggregator.gen_metadata()) + mce_helpers.check_goldens_stream( + pytestconfig, + outputs=mcps, + golden_path=RESOURCE_DIR / "test_overlapping_inserts_from_temp_tables.json", + ) + + @freeze_time(FROZEN_TIME) def test_aggregate_operations(pytestconfig: pytest.Config) -> None: aggregator = SqlParsingAggregator( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java index 5fc3dfc779fa4..8aa27363e985d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java @@ -55,11 +55,13 @@ import org.opensearch.action.search.SearchResponse; import org.opensearch.client.RequestOptions; import org.opensearch.client.RestHighLevelClient; +import org.opensearch.common.lucene.search.function.CombineFunction; import org.opensearch.index.query.BoolQueryBuilder; import org.opensearch.index.query.QueryBuilder; import org.opensearch.index.query.QueryBuilders; import org.opensearch.search.SearchHit; import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.rescore.QueryRescorerBuilder; /** A search DAO for Elasticsearch backend. 
*/ @Slf4j @@ -116,6 +118,9 @@ private SearchResponse executeSearchQuery( searchSourceBuilder.size(count); searchSourceBuilder.query(query); + if (graphQueryConfiguration.isBoostViaNodes()) { + addViaNodeBoostQuery(searchSourceBuilder); + } searchRequest.source(searchSourceBuilder); @@ -457,7 +462,7 @@ private List getLineageRelationships( } @VisibleForTesting - public static QueryBuilder getLineageQuery( + public QueryBuilder getLineageQuery( @Nonnull Map> urnsPerEntityType, @Nonnull Map> edgesPerEntityType, @Nonnull GraphFilters graphFilters, @@ -497,7 +502,7 @@ public static QueryBuilder getLineageQuery( } @VisibleForTesting - public static QueryBuilder getLineageQueryForEntityType( + public QueryBuilder getLineageQueryForEntityType( @Nonnull List urns, @Nonnull List lineageEdges, @Nonnull GraphFilters graphFilters) { @@ -520,6 +525,25 @@ public static QueryBuilder getLineageQueryForEntityType( return query; } + /** + * Replaces score from initial lineage query against the graph index with score from whether a via + * edge exists or not. We don't currently sort the results for the graph query for anything else, + * we just do a straight filter, but this will need to be re-evaluated if we do. + * + * @param sourceBuilder source builder for the lineage query + */ + private void addViaNodeBoostQuery(final SearchSourceBuilder sourceBuilder) { + QueryBuilders.functionScoreQuery(QueryBuilders.existsQuery(EDGE_FIELD_VIA)) + .boostMode(CombineFunction.REPLACE); + QueryRescorerBuilder queryRescorerBuilder = + new QueryRescorerBuilder( + QueryBuilders.functionScoreQuery(QueryBuilders.existsQuery(EDGE_FIELD_VIA)) + .boostMode(CombineFunction.REPLACE)); + queryRescorerBuilder.windowSize( + graphQueryConfiguration.getMaxResult()); // Will rescore all results + sourceBuilder.addRescorer(queryRescorerBuilder); + } + /** * Adds an individual relationship edge to a running set of unique paths to each node in the * graph. 
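For readers unfamiliar with the rescore API used in the ESGraphQueryDAO hunk above: the new addViaNodeBoostQuery wires an OpenSearch query rescorer that swaps the constant, filter-derived score for a function_score over an exists check on the via field, so edges that carry a via node surface first. Below is a minimal, self-contained sketch of that pattern, not the project's own method; it assumes the standard OpenSearch high-level client types already imported in the hunk, and EDGE_FIELD_VIA is stood in by a local constant rather than the real one from the edge document model.

import org.opensearch.common.lucene.search.function.CombineFunction;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.search.builder.SearchSourceBuilder;
import org.opensearch.search.rescore.QueryRescorerBuilder;

public final class ViaBoostSketch {
  // Stand-in for the graph index's "via" edge field; the real constant lives with the Edge document model.
  private static final String EDGE_FIELD_VIA = "via";

  // Rescores the top windowSize hits so that edges with a via node rank ahead of plain edges.
  static SearchSourceBuilder withViaBoost(SearchSourceBuilder source, int windowSize) {
    QueryRescorerBuilder rescorer =
        new QueryRescorerBuilder(
            QueryBuilders.functionScoreQuery(QueryBuilders.existsQuery(EDGE_FIELD_VIA))
                // REPLACE drops the original score and keeps only the exists-based boost.
                .boostMode(CombineFunction.REPLACE));
    // e.g. pass graphQueryConfiguration.getMaxResult() here so every hit is rescored.
    rescorer.windowSize(windowSize);
    return source.addRescorer(rescorer);
  }
}

Because the base lineage query is a pure filter, all hits start with the same score, so the rescorer effectively partitions results into via-backed edges first and plain edges second; as the method's Javadoc notes, the boost mode would need revisiting if real scoring were ever added to the base query.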
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index 90f46190ac18e..0235edbcd30cb 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -36,12 +36,8 @@ import com.linkedin.structured.StructuredPropertyDefinition; import io.opentelemetry.extension.annotations.WithSpan; import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Base64; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -67,8 +63,6 @@ public class ElasticSearchGraphService implements GraphService, ElasticSearchInd private final ESGraphWriteDAO _graphWriteDAO; private final ESGraphQueryDAO _graphReadDAO; private final ESIndexBuilder _indexBuilder; - - private static final String DOC_DELIMETER = "--"; public static final String INDEX_NAME = "graph_service_v1"; private static final Map EMPTY_HASH = new HashMap<>(); @@ -123,25 +117,6 @@ private String toDocument(@Nonnull final Edge edge) { return searchDocument.toString(); } - private String toDocId(@Nonnull final Edge edge) { - String rawDocId = - edge.getSource().toString() - + DOC_DELIMETER - + edge.getRelationshipType() - + DOC_DELIMETER - + edge.getDestination().toString(); - - try { - byte[] bytesOfRawDocID = rawDocId.getBytes(StandardCharsets.UTF_8); - MessageDigest md = MessageDigest.getInstance("MD5"); - byte[] thedigest = md.digest(bytesOfRawDocID); - return Base64.getEncoder().encodeToString(thedigest); - } catch (NoSuchAlgorithmException e) { - e.printStackTrace(); - return rawDocId; - } - } - @Override public LineageRegistry getLineageRegistry() { return _lineageRegistry; @@ -149,7 +124,7 @@ public LineageRegistry getLineageRegistry() { @Override public void addEdge(@Nonnull final Edge edge) { - String docId = toDocId(edge); + String docId = edge.toDocId(); String edgeDocument = toDocument(edge); _graphWriteDAO.upsertDocument(docId, edgeDocument); } @@ -161,7 +136,7 @@ public void upsertEdge(@Nonnull final Edge edge) { @Override public void removeEdge(@Nonnull final Edge edge) { - String docId = toDocId(edge); + String docId = edge.toDocId(); _graphWriteDAO.deleteDocument(docId); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java index 2de61c8ed31bb..ca5fbfcd27a28 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java @@ -92,11 +92,25 @@ public int compare(RelatedEntity left, RelatedEntity right) { protected static String datasetFiveUrnString = "urn:li:" + datasetType + ":(urn:li:dataPlatform:type,SampleDatasetFive,PROD)"; + protected static final String schemaFieldUrnOneString = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:type,SampleDatasetFive,PROD),fieldOne)"; + protected static final String schemaFieldUrnTwoString = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:type,SampleDatasetFour,PROD),fieldTwo)"; + + protected static final String lifeCycleOwnerOneString = + 
"urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)"; + protected static final String lifeCycleOwnerTwoString = + "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"; + protected static Urn datasetOneUrn = createFromString(datasetOneUrnString); protected static Urn datasetTwoUrn = createFromString(datasetTwoUrnString); protected static Urn datasetThreeUrn = createFromString(datasetThreeUrnString); protected static Urn datasetFourUrn = createFromString(datasetFourUrnString); protected static Urn datasetFiveUrn = createFromString(datasetFiveUrnString); + protected static final Urn schemaFieldUrnOne = createFromString(schemaFieldUrnOneString); + protected static final Urn schemaFieldUrnTwo = createFromString(schemaFieldUrnTwoString); + protected static final Urn lifeCycleOwnerOne = createFromString(lifeCycleOwnerOneString); + protected static final Urn lifeCycleOwnerTwo = createFromString(lifeCycleOwnerTwoString); protected static String unknownUrnString = "urn:li:unknown:(urn:li:unknown:Unknown)"; @@ -139,6 +153,14 @@ public int compare(RelatedEntity left, RelatedEntity right) { new RelatedEntity(downstreamOf, datasetThreeUrnString); protected static RelatedEntity downstreamOfDatasetFourRelatedEntity = new RelatedEntity(downstreamOf, datasetFourUrnString); + protected static final RelatedEntity downstreamOfSchemaFieldOneVia = + new RelatedEntity(downstreamOf, schemaFieldUrnOneString, lifeCycleOwnerOneString); + protected static final RelatedEntity downstreamOfSchemaFieldOne = + new RelatedEntity(downstreamOf, schemaFieldUrnOneString); + protected static final RelatedEntity downstreamOfSchemaFieldTwoVia = + new RelatedEntity(downstreamOf, schemaFieldUrnTwoString, lifeCycleOwnerOneString); + protected static final RelatedEntity downstreamOfSchemaFieldTwo = + new RelatedEntity(downstreamOf, schemaFieldUrnTwoString); protected static RelatedEntity hasOwnerDatasetOneRelatedEntity = new RelatedEntity(hasOwner, datasetOneUrnString); @@ -244,7 +266,29 @@ protected GraphService getPopulatedGraphService() throws Exception { new Edge(datasetThreeUrn, userTwoUrn, hasOwner, null, null, null, null, null), new Edge(datasetFourUrn, userTwoUrn, hasOwner, null, null, null, null, null), new Edge(userOneUrn, userTwoUrn, knowsUser, null, null, null, null, null), - new Edge(userTwoUrn, userOneUrn, knowsUser, null, null, null, null, null)); + new Edge(userTwoUrn, userOneUrn, knowsUser, null, null, null, null, null), + new Edge( + schemaFieldUrnOne, + schemaFieldUrnTwo, + downstreamOf, + 0L, + null, + 0L, + null, + null, + lifeCycleOwnerOne, + lifeCycleOwnerOne), + new Edge( + schemaFieldUrnOne, + schemaFieldUrnTwo, + downstreamOf, + 0L, + null, + 0L, + null, + null, + lifeCycleOwnerTwo, + null)); edges.forEach(service::addEdge); syncAfterWrite(); @@ -412,12 +456,14 @@ public void testPopulatedGraphService() throws Exception { outgoingRelationships, 0, 100); + // All downstreamOf, hasOwner, or knowsUser relationships, outgoing assertEqualsAnyOrder( relatedOutgoingEntitiesBeforeRemove, Arrays.asList( downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity, hasOwnerUserOneRelatedEntity, hasOwnerUserTwoRelatedEntity, - knowsUserOneRelatedEntity, knowsUserTwoRelatedEntity)); + knowsUserOneRelatedEntity, knowsUserTwoRelatedEntity, + downstreamOfSchemaFieldTwoVia, downstreamOfSchemaFieldTwo)); RelatedEntitiesResult relatedIncomingEntitiesBeforeRemove = service.findRelatedEntities( anyType, @@ -428,6 +474,7 @@ public void testPopulatedGraphService() throws 
Exception { incomingRelationships, 0, 100); + // All downstreamOf, hasOwner, or knowsUser relationships, incoming assertEqualsAnyOrder( relatedIncomingEntitiesBeforeRemove, Arrays.asList( @@ -439,7 +486,44 @@ public void testPopulatedGraphService() throws Exception { hasOwnerDatasetThreeRelatedEntity, hasOwnerDatasetFourRelatedEntity, knowsUserOneRelatedEntity, - knowsUserTwoRelatedEntity)); + knowsUserTwoRelatedEntity, + downstreamOfSchemaFieldOneVia, + downstreamOfSchemaFieldOne)); + EntityLineageResult viaNodeResult = + service.getLineage( + schemaFieldUrnOne, + LineageDirection.UPSTREAM, + new GraphFilters(List.of("schemaField")), + 0, + 1000, + 100, + null, + null); + // Multi-path enabled + assertEquals(viaNodeResult.getRelationships().size(), 2); + // First one is via node + assertTrue( + viaNodeResult.getRelationships().get(0).getPaths().get(0).contains(lifeCycleOwnerOne)); + EntityLineageResult viaNodeResultNoMulti = + getGraphService(false) + .getLineage( + schemaFieldUrnOne, + LineageDirection.UPSTREAM, + new GraphFilters(List.of("schemaField")), + 0, + 1000, + 100, + null, + null); + + // Multi-path disabled, still has two because via flow creates both edges in response + assertEquals(viaNodeResultNoMulti.getRelationships().size(), 2); + // First one is via node + assertTrue( + viaNodeResult.getRelationships().get(0).getPaths().get(0).contains(lifeCycleOwnerOne)); + + // reset graph service + getGraphService(); } @Test @@ -685,12 +769,18 @@ private void doTestFindRelatedEntities( @DataProvider(name = "FindRelatedEntitiesSourceTypeTests") public Object[][] getFindRelatedEntitiesSourceTypeTests() { return new Object[][] { + // All DownstreamOf relationships, outgoing new Object[] { null, Arrays.asList(downstreamOf), outgoingRelationships, - Arrays.asList(downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity) + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + downstreamOfSchemaFieldTwoVia, + downstreamOfSchemaFieldTwo) }, + // All DownstreamOf relationships, incoming new Object[] { null, Arrays.asList(downstreamOf), @@ -698,15 +788,20 @@ public Object[][] getFindRelatedEntitiesSourceTypeTests() { Arrays.asList( downstreamOfDatasetTwoRelatedEntity, downstreamOfDatasetThreeRelatedEntity, - downstreamOfDatasetFourRelatedEntity) + downstreamOfDatasetFourRelatedEntity, + downstreamOfSchemaFieldOneVia, + downstreamOfSchemaFieldOne) }, + // All DownstreamOf relationships, both directions new Object[] { null, Arrays.asList(downstreamOf), undirectedRelationships, Arrays.asList( downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity, - downstreamOfDatasetThreeRelatedEntity, downstreamOfDatasetFourRelatedEntity) + downstreamOfDatasetThreeRelatedEntity, downstreamOfDatasetFourRelatedEntity, + downstreamOfSchemaFieldTwoVia, downstreamOfSchemaFieldTwo, + downstreamOfSchemaFieldOneVia, downstreamOfSchemaFieldOne) }, // "" used to be any type before v0.9.0, which is now encoded by null @@ -789,16 +884,24 @@ public Object[][] getFindRelatedEntitiesDestinationTypeTests() { null, Arrays.asList(downstreamOf), outgoingRelationships, - Arrays.asList(downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity) + // All DownstreamOf relationships, outgoing + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + downstreamOfSchemaFieldTwoVia, + downstreamOfSchemaFieldTwo) }, new Object[] { null, Arrays.asList(downstreamOf), incomingRelationships, + // All DownstreamOf 
relationships, incoming Arrays.asList( downstreamOfDatasetTwoRelatedEntity, downstreamOfDatasetThreeRelatedEntity, - downstreamOfDatasetFourRelatedEntity) + downstreamOfDatasetFourRelatedEntity, + downstreamOfSchemaFieldOneVia, + downstreamOfSchemaFieldOne) }, new Object[] { null, @@ -806,7 +909,9 @@ public Object[][] getFindRelatedEntitiesDestinationTypeTests() { undirectedRelationships, Arrays.asList( downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity, - downstreamOfDatasetThreeRelatedEntity, downstreamOfDatasetFourRelatedEntity) + downstreamOfDatasetThreeRelatedEntity, downstreamOfDatasetFourRelatedEntity, + downstreamOfSchemaFieldOneVia, downstreamOfSchemaFieldOne, + downstreamOfSchemaFieldTwoVia, downstreamOfSchemaFieldTwo) }, new Object[] { "", Arrays.asList(downstreamOf), outgoingRelationships, Collections.emptyList() @@ -1035,12 +1140,14 @@ public void testFindRelatedEntitiesRelationshipTypes() throws Exception { outgoingRelationships, 0, 100); + // All DownstreamOf relationships, outgoing (destination) assertEqualsAnyOrder( allOutgoingRelatedEntities, Arrays.asList( downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity, hasOwnerUserOneRelatedEntity, hasOwnerUserTwoRelatedEntity, - knowsUserOneRelatedEntity, knowsUserTwoRelatedEntity)); + knowsUserOneRelatedEntity, knowsUserTwoRelatedEntity, + downstreamOfSchemaFieldTwoVia, downstreamOfSchemaFieldTwo)); RelatedEntitiesResult allIncomingRelatedEntities = service.findRelatedEntities( @@ -1052,6 +1159,7 @@ public void testFindRelatedEntitiesRelationshipTypes() throws Exception { incomingRelationships, 0, 100); + // All DownstreamOf relationships, incoming (source) assertEqualsAnyOrder( allIncomingRelatedEntities, Arrays.asList( @@ -1063,7 +1171,9 @@ public void testFindRelatedEntitiesRelationshipTypes() throws Exception { hasOwnerDatasetThreeRelatedEntity, hasOwnerDatasetFourRelatedEntity, knowsUserOneRelatedEntity, - knowsUserTwoRelatedEntity)); + knowsUserTwoRelatedEntity, + downstreamOfSchemaFieldOneVia, + downstreamOfSchemaFieldOne)); RelatedEntitiesResult allUnknownRelationshipTypeRelatedEntities = service.findRelatedEntities( @@ -1087,9 +1197,14 @@ public void testFindRelatedEntitiesRelationshipTypes() throws Exception { outgoingRelationships, 0, 100); + // All DownstreamOf relationships, outgoing (destination) assertEqualsAnyOrder( someUnknownRelationshipTypeRelatedEntities, - Arrays.asList(downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity)); + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + downstreamOfSchemaFieldTwoVia, + downstreamOfSchemaFieldTwo)); } @Test @@ -1517,6 +1632,7 @@ public void testRemoveNode() throws Exception { syncAfterWrite(); // assert the modified graph + // All downstreamOf, hasOwner, knowsUser relationships minus datasetTwo's, outgoing assertEqualsAnyOrder( service.findRelatedEntities( anyType, @@ -1529,7 +1645,8 @@ public void testRemoveNode() throws Exception { 100), Arrays.asList( hasOwnerUserOneRelatedEntity, hasOwnerUserTwoRelatedEntity, - knowsUserOneRelatedEntity, knowsUserTwoRelatedEntity)); + knowsUserOneRelatedEntity, knowsUserTwoRelatedEntity, + downstreamOfSchemaFieldTwoVia, downstreamOfSchemaFieldTwo)); } @Test diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBaseNoVia.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBaseNoVia.java new file mode 100644 index 0000000000000..19ca2e85e8c54 --- /dev/null +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBaseNoVia.java @@ -0,0 +1,386 @@ +package com.linkedin.metadata.graph; + +import static com.linkedin.metadata.search.utils.QueryUtils.*; +import static org.testng.Assert.*; + +import java.util.Arrays; +import java.util.Collections; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public abstract class GraphServiceTestBaseNoVia extends GraphServiceTestBase { + + @DataProvider(name = "NoViaFindRelatedEntitiesDestinationTypeTests") + public Object[][] getNoViaFindRelatedEntitiesDestinationTypeTests() { + return new Object[][] { + new Object[] { + null, + Arrays.asList(downstreamOf), + outgoingRelationships, + // All DownstreamOf relationships, outgoing + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + // TODO: Via not supported in Neo4J and DGraph + downstreamOfSchemaFieldTwo) + }, + new Object[] { + null, + Arrays.asList(downstreamOf), + incomingRelationships, + // All DownstreamOf relationships, incoming + Arrays.asList( + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + downstreamOfDatasetFourRelatedEntity, + // TODO: Via not supported in Neo4J and DGraph + downstreamOfSchemaFieldOne) + }, + new Object[] { + null, + Arrays.asList(downstreamOf), + undirectedRelationships, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + downstreamOfDatasetFourRelatedEntity, + // TODO: Via not supported in Neo4J and DGraph + downstreamOfSchemaFieldOne, + downstreamOfSchemaFieldTwo) + }, + new Object[] { + "", Arrays.asList(downstreamOf), outgoingRelationships, Collections.emptyList() + }, + new Object[] { + "", Arrays.asList(downstreamOf), incomingRelationships, Collections.emptyList() + }, + new Object[] { + "", Arrays.asList(downstreamOf), undirectedRelationships, Collections.emptyList() + }, + new Object[] { + datasetType, + Arrays.asList(downstreamOf), + outgoingRelationships, + Arrays.asList(downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity) + }, + new Object[] { + datasetType, + Arrays.asList(downstreamOf), + incomingRelationships, + Arrays.asList( + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + downstreamOfDatasetFourRelatedEntity) + }, + new Object[] { + datasetType, + Arrays.asList(downstreamOf), + undirectedRelationships, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, downstreamOfDatasetFourRelatedEntity) + }, + new Object[] {datasetType, Arrays.asList(hasOwner), outgoingRelationships, Arrays.asList()}, + new Object[] { + datasetType, + Arrays.asList(hasOwner), + incomingRelationships, + Arrays.asList( + hasOwnerDatasetOneRelatedEntity, hasOwnerDatasetTwoRelatedEntity, + hasOwnerDatasetThreeRelatedEntity, hasOwnerDatasetFourRelatedEntity) + }, + new Object[] { + datasetType, + Arrays.asList(hasOwner), + undirectedRelationships, + Arrays.asList( + hasOwnerDatasetOneRelatedEntity, hasOwnerDatasetTwoRelatedEntity, + hasOwnerDatasetThreeRelatedEntity, hasOwnerDatasetFourRelatedEntity) + }, + new Object[] { + userType, + Arrays.asList(hasOwner), + outgoingRelationships, + Arrays.asList(hasOwnerUserOneRelatedEntity, hasOwnerUserTwoRelatedEntity) + }, + new Object[] {userType, Arrays.asList(hasOwner), incomingRelationships, Arrays.asList()}, + new Object[] { + userType, + 
Arrays.asList(hasOwner), + undirectedRelationships, + Arrays.asList(hasOwnerUserOneRelatedEntity, hasOwnerUserTwoRelatedEntity) + } + }; + } + + @DataProvider(name = "NoViaFindRelatedEntitiesSourceTypeTests") + public Object[][] getNoViaFindRelatedEntitiesSourceTypeTests() { + return new Object[][] { + // All DownstreamOf relationships, outgoing + new Object[] { + null, + Arrays.asList(downstreamOf), + outgoingRelationships, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + // TODO: DGraph and Neo4J do not support via + downstreamOfSchemaFieldTwo) + }, + // All DownstreamOf relationships, incoming + new Object[] { + null, + Arrays.asList(downstreamOf), + incomingRelationships, + Arrays.asList( + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + downstreamOfDatasetFourRelatedEntity, + // TODO: DGraph and Neo4J do not support via + downstreamOfSchemaFieldOne) + }, + // All DownstreamOf relationships, both directions + new Object[] { + null, + Arrays.asList(downstreamOf), + undirectedRelationships, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + downstreamOfDatasetFourRelatedEntity, + // TODO: DGraph and Neo4J do not support via + downstreamOfSchemaFieldTwo, + downstreamOfSchemaFieldOne) + }, + + // "" used to be any type before v0.9.0, which is now encoded by null + new Object[] { + "", Arrays.asList(downstreamOf), outgoingRelationships, Collections.emptyList() + }, + new Object[] { + "", Arrays.asList(downstreamOf), incomingRelationships, Collections.emptyList() + }, + new Object[] { + "", Arrays.asList(downstreamOf), undirectedRelationships, Collections.emptyList() + }, + new Object[] { + datasetType, + Arrays.asList(downstreamOf), + outgoingRelationships, + Arrays.asList(downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity) + }, + new Object[] { + datasetType, + Arrays.asList(downstreamOf), + incomingRelationships, + Arrays.asList( + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + downstreamOfDatasetFourRelatedEntity) + }, + new Object[] { + datasetType, + Arrays.asList(downstreamOf), + undirectedRelationships, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, downstreamOfDatasetFourRelatedEntity) + }, + new Object[] {userType, Arrays.asList(downstreamOf), outgoingRelationships, Arrays.asList()}, + new Object[] {userType, Arrays.asList(downstreamOf), incomingRelationships, Arrays.asList()}, + new Object[] { + userType, Arrays.asList(downstreamOf), undirectedRelationships, Arrays.asList() + }, + new Object[] {userType, Arrays.asList(hasOwner), outgoingRelationships, Arrays.asList()}, + new Object[] { + userType, + Arrays.asList(hasOwner), + incomingRelationships, + Arrays.asList( + hasOwnerDatasetOneRelatedEntity, hasOwnerDatasetTwoRelatedEntity, + hasOwnerDatasetThreeRelatedEntity, hasOwnerDatasetFourRelatedEntity) + }, + new Object[] { + userType, + Arrays.asList(hasOwner), + undirectedRelationships, + Arrays.asList( + hasOwnerDatasetOneRelatedEntity, hasOwnerDatasetTwoRelatedEntity, + hasOwnerDatasetThreeRelatedEntity, hasOwnerDatasetFourRelatedEntity) + } + }; + } + + @Test + @Override + public void testFindRelatedEntitiesRelationshipTypes() throws Exception { + GraphService service = getPopulatedGraphService(); + + RelatedEntitiesResult allOutgoingRelatedEntities = + 
service.findRelatedEntities( + anyType, + EMPTY_FILTER, + anyType, + EMPTY_FILTER, + Arrays.asList(downstreamOf, hasOwner, knowsUser), + outgoingRelationships, + 0, + 100); + // All DownstreamOf relationships, outgoing (destination) + assertEqualsAnyOrder( + allOutgoingRelatedEntities, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + hasOwnerUserOneRelatedEntity, + hasOwnerUserTwoRelatedEntity, + knowsUserOneRelatedEntity, + knowsUserTwoRelatedEntity, + // TODO: DGraph and Neo4J do not support via + downstreamOfSchemaFieldTwo)); + + RelatedEntitiesResult allIncomingRelatedEntities = + service.findRelatedEntities( + anyType, + EMPTY_FILTER, + anyType, + EMPTY_FILTER, + Arrays.asList(downstreamOf, hasOwner, knowsUser), + incomingRelationships, + 0, + 100); + // All DownstreamOf relationships, incoming (source) + assertEqualsAnyOrder( + allIncomingRelatedEntities, + Arrays.asList( + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + downstreamOfDatasetFourRelatedEntity, + hasOwnerDatasetOneRelatedEntity, + hasOwnerDatasetTwoRelatedEntity, + hasOwnerDatasetThreeRelatedEntity, + hasOwnerDatasetFourRelatedEntity, + knowsUserOneRelatedEntity, + knowsUserTwoRelatedEntity, + // TODO: DGraph and Neo4J do not support via + downstreamOfSchemaFieldOne)); + + RelatedEntitiesResult allUnknownRelationshipTypeRelatedEntities = + service.findRelatedEntities( + anyType, + EMPTY_FILTER, + anyType, + EMPTY_FILTER, + Arrays.asList("unknownRelationshipType", "unseenRelationshipType"), + outgoingRelationships, + 0, + 100); + assertEqualsAnyOrder(allUnknownRelationshipTypeRelatedEntities, Collections.emptyList()); + + RelatedEntitiesResult someUnknownRelationshipTypeRelatedEntities = + service.findRelatedEntities( + anyType, + EMPTY_FILTER, + anyType, + EMPTY_FILTER, + Arrays.asList("unknownRelationshipType", downstreamOf), + outgoingRelationships, + 0, + 100); + // All DownstreamOf relationships, outgoing (destination) + assertEqualsAnyOrder( + someUnknownRelationshipTypeRelatedEntities, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + // TODO: DGraph and Neo4J do not support via + downstreamOfSchemaFieldTwo)); + } + + @Test + @Override + public void testPopulatedGraphService() throws Exception { + GraphService service = getPopulatedGraphService(); + + RelatedEntitiesResult relatedOutgoingEntitiesBeforeRemove = + service.findRelatedEntities( + anyType, + EMPTY_FILTER, + anyType, + EMPTY_FILTER, + Arrays.asList(downstreamOf, hasOwner, knowsUser), + outgoingRelationships, + 0, + 100); + // All downstreamOf, hasOwner, or knowsUser relationships, outgoing + assertEqualsAnyOrder( + relatedOutgoingEntitiesBeforeRemove, + Arrays.asList( + downstreamOfDatasetOneRelatedEntity, + downstreamOfDatasetTwoRelatedEntity, + hasOwnerUserOneRelatedEntity, + hasOwnerUserTwoRelatedEntity, + knowsUserOneRelatedEntity, + knowsUserTwoRelatedEntity, + // TODO: DGraph and Neo4j do not support via + downstreamOfSchemaFieldTwo)); + RelatedEntitiesResult relatedIncomingEntitiesBeforeRemove = + service.findRelatedEntities( + anyType, + EMPTY_FILTER, + anyType, + EMPTY_FILTER, + Arrays.asList(downstreamOf, hasOwner, knowsUser), + incomingRelationships, + 0, + 100); + // All downstreamOf, hasOwner, or knowsUser relationships, incoming + assertEqualsAnyOrder( + relatedIncomingEntitiesBeforeRemove, + Arrays.asList( + downstreamOfDatasetTwoRelatedEntity, + downstreamOfDatasetThreeRelatedEntity, + 
downstreamOfDatasetFourRelatedEntity, + hasOwnerDatasetOneRelatedEntity, + hasOwnerDatasetTwoRelatedEntity, + hasOwnerDatasetThreeRelatedEntity, + hasOwnerDatasetFourRelatedEntity, + knowsUserOneRelatedEntity, + knowsUserTwoRelatedEntity, + // TODO: DGraph and Neo4j do not support via + downstreamOfSchemaFieldOne)); + // TODO: DGraph and Neo4j do not support via + // No checking of split via edge + } + + @Test + @Override + public void testRemoveNode() throws Exception { + GraphService service = getPopulatedGraphService(); + + service.removeNode(datasetTwoUrn); + syncAfterWrite(); + + // assert the modified graph + // All downstreamOf, hasOwner, knowsUser relationships minus datasetTwo's, outgoing + assertEqualsAnyOrder( + service.findRelatedEntities( + anyType, + EMPTY_FILTER, + anyType, + EMPTY_FILTER, + Arrays.asList(downstreamOf, hasOwner, knowsUser), + outgoingRelationships, + 0, + 100), + Arrays.asList( + hasOwnerUserOneRelatedEntity, + hasOwnerUserTwoRelatedEntity, + knowsUserOneRelatedEntity, + knowsUserTwoRelatedEntity, + // TODO: DGraph and Neo4j do not support via + downstreamOfSchemaFieldTwo)); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java index 1ccf018a74c3a..7e683502dd958 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java @@ -9,11 +9,12 @@ import com.google.common.collect.ImmutableList; import com.linkedin.metadata.graph.GraphService; -import com.linkedin.metadata.graph.GraphServiceTestBase; +import com.linkedin.metadata.graph.GraphServiceTestBaseNoVia; import com.linkedin.metadata.graph.RelatedEntity; import com.linkedin.metadata.models.registry.LineageRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; import com.linkedin.metadata.query.filter.RelationshipDirection; +import com.linkedin.metadata.query.filter.RelationshipFilter; import io.dgraph.DgraphClient; import io.dgraph.DgraphGrpc; import io.grpc.CallOptions; @@ -28,6 +29,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; import javax.annotation.Nonnull; @@ -41,7 +43,7 @@ @SuppressWarnings("ArraysAsListWithZeroOrOneArgument") @Slf4j -public class DgraphGraphServiceTest extends GraphServiceTestBase { +public class DgraphGraphServiceTest extends GraphServiceTestBaseNoVia { private ManagedChannel _channel; private DgraphGraphService _service; @@ -823,4 +825,28 @@ public void testGetDestinationUrnsFromResponseData() { public void testPopulatedGraphServiceGetLineageMultihop(boolean attemptMultiHop) { // TODO: Remove this overridden method once the multihop for dGraph is implemented! 
} + + @Override + @Test(dataProvider = "NoViaFindRelatedEntitiesDestinationTypeTests") + public void testFindRelatedEntitiesDestinationType( + String datasetType, + List relationshipTypes, + RelationshipFilter relationships, + List expectedRelatedEntities) + throws Exception { + super.testFindRelatedEntitiesDestinationType( + datasetType, relationshipTypes, relationships, expectedRelatedEntities); + } + + @Override + @Test(dataProvider = "NoViaFindRelatedEntitiesSourceTypeTests") + public void testFindRelatedEntitiesSourceType( + String datasetType, + List relationshipTypes, + RelationshipFilter relationships, + List expectedRelatedEntities) + throws Exception { + super.testFindRelatedEntitiesSourceType( + datasetType, relationshipTypes, relationships, expectedRelatedEntities); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java index f1113368601c6..cff79618b8e09 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java @@ -12,6 +12,7 @@ import com.linkedin.metadata.graph.EntityLineageResult; import com.linkedin.metadata.graph.GraphService; import com.linkedin.metadata.graph.GraphServiceTestBase; +import com.linkedin.metadata.graph.GraphServiceTestBaseNoVia; import com.linkedin.metadata.graph.LineageDirection; import com.linkedin.metadata.graph.RelatedEntitiesResult; import com.linkedin.metadata.graph.RelatedEntity; @@ -35,7 +36,7 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; -public class Neo4jGraphServiceTest extends GraphServiceTestBase { +public class Neo4jGraphServiceTest extends GraphServiceTestBaseNoVia { private Neo4jTestServerBuilder _serverBuilder; private Driver _driver; @@ -90,6 +91,7 @@ protected void assertEqualsAnyOrder( } @Override + @Test(dataProvider = "NoViaFindRelatedEntitiesSourceTypeTests") public void testFindRelatedEntitiesSourceType( String datasetType, List relationshipTypes, @@ -110,6 +112,7 @@ public void testFindRelatedEntitiesSourceType( } @Override + @Test(dataProvider = "NoViaFindRelatedEntitiesDestinationTypeTests") public void testFindRelatedEntitiesDestinationType( String datasetType, List relationshipTypes, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java index 5b7f880e6d83a..8ae2725b749d1 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java @@ -7,6 +7,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.metadata.Constants; +import com.linkedin.metadata.config.search.GraphQueryConfiguration; import com.linkedin.metadata.graph.GraphFilters; import com.linkedin.metadata.graph.elastic.ESGraphQueryDAO; import com.linkedin.metadata.models.registry.LineageRegistry; @@ -99,19 +100,21 @@ private static void testGetQueryForLineageFullArguments() throws Exception { Long startTime = 0L; Long endTime = 1L; + ESGraphQueryDAO graphQueryDAO = + new ESGraphQueryDAO(null, null, null, new GraphQueryConfiguration()); QueryBuilder limitedBuilder = - ESGraphQueryDAO.getLineageQueryForEntityType(urns, edgeInfos, graphFilters); + 
graphQueryDAO.getLineageQueryForEntityType(urns, edgeInfos, graphFilters); QueryBuilder fullBuilder = - ESGraphQueryDAO.getLineageQuery( + graphQueryDAO.getLineageQuery( urnsPerEntityType, edgesPerEntityType, graphFilters, startTime, endTime); QueryBuilder fullBuilderEmptyFilters = - ESGraphQueryDAO.getLineageQuery( + graphQueryDAO.getLineageQuery( urnsPerEntityType, edgesPerEntityType, GraphFilters.emptyGraphFilters, null, null); QueryBuilder fullBuilderMultipleFilters = - ESGraphQueryDAO.getLineageQuery( + graphQueryDAO.getLineageQuery( urnsPerEntityTypeMultiple, edgesPerEntityTypeMultiple, graphFiltersMultiple, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java index 71f247ebfc29a..8c184055a6b0d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java @@ -20,7 +20,10 @@ import com.linkedin.metadata.graph.elastic.ESGraphQueryDAO; import com.linkedin.metadata.graph.elastic.ESGraphWriteDAO; import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; +import com.linkedin.metadata.models.registry.ConfigEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.models.registry.LineageRegistry; +import com.linkedin.metadata.models.registry.MergedEntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipDirection; @@ -30,6 +33,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import io.datahubproject.test.search.SearchTestUtils; +import io.datahubproject.test.search.config.SearchCommonTestConfiguration; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; @@ -76,7 +80,20 @@ public void wipe() throws Exception { @Nonnull private ElasticSearchGraphService buildService(boolean enableMultiPathSearch) { - LineageRegistry lineageRegistry = new LineageRegistry(SnapshotEntityRegistry.getInstance()); + ConfigEntityRegistry configEntityRegistry = + new ConfigEntityRegistry( + SearchCommonTestConfiguration.class + .getClassLoader() + .getResourceAsStream("entity-registry.yml")); + SnapshotEntityRegistry snapshotEntityRegistry = SnapshotEntityRegistry.getInstance(); + LineageRegistry lineageRegistry; + try { + MergedEntityRegistry mergedEntityRegistry = + new MergedEntityRegistry(snapshotEntityRegistry).apply(configEntityRegistry); + lineageRegistry = new LineageRegistry(mergedEntityRegistry); + } catch (EntityRegistryException e) { + throw new RuntimeException(e); + } GraphQueryConfiguration configuration = GraphQueryConfiguration.testDefaults; configuration.setEnableMultiPathSearch(enableMultiPathSearch); ESGraphQueryDAO readDAO = diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java index a26c886c6eaf7..8f752376ef2d9 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java +++ 
b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java @@ -108,7 +108,10 @@ public boolean isEnabled() { public void invoke(@Nonnull MetadataChangeLog event) { if (enabled && isEligibleForProcessing(event)) { - log.info("Urn {} received by Sibling Hook.", event.getEntityUrn()); + log.info( + "Urn {} with aspect {} received by Sibling Hook.", + event.getEntityUrn(), + event.getAspectName()); final Urn urn = getUrnFromEvent(event); diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java index cae67108b4ca0..ddfcf4b72776e 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java @@ -140,10 +140,23 @@ public void testFineGrainedLineageEdgesAreAdded() throws Exception { Urn downstreamUrn = UrnUtils.getUrn( "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD),field_foo)"); + Urn lifeCycleOwner = + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)"); MetadataChangeLog event = createUpstreamLineageMCL(upstreamUrn, downstreamUrn); updateIndicesHook.invoke(event); - Edge edge = new Edge(downstreamUrn, upstreamUrn, DOWNSTREAM_OF, null, null, null, null, null); + Edge edge = + new Edge( + downstreamUrn, + upstreamUrn, + DOWNSTREAM_OF, + null, + null, + null, + null, + null, + lifeCycleOwner, + null); Mockito.verify(mockGraphService, Mockito.times(1)).addEdge(Mockito.eq(edge)); Mockito.verify(mockGraphService, Mockito.times(1)) .removeEdgesFromNode( @@ -164,11 +177,24 @@ public void testFineGrainedLineageEdgesAreAddedRestate() throws Exception { Urn downstreamUrn = UrnUtils.getUrn( "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD),field_foo)"); + Urn lifeCycleOwner = + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)"); MetadataChangeLog event = createUpstreamLineageMCL(upstreamUrn, downstreamUrn, ChangeType.RESTATE); updateIndicesHook.invoke(event); - Edge edge = new Edge(downstreamUrn, upstreamUrn, DOWNSTREAM_OF, null, null, null, null, null); + Edge edge = + new Edge( + downstreamUrn, + upstreamUrn, + DOWNSTREAM_OF, + null, + null, + null, + null, + null, + lifeCycleOwner, + null); Mockito.verify(mockGraphService, Mockito.times(1)).addEdge(Mockito.eq(edge)); Mockito.verify(mockGraphService, Mockito.times(1)) .removeEdgesFromNode( diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java index 4da50f47e2feb..cd869a61bf3ab 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java @@ -12,6 +12,12 @@ public class GraphQueryConfiguration { // will return all paths between the source and destination nodes within the hops limit. 
private boolean enableMultiPathSearch; + /** + * Adds a boosting query when a via node is present on a lineage search hit, allowing such hits + * to be prioritized when multiple paths exist and multi-path search is disabled. + */ + private boolean boostViaNodes; + public static GraphQueryConfiguration testDefaults; static { @@ -20,5 +26,6 @@ public class GraphQueryConfiguration { testDefaults.setTimeoutSeconds(10); testDefaults.setMaxResult(10000); testDefaults.setEnableMultiPathSearch(true); + testDefaults.setBoostViaNodes(true); } } diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 467b1cf109dee..c0f82d8536922 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -236,7 +236,8 @@ elasticsearch: timeoutSeconds: ${ELASTICSEARCH_SEARCH_GRAPH_TIMEOUT_SECONDS:50} # graph dao timeout seconds batchSize: ${ELASTICSEARCH_SEARCH_GRAPH_BATCH_SIZE:1000} # graph dao batch size maxResult: ${ELASTICSEARCH_SEARCH_GRAPH_MAX_RESULT:10000} # graph dao max result size - enableMultiPathSearch: ${ELASTICSEARCH_SEARCH_GRAPH_MULTI_PATH_SEARCH:false} + enableMultiPathSearch: ${ELASTICSEARCH_SEARCH_GRAPH_MULTI_PATH_SEARCH:false} # allows a node to be revisited so that all paths to it are walked, not just the shortest; cycles are avoided by never rewalking an already-visited edge + boostViaNodes: ${ELASTICSEARCH_SEARCH_GRAPH_BOOST_VIA_NODES:true} # adds a boosting query that ranks graph edges with via nodes higher, so via paths are prioritized when multi-path search is disabled # TODO: Kafka topic convention kafka: diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 145ec7e65188c..b250435b4a642 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -65,5 +65,4 @@ dependencies { configurations.all{ exclude group: "commons-io", module:"commons-io" exclude group: "jline", module:"jline" - exclude group: 'software.amazon.awssdk', module: 'third-party-jackson-core' } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java index 3550a86163f51..09bd9f8bb09e5 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java @@ -9,6 +9,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.metadata.authorization.PoliciesConfig; +import com.linkedin.metadata.graph.Edge; import com.linkedin.metadata.graph.RelatedEntities; import com.linkedin.metadata.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; @@ -49,16 +50,14 @@ name = "Generic Relationships", description = "APIs for ingesting and accessing entity relationships.") public class RelationshipController { - - private static final String[] SORT_FIELDS = {"source.urn", "destination.urn", "relationshipType"}; - private static final String[] SORT_ORDERS = {"ASCENDING", "ASCENDING", "ASCENDING"}; + private static final String[] SORT_ORDERS = {"ASCENDING", "ASCENDING", "ASCENDING", "ASCENDING"}; private
static final List<SortCriterion> EDGE_SORT_CRITERION; static { EDGE_SORT_CRITERION = - IntStream.range(0, SORT_FIELDS.length) + IntStream.range(0, Edge.KEY_FIELDS.length) .mapToObj( - idx -> SearchUtil.sortBy(SORT_FIELDS[idx], SortOrder.valueOf(SORT_ORDERS[idx]))) + idx -> SearchUtil.sortBy(Edge.KEY_FIELDS[idx], SortOrder.valueOf(SORT_ORDERS[idx]))) .collect(Collectors.toList()); } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java index 458b23317c6c8..cb74ae5acd6a6 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java @@ -1,13 +1,20 @@ package com.linkedin.metadata.graph; import com.linkedin.common.urn.Urn; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Base64; import java.util.Map; import lombok.AllArgsConstructor; import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; @Data @AllArgsConstructor +@Slf4j public class Edge { @EqualsAndHashCode.Include private Urn source; @EqualsAndHashCode.Include private Urn destination; @@ -18,7 +25,7 @@ public class Edge { @EqualsAndHashCode.Exclude private Urn updatedActor; @EqualsAndHashCode.Exclude private Map<String, Object> properties; // The entity who owns the lifecycle of this edge - @EqualsAndHashCode.Exclude private Urn lifecycleOwner; + @EqualsAndHashCode.Include private Urn lifecycleOwner; // An entity through which the edge between source and destination is created @EqualsAndHashCode.Include private Urn via; @@ -44,4 +51,32 @@ public Edge( null, null); } + + public String toDocId() { + StringBuilder rawDocId = new StringBuilder(); + rawDocId + .append(getSource().toString()) + .append(DOC_DELIMETER) + .append(getRelationshipType()) + .append(DOC_DELIMETER) + .append(getDestination().toString()); + if (getLifecycleOwner() != null && StringUtils.isNotBlank(getLifecycleOwner().toString())) { + rawDocId.append(DOC_DELIMETER).append(getLifecycleOwner().toString()); + } + + try { + byte[] bytesOfRawDocID = rawDocId.toString().getBytes(StandardCharsets.UTF_8); + MessageDigest md = MessageDigest.getInstance("MD5"); + byte[] thedigest = md.digest(bytesOfRawDocID); + return Base64.getEncoder().encodeToString(thedigest); + } catch (NoSuchAlgorithmException e) { + log.error("Unable to hash document ID, returning unhashed id: " + rawDocId); + return rawDocId.toString(); + } + } + + public static final String[] KEY_FIELDS = { + "source.urn", "destination.urn", "relationshipType", "lifeCycleOwner" + }; + private static final String DOC_DELIMETER = "--"; }
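
Editor's note on Edge.toDocId() (added in the Edge.java hunk above): the graph document id is built from the source URN, relationship type, destination URN and, when set, the lifecycle-owner URN, joined with the "--" delimiter, then MD5-hashed and Base64-encoded. The standalone sketch below is a minimal illustration of that scheme, not code from this patch; the EdgeDocIdDemo class name and the sample URNs are made up for the example.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;

// Hypothetical demo class; mirrors the doc-id scheme of Edge#toDocId for illustration only.
public class EdgeDocIdDemo {

  private static final String DELIMITER = "--"; // same delimiter the patch uses

  static String toDocId(String source, String relationshipType, String destination, String lifecycleOwner) {
    StringBuilder rawDocId = new StringBuilder();
    rawDocId.append(source).append(DELIMITER).append(relationshipType).append(DELIMITER).append(destination);
    if (lifecycleOwner != null && !lifecycleOwner.isBlank()) {
      rawDocId.append(DELIMITER).append(lifecycleOwner);
    }
    try {
      // MD5 + Base64 yields a short, deterministic id for the graph index
      MessageDigest md = MessageDigest.getInstance("MD5");
      byte[] digest = md.digest(rawDocId.toString().getBytes(StandardCharsets.UTF_8));
      return Base64.getEncoder().encodeToString(digest);
    } catch (NoSuchAlgorithmException e) {
      // fall back to the unhashed id, as the patch does
      return rawDocId.toString();
    }
  }

  public static void main(String[] args) {
    String field = "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD),field_foo)";
    String owner = "urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)";
    String id1 = toDocId(field, "DownstreamOf", owner, owner);
    String id2 = toDocId(field, "DownstreamOf", owner, owner);
    System.out.println(id1.equals(id2)); // true: re-emitting the same edge produces the same doc id
    System.out.println(toDocId(field, "DownstreamOf", owner, null)); // omitting the lifecycle owner changes the id
  }
}

Because the lifecycle owner now participates in both the document id and equals/hashCode, re-emitting the same edge upserts a single document, while edges that differ only in their owning entity are stored as distinct documents.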
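
Editor's note on the new boostViaNodes setting (GraphQueryConfiguration and application.yml hunks above): conceptually, it adds a non-mandatory "should" clause that scores graph edges higher when their via field is populated, so that with multi-path search disabled the retained path is preferentially one that runs through a via node. The fragment below only sketches what such a clause looks like with the Elasticsearch QueryBuilders API; it is not the actual ESGraphQueryDAO implementation, the "via" field name and boost value are assumptions, and the import package may be org.opensearch.index.query instead, depending on the search client in use.

import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;

// Purely illustrative sketch of the "boost via nodes" idea, not code from this patch.
public final class ViaBoostSketch {

  private ViaBoostSketch() {}

  // Wraps an existing edge query so that edges created through a via entity score higher,
  // without excluding edges that have no via node.
  public static BoolQueryBuilder boostViaNodes(BoolQueryBuilder edgeQuery, float viaBoost) {
    return edgeQuery.should(QueryBuilders.existsQuery("via").boost(viaBoost));
  }
}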