diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index 2c931162fb006c..ab5b3eb48da7f3 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -40,19 +40,19 @@ jobs: extra_pip_requirements: "apache-airflow~=2.2.4" extra_pip_extras: plugin-v1 - python-version: "3.10" - extra_pip_requirements: "apache-airflow==2.4.3" + extra_pip_requirements: "apache-airflow~=2.4.3" extra_pip_extras: plugin-v2,test-airflow24 - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.6.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt' + extra_pip_requirements: 'apache-airflow~=2.6.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.7.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt' + extra_pip_requirements: 'apache-airflow~=2.7.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.8.1 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt' + extra_pip_requirements: 'apache-airflow~=2.8.1 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt' extra_pip_extras: plugin-v2 - - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.9.0 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.0/constraints-3.10.txt' + - python-version: "3.11" + extra_pip_requirements: 'apache-airflow~=2.9.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.10.txt' extra_pip_extras: plugin-v2 fail-fast: false steps: diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java index 542745e0148628..3617eb47259797 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java @@ -255,4 +255,19 @@ public static Filter viewFilter( Filter result = SearchUtils.combineFilters(null, viewInfo.getDefinition().getFilter()); return result; } + + /** + * Simply resolves the end time filter for the search across lineage query. If the start time is + * provided, but end time is not provided, we will default to the current time. 
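+   * If neither the start time nor the end time is provided, null is returned.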
+ */ + public static Long getLineageEndTimeMillis( + @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) { + if (endTimeMillis != null) { + return endTimeMillis; + } + if (startTimeMillis != null) { + return System.currentTimeMillis(); + } + return null; + } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java index 51b00bbe7b799b..d872ffad2783db 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java @@ -18,6 +18,7 @@ import com.linkedin.datahub.graphql.generated.LineageInput; import com.linkedin.datahub.graphql.generated.LineageRelationship; import com.linkedin.datahub.graphql.generated.Restricted; +import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; import com.linkedin.metadata.graph.SiblingGraphService; import graphql.schema.DataFetcher; @@ -63,7 +64,10 @@ public CompletableFuture get(DataFetchingEnvironment enviro @Nullable final Integer count = input.getCount(); // Optional! @Nullable final Boolean separateSiblings = input.getSeparateSiblings(); // Optional! @Nullable final Long startTimeMillis = input.getStartTimeMillis(); // Optional! - @Nullable final Long endTimeMillis = input.getEndTimeMillis(); // Optional! + @Nullable + final Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis( + input.getStartTimeMillis(), input.getEndTimeMillis()); // Optional! com.linkedin.metadata.graph.LineageDirection resolvedDirection = com.linkedin.metadata.graph.LineageDirection.valueOf(lineageDirection.toString()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java index 14b2d3b8f8420c..1b719b6f786205 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java @@ -78,7 +78,8 @@ public CompletableFuture get(DataFetchingEnvironment @Nullable Long startTimeMillis = input.getStartTimeMillis() == null ? null : input.getStartTimeMillis(); @Nullable - Long endTimeMillis = input.getEndTimeMillis() == null ? 
null : input.getEndTimeMillis(); + Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis(input.getStartTimeMillis(), input.getEndTimeMillis()); final LineageFlags lineageFlags = LineageFlagsInputMapper.map(context, input.getLineageFlags()); if (lineageFlags.getStartTimeMillis() == null && startTimeMillis != null) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java index f342d251acd725..dc3a1fc17e4ec9 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java @@ -111,7 +111,8 @@ public CompletableFuture get(DataFetchingEnvironment @Nullable Long startTimeMillis = input.getStartTimeMillis() == null ? null : input.getStartTimeMillis(); @Nullable - Long endTimeMillis = input.getEndTimeMillis() == null ? null : input.getEndTimeMillis(); + Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis(input.getStartTimeMillis(), input.getEndTimeMillis()); final LineageFlags lineageFlags = LineageFlagsInputMapper.map(context, input.getLineageFlags()); if (lineageFlags.getStartTimeMillis() == null && startTimeMillis != null) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java index 43c24c9630d646..87664ef2af4c74 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java @@ -6,6 +6,7 @@ import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.EntityTypeToPlatforms; import com.linkedin.datahub.graphql.generated.LineageFlags; +import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.types.entitytype.EntityTypeMapper; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; import java.util.Collections; @@ -42,12 +43,16 @@ public com.linkedin.metadata.query.LineageFlags apply( if (lineageFlags.getIgnoreAsHops() != null) { result.setIgnoreAsHops(mapIgnoreAsHops(lineageFlags.getIgnoreAsHops())); } - if (lineageFlags.getEndTimeMillis() != null) { - result.setEndTimeMillis(lineageFlags.getEndTimeMillis()); - } if (lineageFlags.getStartTimeMillis() != null) { result.setStartTimeMillis(lineageFlags.getStartTimeMillis()); } + // Default to "now" if no end time is provided, but start time is provided. 
+ Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis( + lineageFlags.getStartTimeMillis(), lineageFlags.getEndTimeMillis()); + if (endTimeMillis != null) { + result.setEndTimeMillis(endTimeMillis); + } if (lineageFlags.getEntitiesExploredPerHopLimit() != null) { result.setEntitiesExploredPerHopLimit(lineageFlags.getEntitiesExploredPerHopLimit()); } diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index e6db6bfcc9a61b..21a9be9dfb386a 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -32,6 +32,7 @@ import { INGESTION_REFRESH_SOURCES_ID, } from '../../onboarding/config/IngestionOnboardingConfig'; import { ONE_SECOND_IN_MS } from '../../entity/shared/tabs/Dataset/Queries/utils/constants'; +import { useCommandS } from './hooks'; const PLACEHOLDER_URN = 'placeholder-urn'; @@ -51,6 +52,8 @@ const FilterWrapper = styled.div` display: flex; `; +const SYSTEM_INTERNAL_SOURCE_TYPE = 'SYSTEM'; + export enum IngestionSourceType { ALL, UI, @@ -102,6 +105,17 @@ export const IngestionSourceList = () => { // Set of removed urns used to account for eventual consistency const [removedUrns, setRemovedUrns] = useState([]); const [sourceFilter, setSourceFilter] = useState(IngestionSourceType.ALL); + const [hideSystemSources, setHideSystemSources] = useState(true); + + /** + * Show or hide system ingestion sources using a hidden command S command. + */ + useCommandS(() => setHideSystemSources(!hideSystemSources)); + + // Ingestion Source Default Filters + const filters = hideSystemSources + ? [{ field: 'sourceType', values: [SYSTEM_INTERNAL_SOURCE_TYPE], negated: true }] + : undefined; // Ingestion Source Queries const { loading, error, data, client, refetch } = useListIngestionSourcesQuery({ @@ -110,6 +124,7 @@ export const IngestionSourceList = () => { start, count: pageSize, query: (query?.length && query) || undefined, + filters, }, }, fetchPolicy: (query?.length || 0) > 0 ? 
'no-cache' : 'cache-first', diff --git a/datahub-web-react/src/app/ingest/source/hooks.ts b/datahub-web-react/src/app/ingest/source/hooks.ts new file mode 100644 index 00000000000000..7197c9daffa9ca --- /dev/null +++ b/datahub-web-react/src/app/ingest/source/hooks.ts @@ -0,0 +1,16 @@ +import { useEffect } from 'react'; + +export const useCommandS = (onPress: () => void) => { + useEffect(() => { + const handleKeyDown = (event: KeyboardEvent) => { + if (event.metaKey && event.key === 's') { + event.preventDefault(); + onPress(); + } + }; + window.addEventListener('keydown', handleKeyDown); + return () => { + window.removeEventListener('keydown', handleKeyDown); + }; + }, [onPress]); +}; diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 49abcdcf244759..0ac784bf29d2ef 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -76,6 +76,12 @@ module.exports = { label: "Docs", position: "right", }, + { + to: "/learn", + activeBasePath: "learn", + label: "Learn", + position: "right", + }, { to: "/integrations", activeBasePath: "integrations", @@ -299,7 +305,15 @@ module.exports = { showLastUpdateAuthor: false, showLastUpdateTime: false, }, - blog: false, + blog: { + blogTitle: "DataHub Learn", + blogSidebarTitle: "DataHub Learn", + blogDescription: "Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.", + path: "src/learn", + routeBasePath: "learn", + postsPerPage: "ALL", + blogListComponent: "../src/learn/_components/LearnListPage", + }, theme: { customCss: [ isSaas ? require.resolve("./src/styles/acryl.scss") : require.resolve("./src/styles/datahub.scss"), diff --git a/docs-website/src/learn/_components/LearnItemCard/index.jsx b/docs-website/src/learn/_components/LearnItemCard/index.jsx new file mode 100644 index 00000000000000..9c6b6cfdc98d87 --- /dev/null +++ b/docs-website/src/learn/_components/LearnItemCard/index.jsx @@ -0,0 +1,30 @@ +import React from "react"; +import clsx from "clsx"; +import Link from "@docusaurus/Link"; +import { useBlogPost } from "@docusaurus/theme-common/internal"; +import styles from "./styles.module.scss"; + +export default function LearnItemCard() { + const { metadata } = useBlogPost(); + const { permalink, title, description, formattedDate, frontMatter } = metadata; + return ( +
+    <div className={clsx("col col--4", styles.featureCol)}>
+      <Link to={permalink} className={clsx("card", styles.card)}>
+        {frontMatter?.image ? (
+          <div className={styles.card_image}>
+            <img src={frontMatter?.image} alt={title} />
+            <h2>{title}</h2>
+          </div>
+        ) : (
+          <div className={styles.featureHeader}>
+            <h2>{title}</h2>
+          </div>
+        )}
+        <hr />
+        <div className={styles.featureBody}>
+          <p>{description}</p>
+        </div>
+        <div className={styles.card_date}>Published on {formattedDate}</div>
+      </Link>
+    </div>
+ ); +} \ No newline at end of file diff --git a/docs-website/src/learn/_components/LearnItemCard/styles.module.scss b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss new file mode 100644 index 00000000000000..2bfaabdc06d498 --- /dev/null +++ b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss @@ -0,0 +1,53 @@ +.featureCol { + display: flex; +} + +.card_date { + padding: 1rem 2rem; + font-size: 0.8rem; + font-style: italic; + color: gray; + margin-top: auto; +} + +.card_feature { + font-size: 2rem; + font-weight: 700; +} + +.card { + color: var(--ifm-text-color); + text-decoration: none !important; + padding: 0rem; + margin-bottom: 2rem; + align-self: stretch; + flex-grow: 1; + &:hover { + border-color: var(--ifm-color-primary); + } + hr { + margin: 0; + } +} + +.featureHeader { + h2 { + margin-bottom: 1rem !important; + font-size: 1.25rem; + } + padding: 1rem 2rem; +} + +.featureBody { + padding: 0 2rem; +} + +.card_image { + margin: 0; + margin-bottom: 0.5rem; + + img { + width: 100%; + height: auto; + } +} \ No newline at end of file diff --git a/docs-website/src/learn/_components/LearnListPage/index.jsx b/docs-website/src/learn/_components/LearnListPage/index.jsx new file mode 100644 index 00000000000000..4df87a340f21ee --- /dev/null +++ b/docs-website/src/learn/_components/LearnListPage/index.jsx @@ -0,0 +1,91 @@ +import React, { useState } from "react"; +import clsx from "clsx"; + +import useDocusaurusContext from "@docusaurus/useDocusaurusContext"; +import { PageMetadata, HtmlClassNameProvider, ThemeClassNames } from "@docusaurus/theme-common"; +import BlogListPaginator from "@theme/BlogListPaginator"; +import SearchMetadata from "@theme/SearchMetadata"; +import { BlogPostProvider } from "@docusaurus/theme-common/internal"; +import LearnItemCard from "../LearnItemCard"; +import Layout from "@theme/Layout"; +import styles from "./styles.module.scss"; + +function BlogListPageMetadata(props) { + const { metadata } = props; + const { + siteConfig: { title: siteTitle }, + } = useDocusaurusContext(); + const { blogDescription, blogTitle, permalink } = metadata; + const isBlogOnlyMode = permalink === "/"; + const title = isBlogOnlyMode ? siteTitle : blogTitle; + return ( + <> + + + + ); +} + +function BlogListPageContent(props) { + const { metadata, items } = props; + const [activeFilters, setActiveFilters] = useState([]); + // These are currently hardcoded, check the frontmatter of the blog posts to see what audiences are available + const audiences = ["Data Governance Leads", "Data Engineers", "Data Architects", "Data Platform Leads", "Data Analysts"]; + + const filteredItems = activeFilters?.length + ? (items || []).filter((post) => activeFilters.some((activeFilter) => post?.content?.frontMatter?.audience?.some((a) => a === activeFilter))) + : items; + + const handleFilterToggle = (audience) => { + if (activeFilters.includes(audience)) { + setActiveFilters(activeFilters.filter((filter) => filter !== audience)); + } else { + setActiveFilters([...new Set([...activeFilters, audience])]); + } + }; + + return ( + +
+    <Layout>
+      <header className="hero">
+        <div className="container">
+          <h1 className="hero__title">DataHub Learn</h1>
+          <p className="hero__subtitle">
+            Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.
+          </p>
+          <div className={styles.filterBar}>
+            For:
+            {audiences.map((audience) => (
+              <button
+                key={audience}
+                className={clsx("button button--outline button--secondary", activeFilters.includes(audience) && "button--active")}
+                onClick={() => handleFilterToggle(audience)}
+              >
+                {audience}
+              </button>
+            ))}
+          </div>
+        </div>
+      </header>
+      <div className="container">
+        <div className="row">
+          {(filteredItems || []).map(({ content: BlogPostContent }) => (
+            <BlogPostProvider key={BlogPostContent.metadata.permalink} content={BlogPostContent}>
+              <LearnItemCard />
+            </BlogPostProvider>
+          ))}
+        </div>
+        <BlogListPaginator metadata={metadata} />
+      </div>
+    </Layout>
+ ); +} + +export default function BlogListPage(props) { + return ( + + + + + ); +} \ No newline at end of file diff --git a/docs-website/src/learn/_components/LearnListPage/styles.module.scss b/docs-website/src/learn/_components/LearnListPage/styles.module.scss new file mode 100644 index 00000000000000..d08b48a011de07 --- /dev/null +++ b/docs-website/src/learn/_components/LearnListPage/styles.module.scss @@ -0,0 +1,7 @@ +.filterBar { + display: flex; + justify-content: center; + align-items: center; + gap: 10px; + flex-wrap: wrap; +} \ No newline at end of file diff --git a/docs-website/src/learn/business-glossary.md b/docs-website/src/learn/business-glossary.md new file mode 100644 index 00000000000000..d6b249617fc5ac --- /dev/null +++ b/docs-website/src/learn/business-glossary.md @@ -0,0 +1,120 @@ +--- +title: "What is a Business Glossary and How to Standardize It" +description: Understand how a standardized business glossary aids in achieving consistency, compliance, and efficient data use. +tags: ["Business Glossary", "Use Case", "For Data Governance Leads"] +image: /img/learn/use-case-business-glossary.png +hide_table_of_contents: false +audience: ["Data Governance Leads"] +date: 2024-06-03T05:00 +--- + +# What is a Business Glossary and How to Standardize It + +Understand how a standardized business glossary aids in achieving consistency, compliance, and efficient data use. + + + +## Introduction + +Have you ever faced confusion due to inconsistent business terminology within your organization? This lack of standardization can lead to misunderstandings, compliance issues, and inefficient data use. In this post, we’ll explore the importance of having a standardized business glossary, its benefits, and how you can implement one effectively in your organization. + +## What is a Business Glossary? + +A Business Glossary is like a dictionary for your company. It contains definitions of key business terms that everyone in the organization uses, ensuring everyone speaks the same language, especially when it comes to important concepts related to the data your company collects, processes, and uses. + +For example, below are some sales-related glossary terms that can be used in an IT company. + +| Term | Definition | Usage | +| --- | --- | --- | +| CRM (Customer Relationship Management) | Software that manages a company's interactions with current and potential customers. | CRMs help streamline processes and improve customer relationships. | +| Lead | A potential customer who has shown interest in a company's product or service. | Leads are nurtured by the sales team to convert into customers. | +| Pipeline | The stages through which a sales prospect moves from initial contact to final sale. | Sales pipelines track progress and forecast future sales. | +| Quota | A sales target set for a salesperson or team for a specific period. | Quotas motivate sales teams and measure performance. | +| Conversion Rate | The percentage of leads that turn into actual sales. | High conversion rates indicate effective sales strategies. | +| Upselling | Encouraging customers to purchase a more expensive or upgraded version of a product. | Upselling increases revenue by enhancing the customer purchase. | +| Churn Rate | The percentage of customers who stop using a product or service over a given period. | Reducing churn rate is crucial for maintaining steady growth. | +| MQL (Marketing Qualified Lead) | A lead that has been deemed more likely to become a customer based on marketing efforts. 
| MQLs are passed from the marketing team to the sales team for further nurturing. | +| ARR (Annual Recurring Revenue) | The amount of revenue that a company expects to receive from its customers on an annual basis for subscriptions. | ARR helps in financial forecasting and performance measurement. | + +## What is Business Glossary Standardization? + +Business glossary standardization means creating and maintaining a consistent set of business terms and definitions used across the organization. This practice is essential for maintaining clarity and consistency in how data is interpreted and used across different departments. + +## Why Should You Care? + +### The Challenge + +Without a consistent understanding and use of business terminology, your company lacks a unified understanding of its data. This can lead to inconsistencies, increased compliance risk, and less effective use of data. Different teams may describe the same concepts in various ways, causing confusion about customers, key metrics, products, marketing, and more. + +### The Benefits + +For a governance lead, standardizing the business glossary is crucial for several reasons: + +- **Reduces Confusion, Facilitates Discovery:** Ensures data quality, consistency, and reliability, which are critical for effective decision-making. +- **Regulatory Compliance:** Aligns data use with regulatory definitions and requirements, essential for compliance with financial regulations. +- **Supports Risk Management:** Provides consistent terminology for analyzing market trends, credit risk, and operational risks. +- **Training and Onboarding:** Helps new employees quickly understand the company’s specific language and metrics, speeding up the training process. + +### Real-World Impact + +Imagine a financial services company where different teams use varied terminologies for the same concepts, such as "customer lifetime value." (CLV) This inconsistency can lead to misinterpretations, faulty risk assessments, and regulatory non-compliance, ultimately affecting the company's reputation and financial stability. + +Here's how different teams might interpret CLV and the potential implications: + +| Team | Interpretation of CLV | Focus | Implications | +| --- | --- | --- | --- | +| Marketing | Total revenue generated from a customer over their entire relationship with the company | Campaign effectiveness, customer acquisition costs, return on marketing investment | Revenue maximization through frequent promotions, potentially ignoring the cost of service and risk associated with certain customer segments | +| Sales | Projected future sales from a customer based on past purchasing behavior | Sales targets, customer retention, cross-selling/up-selling opportunities | Aggressive sales tactics to boost short-term sales, potentially leading to customer churn if the value delivered does not meet | +| Finance | Net present value (NPV), factoring in the time value of money and associated costs over the customer relationship period | Profitability, cost management, financial forecasting | Conservative growth strategies, focusing on high-value, low-risk customers, potentially overlooking opportunities for broader market expansion | + + Different interpretations can lead to conflicting strategies and objectives across teams. For instance, Marketing’s aggressive acquisition strategy may lead to a significant increase in new customers and short-term revenue. 
However, if Finance’s NPV analysis reveals that these customers are not profitable long-term, the company may face financial strain due to high acquisition costs and low profitability. + + The Sales team’s push for upselling may generate short-term sales increases, aligning with their CLV projections. However, if customers feel pressured and perceive the upsells as unnecessary, this could lead to dissatisfaction and higher churn rates, ultimately reducing the actual lifetime value of these customers. + + The conflicting strategies can result in misaligned priorities, where Marketing focuses on volume, Sales on immediate revenue, and Finance on long-term profitability. This misalignment can lead to inefficient resource allocation, where Marketing spends heavily on acquisition, Sales focuses on short-term gains, and Finance restricts budgets due to profitability concerns. + +### Example Discovery Questions + +- Have you ever experienced confusion or errors due to inconsistent terminology in your organization's data reports? How do you currently manage and standardize business terms across departments? +- If your organization lacks a standardized business glossary, what challenges do you face in ensuring regulatory compliance and reliable data analysis? +- When onboarding new employees, do you find that inconsistent terminology slows down their training and understanding of company data? How could a standardized glossary improve this process? + +## How to Standardize a Business Glossary + +### General Approach + +To standardize a business glossary, start by identifying key business terms and their definitions. Engage stakeholders from various departments to ensure comprehensive coverage and agreement. Regularly update the glossary to reflect changes in business processes and regulatory requirements. + +### Alternatives and Best Practices + +Some companies use manual methods to track data terminology and manage access requests. While these methods can work, they are often inefficient and error-prone. Best practices include using automated tools that provide consistent updates and easy access to the glossary for all employees. + +### Our Solution + +Acryl DataHub offers comprehensive features designed to support the authoring of a unified business glossary for your organization: + +

+<p align="center"><em>Business Glossary Center</em></p>
+ +- **[Centralized Business Glossary](https://datahubproject.io/docs/glossary/business-glossary):** A repository for all business terms and definitions, ensuring consistency across the organization. + + +

+<p align="center"><em>Approval Flows</em></p>
+ + +- **[Approval Flows](https://datahubproject.io/docs/managed-datahub/approval-workflows):** Structured workflows for approving changes to the glossary, maintaining quality and consistency through time + +- **Automated Data Classification:** Tools to tag critical data assets - tables, columns, dashboards, and pipelines - with terms from the business glossary using automations and custom rules. + +By implementing these solutions, you can ensure that your business terminology is consistently defined and accurately used across all teams, supporting reliable decision-making and regulatory compliance. + +## Conclusion + +Standardizing your business glossary is essential for maintaining consistency, ensuring compliance, and optimizing data use. By implementing best practices and leveraging advanced tools, you can achieve a more efficient and reliable data management process. This investment will lead to better decision-making, reduced compliance risks, and a more cohesive organizational understanding of data. \ No newline at end of file diff --git a/docs-website/src/learn/business-metric.md b/docs-website/src/learn/business-metric.md new file mode 100644 index 00000000000000..39221a67d40abc --- /dev/null +++ b/docs-website/src/learn/business-metric.md @@ -0,0 +1,87 @@ +--- +title: "What is a Business Metric and How to Define and Standardize Them" +description: Learn the importance of consistent metric definitions and calculation methods to ensure organizational alignment. +tags: ["Business Metric", "Use Case", "For Data Analysts"] +image: /img/learn/use-case-business-metric.png +hide_table_of_contents: false +audience: ["Data Analysts"] +date: 2024-06-03T04:00 +--- + +# What is a Business Metric and How to Define and Standardize Them + +Learn the importance of consistent metric definitions and calculation methods to ensure organizational alignment. + + + +## Introduction + +Have you ever been part of a project where different teams had conflicting definitions for key business metrics like revenue, churn, or weekly active users? This misalignment can cause significant issues, leading to incorrect analysis and poor decision-making. In this post, we will explore the importance of defining and standardizing business metrics, why it matters, and how you can do it effectively within your organization. + +## What is Business Metrics Definition and Standardization? + +Standardizing business metrics definition involves creating consistent and universally understood definitions for key performance indicators (KPIs) across your organization. Think of it as creating a common language that everyone in your company can use when discussing critical metrics like revenue, churn, or engagement. This ensures that all teams are on the same page, which is essential for accurate analysis and strategic decision-making. + +## Why Should You Care About Business Metrics Definition and Standardization? + +### The Challenge + +In many organizations, KPIs are used to drive critical day-to-day operating decisions. They often emerge organically in response to the data needs of management. Over time, organizations can naturally develop inconsistent sources, representations, and vocabulary around such metrics. When there is a lack of consistent understanding of these metrics, it can lead to meaningful discrepancies in data interpretation and decision-making. 
+ +### Importance + +Standardizing business metrics is crucial because these metrics are direct indicators of the performance and health of various functions within an organization. More often than not, these metrics are used for not only making day-to-day operating decisions, but also for reporting out business performance. Standardized metrics provide immediate insight into whether the business is on track to meet its objectives and serve as solid foundations upon which other second-order metrics may be derived. + +### Real-World Impact + +Consider a scenario where the finance team defines revenue differently from the product team. If these discrepancies are not reconciled, it could lead to conflicting reports and misguided strategies. For instance, a marketing campaign analyzed with inconsistent metrics might appear successful in one report and unsuccessful in another, causing confusion and potentially leading to incorrect strategic decisions. Disagreements about the source-of-truth or accuracy of a given metric are commonplace; perhaps you can recall some examples from your own experience. + +### Example Discovery Questions and Explanations + +- **Current Management and Challenges:** "How do you currently manage and standardize definitions for core business metrics across different teams, and what challenges have you encountered in this process?" This question helps to uncover the existing processes and pain points in managing metrics, providing insights into potential areas where our product can offer significant improvements. +- **Educating your Workforce:** “How do you educate new employees about the most important metrics at the organization?” This question helps to recognize and eliminate inefficient sharing of tribal knowledge within an organization when an employee joins or leaves. +- **Impact of Misalignment:** "Can you describe a recent instance where misalignment on metric definitions impacted a business decision or analysis, and how was the issue resolved?" This question aims to highlight the real-world consequences of not having standardized metrics, emphasizing the importance of our solution in preventing such issues. + +## How to Define and Standardize Business Metrics + +### General Approach + +Start by identifying key business metrics that are actively used to power decision making at the organization. Involve stakeholders from different departments to agree on a standard set of definitions, and propose a lightweight process for introducing new ones. Document these definitions and ensure they are easily accessible to everyone in the organization. Regular reviews and updates are necessary to keep the metrics relevant and aligned with business goals. + +### Alternatives and Best Practices + +Some companies try to align metric definitions through emails and meetings. While this is a good place to start, it is often impractical at scale. Instead, best practices involve using a centralized system for defining and discovering key business metrics. Implementing approval flows and lineage tracking can ensure that all changes are reviewed and that the physical origins of a metric - e.g. the actual tables and rows that power it - are immediately clear. By making metrics centrally visible, you can begin to establish accountability and audibility around your key metrics, increasing their reliability through time and improving the quality of your decisions. 
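+To make the idea of a centrally defined metric concrete, here is a minimal, hypothetical sketch; the field names and the example values are illustrative rather than any specific product's API:
+
+```typescript
+// Hypothetical shape for a centrally governed metric definition.
+interface MetricDefinition {
+  name: string;            // canonical, organization-wide name
+  description: string;     // the plain-language definition everyone agrees on
+  owner: string;           // accountable team or person
+  sourceTables: string[];  // physical origins that lineage should point to
+  calculation: string;     // the agreed calculation, e.g. SQL or pseudo-code
+}
+
+// Example: one shared definition of weekly active users.
+const weeklyActiveUsers: MetricDefinition = {
+  name: "weekly_active_users",
+  description: "Count of distinct users with at least one session in the trailing 7 days.",
+  owner: "growth-analytics",
+  sourceTables: ["warehouse.events.sessions"],
+  calculation: "COUNT(DISTINCT user_id) over the trailing 7 days of sessions",
+};
+```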
+ +### Our Solution + +Acryl DataHub offers comprehensive features designed to tackle the challenges of defining and standardizing business metrics: + +

+<p align="center"><em>Business Glossary Center</em></p>
+ + +- **[Business Glossary](https://datahubproject.io/docs/glossary/business-glossary):** A centralized repository for all metrics definitions, ensuring consistency across the organization. + +

+<p align="center"><em>Approval Flows</em></p>
+ +- **[Approval Flows](https://datahubproject.io/docs/managed-datahub/approval-workflows):** Structured workflows for approving changes to metric definitions, maintaining accuracy and reliability. + - + +![Untitled](https://prod-files-secure.s3.us-west-2.amazonaws.com/f818df0d-1067-44ab-99e1-8cf45d930c01/33ebd070-32a1-4875-b220-c31373f5eedf/Untitled.png) + +- **[Lineage Tracking](https://datahubproject.io/docs/generated/lineage/lineage-feature-guide):** Tools to track the origin and transformations of metrics, ensuring they align with standardized definitions. + - +![Screenshot 2024-07-10 at 12.07.28 PM.png](https://prod-files-secure.s3.us-west-2.amazonaws.com/f818df0d-1067-44ab-99e1-8cf45d930c01/39503957-ad64-4d2d-a5b2-b140abfc1f6c/Screenshot_2024-07-10_at_12.07.28_PM.png) + +By implementing these solutions, you can ensure that your business metrics are consistently defined and accurately used across all teams, supporting reliable analysis and decision-making. + +### Conclusion + +Defining and standardizing business metrics is essential for ensuring consistent, accurate, and reliable data analysis and decision-making within an organization. By implementing best practices and leveraging advanced tools like our product’s business glossary, approval flows, and lineage tracking, you can achieve a more cohesive and efficient approach to managing business metrics. This investment will lead to better insights, more informed decisions, and ultimately, a more successful data-driven organization. \ No newline at end of file diff --git a/docs-website/src/learn/data-freshness.md b/docs-website/src/learn/data-freshness.md new file mode 100644 index 00000000000000..e97e9b054b256d --- /dev/null +++ b/docs-website/src/learn/data-freshness.md @@ -0,0 +1,121 @@ +--- +title: "Ensuring Data Freshness: Why It Matters and How to Achieve It" +description: Explore the significance of maintaining up-to-date data, the challenges involved, and how our solutions can ensure your data remains fresh to meet SLAs. +tags: ["Data Freshness", "Use Case", "For Data Engineers"] +image: /img/learn/use-case-data-freshness.png +hide_table_of_contents: false +audience: ["Data Engineers"] +date: 2024-06-03T01:00 +--- + +# Ensuring Data Freshness: Why It Matters and How to Achieve It + +Explore the significance of maintaining up-to-date data, the challenges involved, and how our solutions can ensure your data remains fresh to meet SLAs. + + + +## Introduction + +Have you ever experienced delays in delivering tables that or machine learning (ML) models that directly power customer experiences due to stale data? Ensuring timely data is crucial for maintaining the effectiveness and reliability of these mission-critical products. In this post, we'll explore the importance of data freshness, the challenges associated with it, and how DataHub can help you meet your data freshness SLAs consistently. + +## What is Data Freshness? + +Data freshness refers to the timeliness and completeness of data used to build tables and ML models. Specifically, freshness can be measured by the difference in time between when some event *actually occurs* vs when that record of that event is reflected in a dataset or used to train an AI model. + +To make things concrete, let’s imagine you run an e-commerce business selling t-shirts. When a user clicks the final “purchase” button to finalize a purchase, this interaction is recorded, eventually winding up in a consolidated “click_events” table on your data warehouse. 
Data freshness in this case could be measured by comparing when the actual click was performed against when the record of the click landed in the data warehouse. In reality, freshness can be measured against any reference point - e.g. event time, ingestion time, or something else - in relation to when a target table, model, or other data product is updated with new data. + +
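+As a concrete, purely illustrative sketch (the table and field names below are hypothetical), the lag between the event time and the warehouse landing time can be compared against a freshness SLA like this:
+
+```typescript
+// Hypothetical record of a purchase event as it lands in the warehouse.
+interface ClickEventRecord {
+  eventTimeMillis: number; // when the user actually clicked "purchase"
+  loadedAtMillis: number;  // when the record landed in the click_events table
+}
+
+const FRESHNESS_SLA_MILLIS = 60 * 60 * 1000; // e.g. a 1-hour freshness SLA
+
+// Freshness lag for a single record: landing time minus event time.
+function freshnessLagMillis(record: ClickEventRecord): number {
+  return record.loadedAtMillis - record.eventTimeMillis;
+}
+
+// A batch meets the SLA only if the worst (largest) lag is within bounds.
+function meetsFreshnessSla(records: ClickEventRecord[]): boolean {
+  const worstLag = Math.max(...records.map(freshnessLagMillis));
+  return worstLag <= FRESHNESS_SLA_MILLIS;
+}
+```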

+<p align="center"><em>Data Freshness</em></p>
+ +Oftentimes, data pipelines are designed in order meet some well-defined availability latency, or data freshness SLA, with the specifics of this type of agreement dictating how and when the data pipeline is triggered to run. + +In the modern data landscape, ensuring that data is up-to-date is vital for building high-quality data products, from reporting dashboards used to drive day-to-day company decisions to personalized and dynamic data- or AI-powered product experiences. + +## Why Data Freshness Matters + +For many organizations, fresh data is more than a ‘nice to have’. + +Mission-critical ML models, like those used for price prediction or fraud detection, depend heavily on fresh data to make accurate predictions. Delays in updating these models can lead to lost revenue and damage to your company's reputation. + +Customer-facing data products, for example recommendation features, also need timely updates to ensure that customers receive the most recent and relevant information personalized to them. Delays in data freshness can result in customer frustration, user churn, and loss of trust. + +### Key Considerations for Your Organization + +**Critical Data and ML Models:** + +Can you recall examples when your organization faced challenges in maintaining the timeliness of mission-critical datasets and ML models? If your organization relies on data to deliver concrete product experiences, compliance auditing, or for making high-quality day-to-day decision, then stale data can significantly impact revenue and customer satisfaction. Consider identifying which datasets and models are most critical to your operations and quantifying the business impact of delays. + +**Impact Identification and Response:** + +Because data is highly interconnected, delays in data freshness can lead to cascading problems, particularly of your organization lacks a robust system for identifying and resolving such problems. How does your organization prioritize and manage such incidents? Processes for quickly identifying and resolving root causes are essential for minimizing negative impacts on revenue and reputation. + +**Automated Freshness Monitoring:** + +If data freshness problems often go undetected for long periods of time, there may be opportunities to automate the detection of such problems for core tables and AI models so that your team is first to know when something goes wrong. + +## How to Ensure Data Freshness + +Ensuring data freshness involves several best practices and strategies. Here’s how you can achieve it: + +### Best Practices and Strategies + +**Data Lineage Tracking:** + +Utilize data lineage tracking to establish a bird’s eye view of data flowing through your systems - a picture of the supply chain of data within your organization. This helps in pinpointing hotspots where delays occur and understanding the full impact of such delays to coordinate an effective response. + +**Automation and Monitoring:** + +Implement automated freshness monitoring to detect and address issues promptly. This reduces the need for manual debugging and allows for quicker response times. It can also help you to establish peace-of-mind by targeting your most impactful assets. + +**Incident Management:** + +Establish clear protocols for incident management to prioritize and resolve data freshness issues effectively. This includes setting up notifications and alerts for timely intervention, and a broader communication strategy to involve all stakeholders (even those downstream) in the case of an issue. 
+ +### Alternatives + +While manual investigation and communication using tools like Slack can help triage issues, they often result in time-consuming, inefficient, and informal processes for addressing data quality issues related to freshness, ultimately leading to lower quality outcomes. Automated freshness incident detection and structured incident management via dedicated data monitoring tools can help improve the situation by providing a single place for detecting, communicating, and coordinating to resolve data freshness issues. + +### How DataHub Can Help + +DataHub offers comprehensive features designed to tackle data freshness challenges: + + +**[End-To-End Data Lineage](https://datahubproject.io/docs/generated/lineage/lineage-feature-guide) and [Impact Analysis](https://datahubproject.io/docs/act-on-metadata/impact-analysis):** Easily track the flow of data through your organization to identify, debug, and resolve delays quickly. +

+<p align="center"><em>Data Lineage</em></p>
+ + +**Freshness Monitoring & Alerting:** Automatically detect and alert when data freshness issues occur, to ensure timely updates by proactively monitoring key datasets for updates. Check out [Assertions](https://datahubproject.io/docs/managed-datahub/observe/assertions) and [Freshness Assertions](https://datahubproject.io/docs/managed-datahub/observe/freshness-assertions), Available in **Acryl Managed DataHub Only.** + +

+<p align="center"><em>Freshness Assertions Results</em></p>
+ + +

+<p align="center"><em>Smart assertions check for changes on a cadence based on the Table history, by default using the Audit Log.</em></p>
+ + +**[Incident Management](https://datahubproject.io/docs/incidents/incidents)** : Centralize data incident management and begin to effectively triage, prioritize, communicate and resolve data freshness issues to all relevant stakeholders. Check out [subscription & notification](https://datahubproject.io/docs/managed-datahub/subscription-and-notification) features as well. + +

+ + +By implementing these solutions, you can ensure that your key datasets and models are always up-to-date, maintaining their relevancy, accuracy, and reliability for critical use cases within your organization. + +## Conclusion + +Ensuring data freshness is essential for the performance and reliability of critical datasets and AI/ML models. By understanding the importance of data freshness and implementing best practices and automated solutions, you can effectively manage and mitigate delays, thereby protecting your revenue and reputation. DataHub is designed to help you achieve this, providing the tools and features necessary to keep your data fresh and your operations running smoothly. \ No newline at end of file diff --git a/docs-website/src/learn/data-mesh.md b/docs-website/src/learn/data-mesh.md new file mode 100644 index 00000000000000..f9a625d103ae71 --- /dev/null +++ b/docs-website/src/learn/data-mesh.md @@ -0,0 +1,131 @@ +--- +title: "What is a Data Mesh and How to Implement It in Your Organization" +description: Learn how a data mesh aligns data management with domain expertise, enhancing overall organizational agility. +tags: ["Data Mesh", "Use Case", "For Data Architects", "For Data Platform Leads"] +image: /img/learn/use-case-data-mesh.png +hide_table_of_contents: false +audience: ["Data Architects", "Data Platform Leads"] +date: 2024-06-03T02:00 +--- + +# What is Data Mesh and How to Implement It in Your Organization + +Learn how a data mesh aligns data management with domain expertise, enhancing overall organizational agility. + + + +## Introduction + +Have you faced challenges in managing decentralized data across various business units or domains? Implementing a [Data Mesh](https://martinfowler.com/articles/data-mesh-principles.html) can address these issues, aligning data management with domain expertise and enhancing your organization’s overall agility. In this post, we'll explore what a Data Mesh is, why it's beneficial, and how to implement it effectively within your organization. + +## What is Data Mesh? + +Data Mesh is a decentralized data architecture that shifts the responsibility of data management from a central team to individual business units, or "domains." Each domain in turn produces “data products”, or consumable data artifacts, ensuring that data management is closely aligned with domain-specific expertise. This approach promotes agility, scalability, and the ability to generate insights more effectively. + +If you’re familiar with [Service-Oriented Architectures](https://en.wikipedia.org/wiki/Service-oriented_architecture), i.e. micro-services, this might sound familiar. Data Mesh is a somewhat analogous concept, but applied to data! + +

+<p align="center"><em>4 Principles of Data Mesh</em></p>
+ + +| Principle | Explanation | +| --- | --- | +| Domain Data Ownership | Organizing data into explicit domains based on the structure of your organization, and then assigning clear accountability to each. This enables you to more easily increase the number of sources of data, variety of use cases, and diversity of access models to the data increases. | +| Data as a product | Domain data should be highly accessible and highly reliable by default. It should be easy to discover, easy to understand, easy to access securely, and high quality. | +| Self-Service | Domain teams should be able to independently create, consume, and manage data products on top of a general-purpose platform that can hide the complexity of building, executing and maintaining secure and interoperable data products. | +| Federated Governance | Consistent standards that are enforced by process and technology around interoperability, compliance, and quality. This makes it easy for data consumers to interact with data products across domains in familiar way and ensures quality is maintained uniformly. | + +

+<p align="center"><em>Logical architecture of data mesh approach, Image Credit: Zhamak Dehghani</em></p>
+ + + +## Why Implement Data Mesh? + +For data architects and data platform leads, implementing a Data Mesh can resolve various challenges associated with managing decentralized data, particularly as you try to scale up. + +Traditional data lakes or warehouses can become central bottlenecks, impairing access, understanding, accountability, and quality of data - ultimately, its usability. These architectures can struggle to meet the diverse needs of different business units, leading to inefficiencies. + +Data Mesh addresses these issues by formally dividing data into decentralized domains, which are owned by the individual teams who are experts in those domains. This approach allows each business unit or domain to manage its own data, enabling independent creation and consumption of data and increasing the agility, reliability, scalability of an organization’s data practice. + +### Key Considerations for Your Organization + +**Decentralized Data Management:** Have you experienced difficulties or bottlenecks in managing data across various business units? Implementing a Data Mesh can alleviate these issues by allowing each domain to build and share its own data products, enhancing agility and scalability. + +**Overcoming Centralized Bottlenecks:** If your organization relies on a centralized data lake or warehouse, or data platform team, have you encountered limitations in scalability or delays in data access and analysis? Data Mesh can help overcome these bottlenecks by “pushing down” data ownership and management to domain experts. + +**Enhancing Agility and Reliability:** How important is it for your organization to respond quickly to market changes and generate insights reliably? By formally defining the responsibilities around data “products”, a data mesh architecture can provide the flexibility and speed needed to stay competitive. + +## How to Implement Data Mesh + +Implementing Data Mesh doesn’t need to be a headache. Here’s how your organization can move towards a better future: + +### Best Practices and Strategies + +**Define Domains and Data Products** + +Formally define the different business units or domains within your organization and define the data products each domain will own and manage, and then begin to organize the data on your existing warehouse or lake around these domains. This ensures clarity and responsibility for data management. + +**Establish Clear Contracts** + +Create a clear set of expectations around what it means to be a domain or data product owner within your organization. Then, build processes and systems to both reinforce and monitor these expectations. This helps maintain consistency and reliability across the organization. + +**Monitor Data Quality** + +Use metadata validation and data quality assertions to ensure that your expectations are being met. This includes setting standards for both data quality - freshness, volume, column validity - as well compliance with your less technical requirements - ownership, data documentation, and data classification. + +**Move Towards Federated Governance** + +Adopt a federated governance model to balance autonomy and control. While domains manage their data products, a central team can oversee governance standards and ensure compliance with organizational policies via a well-defined review process. + +### Alternatives + +While a centralized data lake or warehouse can simplify data governance by virtue of keeping everything in one place, it can become a bottleneck as your data organization grows. 
A decentralized Data Mesh can provide a more scalable and agile approach by distributing day-to-day responsibility for accessing, producing, and validating data while enforcing a centralized set of standards and processes.
+
+### Our Solution
+
+Acryl DataHub offers a comprehensive set of features designed to support the implementation of a Data Mesh at your organization:
+
+- **[Data Domains](https://datahubproject.io/docs/domains)**: Clearly define and manage data products within each business unit.
+- **[Data Products](https://datahubproject.io/docs/dataproducts):** Ensure each domain owns and manages its data products, promoting autonomy and agility.
+- **[Data Contracts](https://datahubproject.io/docs/managed-datahub/observe/data-contract)**: Establish clear agreements between domains to ensure consistency and reliability; a minimal sketch of what such a contract can capture is shown below.
+
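+As a rough, hypothetical illustration of the idea behind a data contract (the shape and field names below are made up for the example, not a specific product schema), a contract between a producing domain and its consumers might capture schema, freshness, and quality expectations like this:
+
+```typescript
+// Hypothetical data contract between a producing domain and its consumers.
+interface DataContract {
+  dataProduct: string;            // the data product the contract covers
+  ownerDomain: string;            // accountable domain team
+  schema: Record<string, string>; // column name -> expected type
+  freshnessSlaHours: number;      // how stale the data is allowed to become
+  qualityChecks: string[];        // agreed validations the producer must keep passing
+}
+
+const ordersContract: DataContract = {
+  dataProduct: "orders.daily_summary",
+  ownerDomain: "commerce",
+  schema: { order_id: "string", order_total: "decimal", order_date: "date" },
+  freshnessSlaHours: 24,
+  qualityChecks: ["order_id is unique", "order_total >= 0"],
+};
+```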

+<p align="center"><em>Data Contracts in Acryl DataHub UI</em></p>
+ + + +- **[Assertions](https://datahubproject.io/docs/managed-datahub/observe/assertions)** Monitor data quality using freshness, volume, column validity, schema, and custom SQL checks to get notified first when things go wrong + + +

+<p align="center"><em>Assertion Results</em></p>
+ + + +- **[Metadata Tests](https://datahubproject.io/docs/tests/metadata-tests)**: Monitor and enforce a central set of standards or policies across all of your data assets, e.g. to ensure data documentation, data ownership, and data classification. + +

+<p align="center"><em>Metadata Test Results</em></p>
+ +By implementing these solutions, you can effectively manage decentralized data, enhance agility, and generate insights more efficiently. + +## Conclusion + +Implementing a Data Mesh can significantly improve your organization's ability to manage and leverage decentralized data. By understanding the benefits of data mesh and following best practices for implementation, you can overcome the limitations of centralized data systems and enhance your agility, scalability, and ability to generate insights. Acryl DataHub was built from the ground up to help you achieve this, providing the tools and features necessary to implement a large-scale Data Mesh successfully. \ No newline at end of file diff --git a/docs-website/src/learn/data-pipeline.md b/docs-website/src/learn/data-pipeline.md new file mode 100644 index 00000000000000..f5e5bb6615f48b --- /dev/null +++ b/docs-website/src/learn/data-pipeline.md @@ -0,0 +1,90 @@ +--- +title: "What is a Data Pipeline and Why Should We Optimize It" +description: Discover the importance of optimizing data pipelines to maintain data freshness and control costs. +tags: ["Data Pipeline", "Use Case", "For Data Engineers"] +image: /img/learn/use-case-data-pipeline.png +hide_table_of_contents: false +audience: ["Data Engineers"] +date: 2024-06-03T03:00 +--- + +# What is a Data Pipeline and Why Should We Optimize It? + +Discover the importance of optimizing data pipelines to maintain data freshness and control costs. + + + +## Introduction + +Have you ever been frustrated by slow and unreliable data pipelines or unexpectedly high cloud bills? In the modern data world, maintaining efficient, reliable, and cost-effective data pipelines is crucial for delivering timely, high-quality data. This post will explore the importance of optimizing data pipelines, why it matters, and how to achieve it effectively. + +## What is a Data Pipeline? + +A data pipeline is a series of processes that move data from one system to another - a key component in the supply chain for data. Think of it like a conveyor belt in a factory, transporting raw materials to different stations where they are processed into the final product. In the context of data, pipelines extract, transform, and load data (ETL) from various sources to destinations like data warehouses, ensuring the data is ready for analysis and use in applications such as machine learning models and business intelligence dashboards. + + +
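+To make the conveyor-belt analogy concrete, here is a toy, purely illustrative extract-transform-load step (the data shapes and functions are made up for the example):
+
+```typescript
+// A minimal, illustrative extract-transform-load (ETL) step.
+interface RawOrder { id: string; amountCents: number; placedAt: string }
+interface CleanOrder { id: string; amountUsd: number; placedAt: Date }
+
+// Extract: a real pipeline would read from a source system or event stream.
+function extract(): RawOrder[] {
+  return [{ id: "o-1", amountCents: 1250, placedAt: "2024-06-01T12:00:00Z" }];
+}
+
+// Transform: normalize units and types so downstream consumers get clean data.
+function transform(rows: RawOrder[]): CleanOrder[] {
+  return rows.map((r) => ({ id: r.id, amountUsd: r.amountCents / 100, placedAt: new Date(r.placedAt) }));
+}
+
+// Load: a real pipeline would write to a warehouse table instead of logging.
+function load(rows: CleanOrder[]): void {
+  console.log(`loaded ${rows.length} rows`);
+}
+
+load(transform(extract()));
+```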

+<p align="center"><em>Data Pipeline Example</em></p>
+ +## Why Should You Care About Data Pipeline Optimization? + +### The Problem + +Over time, data pipelines can slow down or become unreliable due to new dependencies, application code bugs, and poorly optimized queries, leading to missed data freshness SLAs and increased cloud costs. For data engineers, this means more time spent on manual debugging and justifying costs to your executives. + +### Importance + +Efficient data pipelines are essential for maintaining the performance of mission-critical tables, dashboards, and ML models powering key use cases for your organization. For example, a price prediction model relies on timely data to provide accurate results, directly impacting revenue. Similarly, outdated customer data can harm a company’s reputation and customer satisfaction. + +### Real-World Impact + +Imagine you’re managing a recommendation engine for an e-commerce site. If your data pipeline is delayed, the recommendations could become outdated, leading to missed sales opportunities - financial costs - and a poor user experience - reputational costs. Alternatively, consider a fraud detection system that relies on real-time data; any delay or downtime could mean the difference between catching fraudulent activity and suffering significant financial loss. + +### Questions To Ask + +- Have you ever noticed a decline in the freshness of crucial data or an uptick in cloud costs for specific pipelines? How do you currently approach diagnosing and optimizing these pipelines? +- If your organization is facing increasing cloud bills due to data pipeline inefficiencies, what strategies or tools do you employ to monitor and optimize costs? How do you balance the trade-off between performance, cost, and meeting business stakeholders' expectations for data delivery? +- Are you taking proactive measures to prevent data pipelines from becoming slower, more fragile, or more expensive over time? Do you have a system in place for regularly reviewing and optimizing key data pipelines to prevent performance or cost degradation? + +## How to Optimize Data Pipelines + +### General Approach + +To optimize your data pipelines, start by identifying bottlenecks and inefficiencies in the pipelines that generate your most mission-critical tables, dashboards, and models. Regularly review and update queries, and monitor pipeline performance by measuring aggregate pipeline run times as well as more granular tracking at the step or query level to catch issues early. Implement automation wherever possible to reduce manual intervention and ensure consistency. + +### Alternatives and Best Practices + +Some companies resort to manual debugging or use communication tools like Slack to triage issues. While these methods can work, they are often time-consuming and prone to errors. Instead, consider leveraging tools that provide lineage tracking, last updated time, and automated monitoring to streamline the optimization process. + +### Our Solution + +Acryl DataHub offers comprehensive features designed to optimize data pipelines: + +

+<p align="center"><em>Pipeline Catalog</em></p>
+ +- **Pipeline Cataloging:** Quickly browse all of the data pipelines running inside your organization, and track critical human context like pipeline ownership / accountability, purpose / documentation, and compliance labels in one place. + +

+
+*Lineage Tracking*
+
+ +- **[Lineage Tracking](https://datahubproject.io/docs/generated/lineage/lineage-feature-guide) and [Impact Analysis](https://datahubproject.io/docs/act-on-metadata/impact-analysis):** Understand the flow of data through your pipelines to identify and resolve inefficiencies quickly. Easily see which assets are consumed and produced by which pipelines. +- **Freshness Monitoring:** Track the freshness using Freshness Assertions of your data to ensure SLAs are met consistently. +- **Cost Management Tooling:** Monitor and optimize cloud costs associated with your data pipelines to improve cost-efficiency. + +By implementing these solutions, you can ensure that your data pipelines are running efficiently, meeting delivery SLAs, and staying within budget. + + + +## Conclusion + +Optimizing data pipelines is essential for maintaining data reliability, controlling costs, and ultimately ensuring your business continues to run smoothly. By implementing best practices and leveraging advanced tools like our product’s lineage tracking and automated monitoring features, you can achieve efficient and cost-effective data pipelines. Investing time and resources into optimization will ultimately lead to better performance, lower costs, and more satisfied stakeholders. \ No newline at end of file diff --git a/docs-website/src/pages/customer-stories-survey/index.js b/docs-website/src/pages/customer-stories-survey/index.js index 63a3ecd77e9687..e271ad6a309c16 100644 --- a/docs-website/src/pages/customer-stories-survey/index.js +++ b/docs-website/src/pages/customer-stories-survey/index.js @@ -17,7 +17,7 @@ function CustomerStoriesSurvey() { window.hbspt.forms.create({ region: "na1", portalId: "14552909", - formId: "087ef03d-e47e-4814-b458-b30e3e02b623", + formId: "5fbd22ff-4edd-4c43-84bb-7fdaf4e38528", target: '#hubspotForm' // Targeting the div with the specific ID }); } diff --git a/docs-website/static/img/learn/use-case-business-glossary.png b/docs-website/static/img/learn/use-case-business-glossary.png new file mode 100644 index 00000000000000..cb1ed557564694 Binary files /dev/null and b/docs-website/static/img/learn/use-case-business-glossary.png differ diff --git a/docs-website/static/img/learn/use-case-business-metric.png b/docs-website/static/img/learn/use-case-business-metric.png new file mode 100644 index 00000000000000..09893a13b72681 Binary files /dev/null and b/docs-website/static/img/learn/use-case-business-metric.png differ diff --git a/docs-website/static/img/learn/use-case-data-freshness.png b/docs-website/static/img/learn/use-case-data-freshness.png new file mode 100644 index 00000000000000..0f6828521a7d37 Binary files /dev/null and b/docs-website/static/img/learn/use-case-data-freshness.png differ diff --git a/docs-website/static/img/learn/use-case-data-mesh.png b/docs-website/static/img/learn/use-case-data-mesh.png new file mode 100644 index 00000000000000..92143836887524 Binary files /dev/null and b/docs-website/static/img/learn/use-case-data-mesh.png differ diff --git a/docs-website/static/img/learn/use-case-data-pipeline.png b/docs-website/static/img/learn/use-case-data-pipeline.png new file mode 100644 index 00000000000000..c82b42f80a8325 Binary files /dev/null and b/docs-website/static/img/learn/use-case-data-pipeline.png differ diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index e2e624a74e4129..e874f70db02a3f 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ 
b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -20,11 +20,7 @@ if (extra_pip_extras != "") { def pip_install_command = "VIRTUAL_ENV=${venv_name} ${venv_name}/bin/uv pip install -e ../../metadata-ingestion" -task checkPythonVersion(type: Exec) { - commandLine python_executable, '-c', 'import sys; assert sys.version_info >= (3, 7)' -} - -task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { +task environmentSetup(type: Exec) { def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') outputs.file(sentinel_file) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 065e9454c5d9e0..2d2f6fbd2b0891 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -42,7 +42,7 @@ def get_long_description(): # We remain restrictive on the versions allowed here to prevent # us from being broken by backwards-incompatible changes in the # underlying package. - "openlineage-airflow>=1.2.0,<=1.12.0", + "openlineage-airflow>=1.2.0,<=1.18.0", }, } @@ -142,6 +142,7 @@ def get_long_description(): "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index b154f92fe553f5..4d66dbc860aa9f 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27, py310-airflow28, py310-airflow29 +envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27, py310-airflow28, py311-airflow29 [testenv] use_develop = true @@ -27,7 +27,7 @@ deps = py310-airflow26: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt py310-airflow27: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt py310-airflow28: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt - py310-airflow29: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.1/constraints-3.10.txt + py311-airflow29: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.11.txt # Before pinning to the constraint files, we previously left the dependencies # more open. There were a number of packages for which this caused issues. 
@@ -55,6 +55,6 @@ commands = [testenv:py310-airflow24] extras = dev,integration-tests,plugin-v2,test-airflow24 -[testenv:py310-airflow{26,27,28,29}] +[testenv:py310-airflow{26,27,28},py311-airflow{29}] extras = dev,integration-tests,plugin-v2 diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 52fefc3c78945a..4e3f1ca91766c2 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -17,7 +17,7 @@ def get_coverage_arg(test_name) { task checkPythonVersion(type: Exec) { commandLine python_executable, '-c', - 'import sys; assert (3, 11) > sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"' + 'import sys; assert sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"' } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { diff --git a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl index f777b5d6e12e7b..37e85b6e542bde 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl @@ -62,4 +62,24 @@ record DataHubIngestionSourceInfo { */ extraArgs: optional map[string, string] } + + /** + * The source or origin of the Ingestion Source + * + * Currently CLI and UI do not provide an explicit source. + */ + source: optional record DataHubIngestionSourceSource { + /** + * The source type of the ingestion source + */ + @Searchable = { + "fieldName": "sourceType" + } + type: enum DataHubIngestionSourceSourceType { + /** + * A system internal source, e.g. for running search indexing operations, feature computation, etc.
+ */ + SYSTEM + } + } } \ No newline at end of file diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java index 88963e60d415d0..09a6cc7c1e4b76 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java @@ -13,10 +13,15 @@ import io.swagger.v3.oas.annotations.OpenAPIDefinition; import io.swagger.v3.oas.annotations.info.Info; import io.swagger.v3.oas.annotations.servers.Server; +import io.swagger.v3.oas.models.Components; import io.swagger.v3.oas.models.OpenAPI; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.springdoc.core.models.GroupedOpenApi; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -38,8 +43,6 @@ public class SpringWebConfig implements WebMvcConfigurer { private static final Set V1_PACKAGES = Set.of("io.datahubproject.openapi.v1"); private static final Set V2_PACKAGES = Set.of("io.datahubproject.openapi.v2"); private static final Set V3_PACKAGES = Set.of("io.datahubproject.openapi.v3"); - private static final Set SCHEMA_REGISTRY_PACKAGES = - Set.of("io.datahubproject.openapi.schema.registry"); private static final Set OPENLINEAGE_PACKAGES = Set.of("io.datahubproject.openapi.openlineage"); @@ -74,14 +77,31 @@ public void addFormatters(FormatterRegistry registry) { public GroupedOpenApi v3OpenApiGroup(final EntityRegistry entityRegistry) { return GroupedOpenApi.builder() .group("10-openapi-v3") - .displayName("DataHub Entities v3 (OpenAPI)") + .displayName("DataHub v3 (OpenAPI)") .addOpenApiCustomizer( openApi -> { OpenAPI v3OpenApi = OpenAPIV3Generator.generateOpenApiSpec(entityRegistry); openApi.setInfo(v3OpenApi.getInfo()); openApi.setTags(Collections.emptyList()); - openApi.setPaths(v3OpenApi.getPaths()); - openApi.setComponents(v3OpenApi.getComponents()); + openApi.getPaths().putAll(v3OpenApi.getPaths()); + // Merge components. Swagger does not provide append method to add components. 
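+          // concat() (defined below) is a null-safe merge: for each component map (schemas,
+          // responses, parameters, etc.) it keeps whichever side is non-null and concatenates
+          // both maps when both are present, so components from the scanned controllers are
+          // preserved alongside the generated v3 components.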
+ final Components components = new Components(); + final Components oComponents = openApi.getComponents(); + final Components v3Components = v3OpenApi.getComponents(); + components + .callbacks(concat(oComponents::getCallbacks, v3Components::getCallbacks)) + .examples(concat(oComponents::getExamples, v3Components::getExamples)) + .extensions(concat(oComponents::getExtensions, v3Components::getExtensions)) + .headers(concat(oComponents::getHeaders, v3Components::getHeaders)) + .links(concat(oComponents::getLinks, v3Components::getLinks)) + .parameters(concat(oComponents::getParameters, v3Components::getParameters)) + .requestBodies( + concat(oComponents::getRequestBodies, v3Components::getRequestBodies)) + .responses(concat(oComponents::getResponses, v3Components::getResponses)) + .schemas(concat(oComponents::getSchemas, v3Components::getSchemas)) + .securitySchemes( + concat(oComponents::getSecuritySchemes, v3Components::getSecuritySchemes)); + openApi.setComponents(components); }) .packagesToScan(V3_PACKAGES.toArray(String[]::new)) .build(); @@ -122,4 +142,14 @@ public GroupedOpenApi openlineageOpenApiGroup() { .packagesToScan(OPENLINEAGE_PACKAGES.toArray(String[]::new)) .build(); } + + /** Concatenates two maps. */ + private Map concat(Supplier> a, Supplier> b) { + return a.get() == null + ? b.get() + : b.get() == null + ? a.get() + : Stream.concat(a.get().entrySet().stream(), b.get().entrySet().stream()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericRelationshipController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericRelationshipController.java new file mode 100644 index 00000000000000..efc3d9375e09e7 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericRelationshipController.java @@ -0,0 +1,221 @@ +package io.datahubproject.openapi.controller; + +import static com.linkedin.metadata.authorization.ApiGroup.RELATIONSHIP; +import static com.linkedin.metadata.authorization.ApiOperation.READ; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthUtil; +import com.datahub.authorization.AuthorizerChain; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.metadata.aspect.models.graph.Edge; +import com.linkedin.metadata.aspect.models.graph.RelatedEntities; +import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; +import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.query.filter.RelationshipDirection; +import com.linkedin.metadata.query.filter.RelationshipFilter; +import com.linkedin.metadata.search.utils.QueryUtils; +import io.datahubproject.openapi.exception.UnauthorizedException; +import io.datahubproject.openapi.models.GenericScrollResult; +import io.datahubproject.openapi.v2.models.GenericRelationship; +import io.swagger.v3.oas.annotations.Operation; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; + +public 
abstract class GenericRelationshipController { + + @Autowired private EntityRegistry entityRegistry; + @Autowired private ElasticSearchGraphService graphService; + @Autowired private AuthorizerChain authorizationChain; + + /** + * Returns relationship edges by type + * + * @param relationshipType the relationship type + * @param count number of results + * @param scrollId scrolling id + * @return list of relation edges + */ + @GetMapping(value = "/{relationshipType}", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "Scroll relationships of the given type.") + public ResponseEntity> getRelationshipsByType( + @PathVariable("relationshipType") String relationshipType, + @RequestParam(value = "count", defaultValue = "10") Integer count, + @RequestParam(value = "scrollId", required = false) String scrollId) { + + Authentication authentication = AuthenticationContext.getAuthentication(); + if (!AuthUtil.isAPIAuthorized(authentication, authorizationChain, RELATIONSHIP, READ)) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + RelatedEntitiesScrollResult result = + graphService.scrollRelatedEntities( + null, + null, + null, + null, + List.of(relationshipType), + new RelationshipFilter().setDirection(RelationshipDirection.UNDIRECTED), + Edge.EDGE_SORT_CRITERION, + scrollId, + count, + null, + null); + + if (!AuthUtil.isAPIAuthorizedUrns( + authentication, + authorizationChain, + RELATIONSHIP, + READ, + result.getEntities().stream() + .flatMap( + edge -> + Stream.of( + UrnUtils.getUrn(edge.getSourceUrn()), + UrnUtils.getUrn(edge.getDestinationUrn()))) + .collect(Collectors.toSet()))) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + return ResponseEntity.ok( + GenericScrollResult.builder() + .results(toGenericRelationships(result.getEntities())) + .scrollId(result.getScrollId()) + .build()); + } + + /** + * Returns edges for a given urn + * + * @param relationshipTypes types of edges + * @param direction direction of the edges + * @param count number of results + * @param scrollId scroll id + * @return urn edges + */ + @GetMapping(value = "/{entityName}/{entityUrn}", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "Scroll relationships from a given entity.") + public ResponseEntity> getRelationshipsByEntity( + @PathVariable("entityName") String entityName, + @PathVariable("entityUrn") String entityUrn, + @RequestParam(value = "relationshipType[]", required = false, defaultValue = "*") + String[] relationshipTypes, + @RequestParam(value = "direction", defaultValue = "OUTGOING") String direction, + @RequestParam(value = "count", defaultValue = "10") Integer count, + @RequestParam(value = "scrollId", required = false) String scrollId) { + + final RelatedEntitiesScrollResult result; + + Authentication authentication = AuthenticationContext.getAuthentication(); + if (!AuthUtil.isAPIAuthorizedUrns( + authentication, + authorizationChain, + RELATIONSHIP, + READ, + List.of(UrnUtils.getUrn(entityUrn)))) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + switch (RelationshipDirection.valueOf(direction.toUpperCase())) { + case INCOMING -> result = + graphService.scrollRelatedEntities( + null, + null, + null, + null, + relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") + ? 
Arrays.stream(relationshipTypes).toList() + : List.of(), + new RelationshipFilter() + .setDirection(RelationshipDirection.UNDIRECTED) + .setOr(QueryUtils.newFilter("destination.urn", entityUrn).getOr()), + Edge.EDGE_SORT_CRITERION, + scrollId, + count, + null, + null); + case OUTGOING -> result = + graphService.scrollRelatedEntities( + null, + null, + null, + null, + relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") + ? Arrays.stream(relationshipTypes).toList() + : List.of(), + new RelationshipFilter() + .setDirection(RelationshipDirection.UNDIRECTED) + .setOr(QueryUtils.newFilter("source.urn", entityUrn).getOr()), + Edge.EDGE_SORT_CRITERION, + scrollId, + count, + null, + null); + default -> throw new IllegalArgumentException("Direction must be INCOMING or OUTGOING"); + } + + if (!AuthUtil.isAPIAuthorizedUrns( + authentication, + authorizationChain, + RELATIONSHIP, + READ, + result.getEntities().stream() + .flatMap( + edge -> + Stream.of( + UrnUtils.getUrn(edge.getSourceUrn()), + UrnUtils.getUrn(edge.getDestinationUrn()))) + .collect(Collectors.toSet()))) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + return ResponseEntity.ok( + GenericScrollResult.builder() + .results(toGenericRelationships(result.getEntities())) + .scrollId(result.getScrollId()) + .build()); + } + + private List toGenericRelationships(List relatedEntities) { + return relatedEntities.stream() + .map( + result -> { + Urn source = UrnUtils.getUrn(result.getSourceUrn()); + Urn dest = UrnUtils.getUrn(result.getDestinationUrn()); + return GenericRelationship.builder() + .relationshipType(result.getRelationshipType()) + .source(GenericRelationship.GenericNode.fromUrn(source)) + .destination(GenericRelationship.GenericNode.fromUrn(dest)) + .build(); + }) + .collect(Collectors.toList()); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java index 3e46e10857fbd8..a0412676b5cbce 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java @@ -1,40 +1,10 @@ package io.datahubproject.openapi.v2.controller; -import static com.linkedin.metadata.authorization.ApiGroup.RELATIONSHIP; -import static com.linkedin.metadata.authorization.ApiOperation.READ; - -import com.datahub.authentication.Authentication; -import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.AuthUtil; -import com.datahub.authorization.AuthorizerChain; -import com.linkedin.common.urn.Urn; -import com.linkedin.common.urn.UrnUtils; -import com.linkedin.metadata.aspect.models.graph.Edge; -import com.linkedin.metadata.aspect.models.graph.RelatedEntities; -import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; -import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; -import com.linkedin.metadata.models.registry.EntityRegistry; -import com.linkedin.metadata.query.filter.RelationshipDirection; -import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.search.utils.QueryUtils; -import io.datahubproject.openapi.exception.UnauthorizedException; -import 
io.datahubproject.openapi.models.GenericScrollResult; -import io.datahubproject.openapi.v2.models.GenericRelationship; -import io.swagger.v3.oas.annotations.Operation; +import io.datahubproject.openapi.controller.GenericRelationshipController; import io.swagger.v3.oas.annotations.tags.Tag; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.http.MediaType; -import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; @RestController @@ -44,192 +14,6 @@ @Tag( name = "Generic Relationships", description = "APIs for ingesting and accessing entity relationships.") -public class RelationshipController { - - @Autowired private EntityRegistry entityRegistry; - @Autowired private ElasticSearchGraphService graphService; - @Autowired private AuthorizerChain authorizationChain; - - /** - * Returns relationship edges by type - * - * @param relationshipType the relationship type - * @param count number of results - * @param scrollId scrolling id - * @return list of relation edges - */ - @GetMapping(value = "/{relationshipType}", produces = MediaType.APPLICATION_JSON_VALUE) - @Operation(summary = "Scroll relationships of the given type.") - public ResponseEntity> getRelationshipsByType( - @PathVariable("relationshipType") String relationshipType, - @RequestParam(value = "count", defaultValue = "10") Integer count, - @RequestParam(value = "scrollId", required = false) String scrollId) { - - Authentication authentication = AuthenticationContext.getAuthentication(); - if (!AuthUtil.isAPIAuthorized(authentication, authorizationChain, RELATIONSHIP, READ)) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - RelatedEntitiesScrollResult result = - graphService.scrollRelatedEntities( - null, - null, - null, - null, - List.of(relationshipType), - new RelationshipFilter().setDirection(RelationshipDirection.UNDIRECTED), - Edge.EDGE_SORT_CRITERION, - scrollId, - count, - null, - null); - - if (!AuthUtil.isAPIAuthorizedUrns( - authentication, - authorizationChain, - RELATIONSHIP, - READ, - result.getEntities().stream() - .flatMap( - edge -> - Stream.of( - UrnUtils.getUrn(edge.getSourceUrn()), - UrnUtils.getUrn(edge.getDestinationUrn()))) - .collect(Collectors.toSet()))) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - return ResponseEntity.ok( - GenericScrollResult.builder() - .results(toGenericRelationships(result.getEntities())) - .scrollId(result.getScrollId()) - .build()); - } - - /** - * Returns edges for a given urn - * - * @param relationshipTypes types of edges - * @param direction direction of the edges - * @param count number of results - * @param scrollId scroll id - * @return urn edges - */ - @GetMapping(value = "/{entityName}/{entityUrn}", produces = MediaType.APPLICATION_JSON_VALUE) - @Operation(summary = "Scroll relationships from a given entity.") - public ResponseEntity> 
getRelationshipsByEntity( - @PathVariable("entityName") String entityName, - @PathVariable("entityUrn") String entityUrn, - @RequestParam(value = "relationshipType[]", required = false, defaultValue = "*") - String[] relationshipTypes, - @RequestParam(value = "direction", defaultValue = "OUTGOING") String direction, - @RequestParam(value = "count", defaultValue = "10") Integer count, - @RequestParam(value = "scrollId", required = false) String scrollId) { - - final RelatedEntitiesScrollResult result; - - Authentication authentication = AuthenticationContext.getAuthentication(); - if (!AuthUtil.isAPIAuthorizedUrns( - authentication, - authorizationChain, - RELATIONSHIP, - READ, - List.of(UrnUtils.getUrn(entityUrn)))) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - switch (RelationshipDirection.valueOf(direction.toUpperCase())) { - case INCOMING -> result = - graphService.scrollRelatedEntities( - null, - null, - null, - null, - relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") - ? Arrays.stream(relationshipTypes).toList() - : List.of(), - new RelationshipFilter() - .setDirection(RelationshipDirection.UNDIRECTED) - .setOr(QueryUtils.newFilter("destination.urn", entityUrn).getOr()), - Edge.EDGE_SORT_CRITERION, - scrollId, - count, - null, - null); - case OUTGOING -> result = - graphService.scrollRelatedEntities( - null, - null, - null, - null, - relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") - ? Arrays.stream(relationshipTypes).toList() - : List.of(), - new RelationshipFilter() - .setDirection(RelationshipDirection.UNDIRECTED) - .setOr(QueryUtils.newFilter("source.urn", entityUrn).getOr()), - Edge.EDGE_SORT_CRITERION, - scrollId, - count, - null, - null); - default -> throw new IllegalArgumentException("Direction must be INCOMING or OUTGOING"); - } - - if (!AuthUtil.isAPIAuthorizedUrns( - authentication, - authorizationChain, - RELATIONSHIP, - READ, - result.getEntities().stream() - .flatMap( - edge -> - Stream.of( - UrnUtils.getUrn(edge.getSourceUrn()), - UrnUtils.getUrn(edge.getDestinationUrn()))) - .collect(Collectors.toSet()))) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - return ResponseEntity.ok( - GenericScrollResult.builder() - .results(toGenericRelationships(result.getEntities())) - .scrollId(result.getScrollId()) - .build()); - } - - private List toGenericRelationships(List relatedEntities) { - return relatedEntities.stream() - .map( - result -> { - Urn source = UrnUtils.getUrn(result.getSourceUrn()); - Urn dest = UrnUtils.getUrn(result.getDestinationUrn()); - return GenericRelationship.builder() - .relationshipType(result.getRelationshipType()) - .source(GenericRelationship.GenericNode.fromUrn(source)) - .destination(GenericRelationship.GenericNode.fromUrn(dest)) - .build(); - }) - .collect(Collectors.toList()); - } +public class RelationshipController extends GenericRelationshipController { + // Supports same methods as GenericRelationshipController. 
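+  //
+  // Illustrative usage (the /openapi prefix and the DownstreamOf relationship type are
+  // assumptions for the example, not defined in this class):
+  //   GET /openapi/v2/relationship/DownstreamOf?count=10
+  //   GET /openapi/v2/relationship/dataset/{datasetUrn}?direction=OUTGOING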
} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/RelationshipController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/RelationshipController.java new file mode 100644 index 00000000000000..8f317e86227239 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/RelationshipController.java @@ -0,0 +1,19 @@ +package io.datahubproject.openapi.v3.controller; + +import io.datahubproject.openapi.controller.GenericRelationshipController; +import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +@RestController("RelationshipControllerV3") +@RequiredArgsConstructor +@RequestMapping("/v3/relationship") +@Slf4j +@Tag( + name = "Generic Relationships", + description = "APIs for ingesting and accessing entity relationships.") +public class RelationshipController extends GenericRelationshipController { + // Supports same methods as GenericRelationshipController. +} diff --git a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java index 0855ad6c2e4ff2..3c7e93621f5cce 100644 --- a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java +++ b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java @@ -25,6 +25,7 @@ import com.linkedin.entity.EnvelopedAspect; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortOrder; @@ -189,6 +190,12 @@ public EntityRegistry entityRegistry( return testOperationContext.getEntityRegistry(); } + @Bean("graphService") + @Primary + public ElasticSearchGraphService graphService() { + return mock(ElasticSearchGraphService.class); + } + @Bean public AuthorizerChain authorizerChain() { AuthorizerChain authorizerChain = mock(AuthorizerChain.class);