From d0b25a35cb3e1a1bd5409810535611ad167f653b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 14:05:44 -0700 Subject: [PATCH 01/33] build(deps): bump braces from 3.0.2 to 3.0.3 in /docs-website (#10681) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs-website/yarn.lock | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock index 61bbc305b89177..a93b0e74c327db 100644 --- a/docs-website/yarn.lock +++ b/docs-website/yarn.lock @@ -1827,7 +1827,7 @@ "@docusaurus/theme-search-algolia" "2.4.3" "@docusaurus/types" "2.4.3" -"@docusaurus/react-loadable@5.5.2", "react-loadable@npm:@docusaurus/react-loadable@5.5.2": +"@docusaurus/react-loadable@5.5.2": version "5.5.2" resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== @@ -4174,11 +4174,11 @@ brace-expansion@^1.1.7: concat-map "0.0.1" braces@^3.0.2, braces@~3.0.2: - version "3.0.2" - resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107" - integrity sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A== + version "3.0.3" + resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.3.tgz#490332f40919452272d55a8480adc0c441358789" + integrity sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA== dependencies: - fill-range "^7.0.1" + fill-range "^7.1.1" browserslist@^4.0.0, browserslist@^4.18.1, browserslist@^4.21.10, browserslist@^4.21.4, browserslist@^4.22.2, browserslist@^4.23.0: version "4.23.0" @@ -5657,10 +5657,10 @@ filesize@^8.0.6: resolved "https://registry.yarnpkg.com/filesize/-/filesize-8.0.7.tgz#695e70d80f4e47012c132d57a059e80c6b580bd8" integrity sha512-pjmC+bkIF8XI7fWaH8KxHcZL3DPybs1roSKP4rKDvy20tAWwIObE4+JIseG2byfGKhud5ZnM4YSGKBz7Sh0ndQ== -fill-range@^7.0.1: - version "7.0.1" - resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40" - integrity sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ== +fill-range@^7.1.1: + version "7.1.1" + resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.1.1.tgz#44265d3cac07e3ea7dc247516380643754a05292" + integrity sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg== dependencies: to-regex-range "^5.0.1" @@ -9705,6 +9705,14 @@ react-loadable-ssr-addon-v5-slorber@^1.0.1: dependencies: "@babel/runtime" "^7.10.3" +"react-loadable@npm:@docusaurus/react-loadable@5.5.2": + version "5.5.2" + resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" + integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== + dependencies: + "@types/react" "*" + prop-types "^15.6.2" + react-markdown@^8.0.6: version "8.0.7" resolved "https://registry.yarnpkg.com/react-markdown/-/react-markdown-8.0.7.tgz#c8dbd1b9ba5f1c5e7e5f2a44de465a3caafdf89b" From bc0c06a26d70b3ba604aba9a0d71b9031298a7c8 Mon Sep 17 00:00:00 2001 From: Amanda Ng <10681923+ngamanda@users.noreply.github.com> Date: Sat, 29 Jun 2024 05:07:15 
+0800 Subject: [PATCH 02/33] feat(ui): display chart query if it exists (#10672) --- .../src/app/entity/chart/ChartEntity.tsx | 9 +++ .../src/app/entity/chart/ChartQueryTab.tsx | 61 +++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 datahub-web-react/src/app/entity/chart/ChartQueryTab.tsx diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index 2a54a4a96c6393..913d502972fe14 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -28,6 +28,7 @@ import { LOOKER_URN } from '../../ingest/source/builder/constants'; import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; import { IncidentTab } from '../shared/tabs/Incident/IncidentTab'; +import { ChartQueryTab } from './ChartQueryTab'; /** * Definition of the DataHub Chart entity. @@ -110,6 +111,14 @@ export class ChartEntity implements Entity { component: ChartStatsSummarySubHeader, }} tabs={[ + { + name: 'Query', + component: ChartQueryTab, + display: { + visible: (_, chart: GetChartQuery) => (chart?.chart?.query?.rawQuery && true) || false, + enabled: (_, chart: GetChartQuery) => (chart?.chart?.query?.rawQuery && true) || false, + }, + }, { name: 'Documentation', component: DocumentationTab, diff --git a/datahub-web-react/src/app/entity/chart/ChartQueryTab.tsx b/datahub-web-react/src/app/entity/chart/ChartQueryTab.tsx new file mode 100644 index 00000000000000..7c28f4be88d8d5 --- /dev/null +++ b/datahub-web-react/src/app/entity/chart/ChartQueryTab.tsx @@ -0,0 +1,61 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components'; +import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'; +import { GetChartQuery } from '../../../graphql/chart.generated'; +import { ANTD_GRAY } from '../shared/constants'; +import { useBaseEntity } from '../shared/EntityContext'; +import { InfoItem } from '../shared/components/styled/InfoItem'; + +const InfoSection = styled.div` + border-bottom: 1px solid ${ANTD_GRAY[4.5]}; + padding: 16px 20px; +`; + +const InfoItemContainer = styled.div<{ justifyContent }>` + display: flex; + position: relative; + justify-content: ${(props) => props.justifyContent}; + padding: 12px 2px; +`; + +const InfoItemContent = styled.div` + padding-top: 8px; +`; + +const QueryText = styled(Typography.Paragraph)` + margin-top: 20px; + background-color: ${ANTD_GRAY[2]}; +`; + +// NOTE: Yes, using `!important` is a shame. 
However, the SyntaxHighlighter is applying styles directly +// to the component, so there's no way around this +const NestedSyntax = styled(SyntaxHighlighter)` + background-color: transparent !important; + border: none !important; +`; + +export function ChartQueryTab() { + const baseEntity = useBaseEntity(); + const query = baseEntity?.chart?.query?.rawQuery || 'UNKNOWN'; + const type = baseEntity?.chart?.query?.type || 'UNKNOWN'; + + return ( + <> + + Details + + + {type.toUpperCase()} + + + + + Query + + {query} + + + + ); +} From cf3ccf0ba9f9042f19109db3888332dd51a0c295 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Sat, 29 Jun 2024 06:28:13 +0900 Subject: [PATCH 03/33] docs: update api overview (#10543) Co-authored-by: Harshal Sheth --- docs/api/datahub-apis.md | 74 ++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 44 deletions(-) diff --git a/docs/api/datahub-apis.md b/docs/api/datahub-apis.md index ed48eb0f52fa25..6bb793a59a86e8 100644 --- a/docs/api/datahub-apis.md +++ b/docs/api/datahub-apis.md @@ -1,58 +1,44 @@ -# Which DataHub API is for me? +# DataHub APIs + +DataHub has several APIs to manipulate metadata on the platform. Here's the list of APIs and their pros and cons to help you choose the right one for your use case. + +| API | Definition | Pros | Cons | +|--------------------------------------------------------------------------------|------------------------------------|------------------------------------------|-------------------------------------------------------------| +| **[Python SDK](/metadata-ingestion/as-a-library.md)** | SDK | Highly flexible, Good for bulk execution | Requires an understanding of the metadata change event | +| **[Java SDK](/metadata-integration/java/as-a-library.md)** | SDK | Highly flexible, Good for bulk execution | Requires an understanding of the metadata change event | +| **[GraphQL API](docs/api/graphql/getting-started.md)** | GraphQL interface | Intuitive; mirrors UI capabilities | Less flexible than SDKs; requires knowledge of GraphQL syntax | +| **[OpenAPI](docs/api/openapi/openapi-usage-guide.md)**
(Not Recommended) | Lower-level API for advanced users | | Generally not recommended for typical use cases | + +In general, **Python and Java SDKs** are our most recommended tools for extending and customizing the behavior of your DataHub instance. +We don't recommend using the **OpenAPI** directly, as it's more complex and less user-friendly than the other APIs. -DataHub supplys several APIs to manipulate metadata on the platform. These are our most-to-least recommended approaches: -- Our most recommended tools for extending and customizing the behavior of your DataHub instance are our SDKs in [Python](metadata-ingestion/as-a-library.md) and [Java](metadata-integration/java/as-a-library.md). -- If you'd like to customize the DataHub client or roll your own; the [GraphQL API](docs/api/graphql/getting-started.md) is our what powers our frontend. We figure if it's good enough for us, it's good enough for everyone! If `graphql` doesn't cover everything in your usecase, drop into [our slack](docs/slack.md) and let us know how we can improve it! -- If you are less familiar with `graphql` and would rather use OpenAPI, we offer [OpenAPI](docs/api/openapi/openapi-usage-guide.md) endpoints that allow you to produce metadata events and query metadata. -- Finally, if you're a brave soul and know exactly what you are doing... are you sure you don't just want to use the SDK directly? If you insist, the [Rest.li API](docs/api/restli/restli-overview.md) is a much more powerful, low level API intended only for advanced users. ## Python and Java SDK -We offer an SDK for both Python and Java that provide full functionality when it comes to CRUD operations and any complex functionality you may want to build into DataHub. - -Get started with the Python SDK - +We offer an SDK for both Python and Java that provide full functionality when it comes to CRUD operations and any complex functionality you may want to build into DataHub. We recommend using the SDKs for most use cases. Here are the examples of how to use the SDKs: + +- Define a lineage between data entities +- Executing bulk operations - e.g. adding tags to multiple datasets +- Creating custom metadata entities + +Learn more about the SDKs: +- **[Python SDK →](/metadata-ingestion/as-a-library.md)** +- **[Java SDK →](/metadata-integration/java/as-a-library.md)** - -Get started with the Java SDK - ## GraphQL API The `graphql` API serves as the primary public API for the platform. It can be used to fetch and update metadata programatically in the language of your choice. Intended as a higher-level API that simplifies the most common operations. - -Get started with the GraphQL API - - -## OpenAPI - -For developers who prefer OpenAPI to GraphQL for programmatic operations. Provides lower-level API access to the entire DataHub metadata model for writes, reads and queries. - -Get started with OpenAPI - - -## Rest.li API - -:::caution -The Rest.li API is intended only for advanced users. If you're just getting started with DataHub, we recommend the GraphQL API -::: - -The Rest.li API represents the underlying persistence layer, and exposes the raw PDL models used in storage. Under the hood, it powers the GraphQL API. Aside from that, it is also used for system-specific ingestion of metadata, being used by the Metadata Ingestion Framework for pushing metadata into DataHub directly. For all intents and purposes, the Rest.li API is considered system-internal, meaning DataHub components are the only ones to consume this API directly. 
- -Get started with our Rest.li API - +We recommend using the GraphQL API if you're getting started with DataHub since it's more user-friendly and straighfowrad. Here are some examples of how to use the GraphQL API: +- Search for datasets with conditions +- Update a certain field of a dataset + +Learn more about the GraphQL API: +- **[GraphQL API →](docs/api/graphql/getting-started.md)** + + ## DataHub API Comparison From 71e79ba40da6ee5b2eb6c232e83cec26d40f4b16 Mon Sep 17 00:00:00 2001 From: PeamThom <144461478+PeamThom@users.noreply.github.com> Date: Sat, 29 Jun 2024 04:31:22 +0700 Subject: [PATCH 04/33] refactor(web-react): add encoder to support non-ASCII characters csv download (#10496) --- datahub-web-react/src/app/search/utils/csvUtils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/search/utils/csvUtils.ts b/datahub-web-react/src/app/search/utils/csvUtils.ts index f701b90ab78e4c..cb29f5e4868240 100644 --- a/datahub-web-react/src/app/search/utils/csvUtils.ts +++ b/datahub-web-react/src/app/search/utils/csvUtils.ts @@ -1,5 +1,5 @@ export function downloadFile(data: string, title: string) { - const blobx = new Blob([data], { type: 'text/plain' }); // ! Blob + const blobx = new Blob([data], { type: 'text/plain;chartset=utf-8' }); // ! Blob const elemx = window.document.createElement('a'); elemx.href = window.URL.createObjectURL(blobx); // ! createObjectURL elemx.download = title; From 21706a4d6b8079f7be6960157ad846f386ea21c2 Mon Sep 17 00:00:00 2001 From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com> Date: Fri, 28 Jun 2024 17:41:12 -0400 Subject: [PATCH 05/33] fix(docs) adding dataset column tags docs (#10479) --- metadata-models/docs/entities/dataset.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-models/docs/entities/dataset.md b/metadata-models/docs/entities/dataset.md index 57bbf2194366f4..3d460406029ac7 100644 --- a/metadata-models/docs/entities/dataset.md +++ b/metadata-models/docs/entities/dataset.md @@ -88,7 +88,7 @@ Here is an example of how you can add a tag to a field in a dataset using the lo Python SDK: Add a tag to a column (field) of a dataset ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_column_term.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_column_tag.py show_path_as_comment }} ``` From d299c790f62d55bbc0d9b9b4368bc81a65f8faa2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 14:47:20 -0700 Subject: [PATCH 06/33] build(deps): bump ejs from 3.1.9 to 3.1.10 in /datahub-web-react (#10417) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- datahub-web-react/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/yarn.lock b/datahub-web-react/yarn.lock index edd8c337c23fd5..9b6503f1c4181e 100644 --- a/datahub-web-react/yarn.lock +++ b/datahub-web-react/yarn.lock @@ -5462,9 +5462,9 @@ duplexer@^0.1.2: integrity sha512-jtD6YG370ZCIi/9GTaJKQxWTZD045+4R4hTk/x1UyoqadyJ9x9CgSi1RlVDQF8U2sxLLSnFkCaMihqljHIWgMg== ejs@^3.1.5: - version "3.1.9" - resolved "https://registry.yarnpkg.com/ejs/-/ejs-3.1.9.tgz#03c9e8777fe12686a9effcef22303ca3d8eeb361" - integrity sha512-rC+QVNMJWv+MtPgkt0y+0rVEIdbtxVADApW9JXrUVlzHetgcyczP/E7DJmWJ4fJCZF2cPcBk0laWO9ZHMG3DmQ== + version "3.1.10" + resolved 
"https://registry.yarnpkg.com/ejs/-/ejs-3.1.10.tgz#69ab8358b14e896f80cc39e62087b88500c3ac3b" + integrity sha512-UeJmFfOrAQS8OJWPZ4qtgHyWExa088/MtK5UEyoJGFH67cDEXkZSviOiKRCZ4Xij0zxI3JECgYs3oKx+AizQBA== dependencies: jake "^10.8.5" From 9367f26b5bb1ef8fb8c075753dc6f927e27ad49f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Fri, 28 Jun 2024 23:49:52 +0200 Subject: [PATCH 07/33] fix(metadata-service): consider missing entities in form assignment hook (#10392) --- .../kafka/hook/form/FormAssignmentHook.java | 3 --- .../SearchBasedFormAssignmentManager.java | 20 ++++++++++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/form/FormAssignmentHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/form/FormAssignmentHook.java index d06c6f5102dfb5..8d093fe0b8a12d 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/form/FormAssignmentHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/form/FormAssignmentHook.java @@ -38,9 +38,6 @@ *

<p>3. When a form is hard deleted, any automations used for assigning the form, or validating * prompts, are automatically deleted. - *
<p>Note that currently, Datasets, Dashboards, Charts, Data Jobs, Data Flows, Containers, are the - * only asset types supported for this hook. - *

TODO: In the future, let's decide whether we want to support automations to auto-mark form * prompts as "completed" when they do in fact have the correct metadata. (Without user needing to * explicitly fill out a form prompt response) diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/service/util/SearchBasedFormAssignmentManager.java b/metadata-service/services/src/main/java/com/linkedin/metadata/service/util/SearchBasedFormAssignmentManager.java index 1f1ebaf520c43c..8a3eb463aa15c5 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/service/util/SearchBasedFormAssignmentManager.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/service/util/SearchBasedFormAssignmentManager.java @@ -18,7 +18,25 @@ public class SearchBasedFormAssignmentManager { private static final ImmutableList ENTITY_TYPES = - ImmutableList.of(Constants.DATASET_ENTITY_NAME); + ImmutableList.of( + Constants.DATASET_ENTITY_NAME, + Constants.DATA_JOB_ENTITY_NAME, + Constants.DATA_FLOW_ENTITY_NAME, + Constants.CHART_ENTITY_NAME, + Constants.DASHBOARD_ENTITY_NAME, + Constants.CORP_USER_ENTITY_NAME, + Constants.CORP_GROUP_ENTITY_NAME, + Constants.DOMAIN_ENTITY_NAME, + Constants.CONTAINER_ENTITY_NAME, + Constants.GLOSSARY_TERM_ENTITY_NAME, + Constants.GLOSSARY_NODE_ENTITY_NAME, + Constants.ML_MODEL_ENTITY_NAME, + Constants.ML_MODEL_GROUP_ENTITY_NAME, + Constants.ML_FEATURE_TABLE_ENTITY_NAME, + Constants.ML_FEATURE_ENTITY_NAME, + Constants.ML_PRIMARY_KEY_ENTITY_NAME, + Constants.DATA_PRODUCT_ENTITY_NAME, + Constants.SCHEMA_FIELD_ENTITY_NAME); public static void apply( OperationContext opContext, From b223281305ca106fd900423de8464ffab7e36a90 Mon Sep 17 00:00:00 2001 From: Teppo Naakka Date: Sat, 29 Jun 2024 00:50:08 +0300 Subject: [PATCH 08/33] feat(ingest/powerbi): powerbi dataset profiling (#9355) --- docs/quick-ingestion-guides/powerbi/setup.md | 2 +- .../docs/sources/powerbi/powerbi_pre.md | 9 + .../docs/sources/powerbi/powerbi_recipe.yml | 10 + .../ingestion/source/powerbi/config.py | 19 + .../ingestion/source/powerbi/powerbi.py | 61 ++++ .../powerbi/rest_api_wrapper/data_classes.py | 12 + .../powerbi/rest_api_wrapper/data_resolver.py | 162 ++++++++- .../powerbi/rest_api_wrapper/powerbi_api.py | 60 ++-- .../rest_api_wrapper/profiling_utils.py | 46 +++ .../source/powerbi/rest_api_wrapper/query.py | 17 + .../powerbi/golden_test_profiling.json | 170 +++++++++ .../integration/powerbi/test_profiling.py | 338 ++++++++++++++++++ 12 files changed, 876 insertions(+), 30 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/query.py create mode 100644 metadata-ingestion/tests/integration/powerbi/golden_test_profiling.json create mode 100644 metadata-ingestion/tests/integration/powerbi/test_profiling.py diff --git a/docs/quick-ingestion-guides/powerbi/setup.md b/docs/quick-ingestion-guides/powerbi/setup.md index 0aed3559fd3e5b..0ff52d1351f76a 100644 --- a/docs/quick-ingestion-guides/powerbi/setup.md +++ b/docs/quick-ingestion-guides/powerbi/setup.md @@ -70,7 +70,7 @@ In order to configure ingestion from PowerBI, you'll first have to ensure you ha - `Enhance admin APIs responses with detailed metadata` - `Enhance admin APIs responses with DAX and mashup expressions` - f. 
**Add Security Group to Workspace:** Navigate to `Workspaces` window and open workspace which you want to ingest as shown in below screenshot and click on `Access` and add `powerbi-connector-app-security-group` as member + f. **Add Security Group to Workspace:** Navigate to `Workspaces` window and open workspace which you want to ingest as shown in below screenshot and click on `Access` and add `powerbi-connector-app-security-group` as member. For most cases `Viewer` role is enough, but for profiling the `Contributor` role is required.
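A quick way to sanity-check this step is to confirm that the service principal can actually reach the workspace APIs before running ingestion. The sketch below is only an illustration, not part of this change: the tenant ID, client ID, and client secret are placeholders for the app registration created earlier, and it uses the `msal` and `requests` libraries to list the workspaces visible to the principal.

```python
import msal
import requests

# Placeholders for the app registration created earlier in this guide.
TENANT_ID = "<tenant-id>"
CLIENT_ID = "<client-id>"
CLIENT_SECRET = "<client-secret>"

app = msal.ConfidentialClientApplication(
    CLIENT_ID,
    authority=f"https://login.microsoftonline.com/{TENANT_ID}",
    client_credential=CLIENT_SECRET,
)

# Acquire an app-only token for the Power BI REST API.
token_result = app.acquire_token_for_client(
    scopes=["https://analysis.windows.net/powerbi/api/.default"]
)
access_token = token_result["access_token"]

# List the workspaces the principal can see; the workspace added above should appear here.
resp = requests.get(
    "https://api.powerbi.com/v1.0/myorg/groups",
    headers={"Authorization": f"Bearer {access_token}"},
)
resp.raise_for_status()
for workspace in resp.json().get("value", []):
    print(workspace["id"], workspace["name"])
```

If the workspace added above is missing from the output, revisit the security group membership and the admin API settings from the previous steps.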

workspace-window-underlined diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md index 52fe1cd77e8fde..b581e5fc8f70df 100644 --- a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md +++ b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md @@ -108,6 +108,14 @@ By default, extracting endorsement information to tags is disabled. The feature Please note that the default implementation overwrites tags for the ingested entities, if you need to preserve existing tags, consider using a [transformer](../../../../metadata-ingestion/docs/transformer/dataset_transformer.md#simple-add-dataset-globaltags) with `semantics: PATCH` tags instead of `OVERWRITE`. +## Profiling + +The profiling implementation is done through querying [DAX query endpoint](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/execute-queries). Therefore the principal needs to have permission to query the datasets to be profiled. Usually this means that the service principal should have `Contributor` role for the workspace to be ingested. Profiling is done with column based queries to be able to handle wide datasets without timeouts. + +Take into account that the profiling implementation exeutes fairly big amount of DAX queries and for big datasets this is substantial load to the PowerBI system. + +The `profiling_pattern` setting may be used to limit profiling actions to only a certain set of resources in PowerBI. Both allow and deny rules are matched against following pattern for every table in a PowerBI Dataset: `workspace_name.dataset_name.table_name`. User may limit profiling with these settings at table level, dataset level or workspace level. + ## Admin Ingestion vs. Basic Ingestion PowerBI provides two sets of API i.e. [Basic API and Admin API](https://learn.microsoft.com/en-us/rest/api/power-bi/). 
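To make the profiling mechanics described above concrete, here is a minimal sketch of the kind of column-statistics request the connector issues against the `executeQueries` endpoint. It mirrors the DAX shape added in `query.py` in this change; the access token, workspace ID, dataset ID, and the `articles`/`view_count` names are placeholders borrowed from the test fixtures, not values you should expect to exist.

```python
import requests

# Placeholders: supply a real Azure AD token and your own workspace/dataset IDs.
ACCESS_TOKEN = "<azure-ad-token>"
WORKSPACE_ID = "<workspace-id>"
DATASET_ID = "<dataset-id>"

# Column-level statistics, mirroring DaxQuery.column_data_query in this patch.
dax = """
EVALUATE ROW(
    "min", MIN('articles'[view_count]),
    "max", MAX('articles'[view_count]),
    "unique_count", COUNTROWS(DISTINCT('articles'[view_count]))
)"""

url = (
    "https://api.powerbi.com/v1.0/myorg/groups/"
    f"{WORKSPACE_ID}/datasets/{DATASET_ID}/executeQueries"
)
payload = {
    "queries": [{"query": dax}],
    "serializerSettings": {"includeNulls": True},
}
resp = requests.post(
    url, json=payload, headers={"Authorization": f"Bearer {ACCESS_TOKEN}"}
)
resp.raise_for_status()

# The first row of the first table holds the aggregates,
# e.g. {"[min]": 3, "[max]": 34333, "[unique_count]": 15}.
print(resp.json()["results"][0]["tables"][0]["rows"][0])
```

Because each request aggregates a single column into one row, a wide table is profiled as many small queries rather than one large one, which is what avoids the timeouts on wide datasets mentioned above.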
@@ -140,6 +148,7 @@ If you don't want to add a service principal as a member in your workspace, then Caveats of setting `admin_apis_only` to `true`: - Report's pages would not get ingested as page API is not available in PowerBI Admin API - [PowerBI Parameters](https://learn.microsoft.com/en-us/power-query/power-query-query-parameters) would not get resolved to actual values while processing M-Query for table lineage + - Dataset profiling is unavailable, as it requires access to the workspace API ### Basic Ingestion: Service Principal As Member In Workspace diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml b/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml index 31eed0bddaa6a8..ebd3dd50cfebc7 100644 --- a/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml +++ b/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml @@ -64,6 +64,16 @@ source: # extract powerbi dataset table schema extract_dataset_schema: true + # Enable PowerBI dataset profiling + profiling: + enabled: false + # Pattern to limit which resources to profile + # Matched resource format is following: + # workspace_name.dataset_name.table_name + profile_pattern: + deny: + - .* + sink: # sink configs diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 6e74bfda3743c7..bd80433bc2e6ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -46,6 +46,7 @@ class Constant: Authorization = "Authorization" WORKSPACE_ID = "workspaceId" DASHBOARD_ID = "powerbi.linkedin.com/dashboards/{}" + DATASET_EXECUTE_QUERIES = "DATASET_EXECUTE_QUERIES_POST" DATASET_ID = "datasetId" REPORT_ID = "reportId" SCAN_ID = "ScanId" @@ -59,9 +60,12 @@ class Constant: STATUS = "status" CHART_ID = "powerbi.linkedin.com/charts/{}" CHART_KEY = "chartKey" + COLUMN_TYPE = "columnType" + DATA_TYPE = "dataType" DASHBOARD = "dashboard" DASHBOARDS = "dashboards" DASHBOARD_KEY = "dashboardKey" + DESCRIPTION = "description" OWNERSHIP = "ownership" BROWSERPATH = "browsePaths" DASHBOARD_INFO = "dashboardInfo" @@ -108,6 +112,7 @@ class Constant: TABLES = "tables" EXPRESSION = "expression" SOURCE = "source" + SCHEMA_METADATA = "schemaMetadata" PLATFORM_NAME = "powerbi" REPORT_TYPE_NAME = BIAssetSubTypes.REPORT CHART_COUNT = "chartCount" @@ -238,6 +243,13 @@ class OwnershipMapping(ConfigModel): ) +class PowerBiProfilingConfig(ConfigModel): + enabled: bool = pydantic.Field( + default=False, + description="Whether profiling of PowerBI datasets should be done", + ) + + class PowerBiDashboardSourceConfig( StatefulIngestionConfigBase, DatasetSourceConfigMixin ): @@ -421,6 +433,13 @@ class PowerBiDashboardSourceConfig( "Works for M-Query where native SQL is used for transformation.", ) + profile_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns to filter tables for profiling during ingestion. Note that only tables " + "allowed by the `table_pattern` will be considered. 
Matched format is 'workspacename.datasetname.tablename'", + ) + profiling: PowerBiProfilingConfig = PowerBiProfilingConfig() + @root_validator(skip_on_failure=True) def validate_extract_column_level_lineage(cls, values: Dict) -> Dict: flags = [ diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 16f174525254dc..de4eaf6b64434f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -67,7 +67,9 @@ CorpUserKeyClass, DashboardInfoClass, DashboardKeyClass, + DatasetFieldProfileClass, DatasetLineageTypeClass, + DatasetProfileClass, DatasetPropertiesClass, GlobalTagsClass, OtherSchemaClass, @@ -483,9 +485,64 @@ def to_datahub_dataset( Constant.DATASET, dataset.tags, ) + self.extract_profile(dataset_mcps, workspace, dataset, table, ds_urn) return dataset_mcps + def extract_profile( + self, + dataset_mcps: List[MetadataChangeProposalWrapper], + workspace: powerbi_data_classes.Workspace, + dataset: powerbi_data_classes.PowerBIDataset, + table: powerbi_data_classes.Table, + ds_urn: str, + ) -> None: + if not self.__config.profiling.enabled: + # Profiling not enabled + return + + if not self.__config.profile_pattern.allowed( + f"{workspace.name}.{dataset.name}.{table.name}" + ): + logger.info( + f"Table {table.name} in {dataset.name}, not allowed for profiling" + ) + return + logger.debug(f"Profiling table: {table.name}") + + profile = DatasetProfileClass(timestampMillis=builder.get_sys_time()) + profile.rowCount = table.row_count + profile.fieldProfiles = [] + + columns: List[ + Union[powerbi_data_classes.Column, powerbi_data_classes.Measure] + ] = [*(table.columns or []), *(table.measures or [])] + for column in columns: + allowed_column = self.__config.profile_pattern.allowed( + f"{workspace.name}.{dataset.name}.{table.name}.{column.name}" + ) + if column.isHidden or not allowed_column: + logger.info(f"Column {column.name} not allowed for profiling") + continue + measure_profile = column.measure_profile + if measure_profile: + field_profile = DatasetFieldProfileClass(column.name or "") + field_profile.sampleValues = measure_profile.sample_values + field_profile.min = measure_profile.min + field_profile.max = measure_profile.max + field_profile.uniqueCount = measure_profile.unique_count + profile.fieldProfiles.append(field_profile) + + profile.columnCount = table.column_count + + mcp = MetadataChangeProposalWrapper( + entityType="dataset", + entityUrn=ds_urn, + aspectName="datasetProfile", + aspect=profile, + ) + dataset_mcps.append(mcp) + @staticmethod def transform_tags(tags: List[str]) -> GlobalTagsClass: return GlobalTagsClass( @@ -1180,6 +1237,10 @@ def report_to_datahub_work_units( SourceCapability.LINEAGE_FINE, "Disabled by default, configured using `extract_column_level_lineage`. 
", ) +@capability( + SourceCapability.DATA_PROFILING, + "Optionally enabled via configuration profiling.enabled", +) class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py index ce4dd9a7a0c0f6..6e8d939325d5b0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py @@ -85,6 +85,14 @@ def __hash__(self): return hash(self.__members()) +@dataclass +class MeasureProfile: + min: Optional[str] = None + max: Optional[str] = None + unique_count: Optional[int] = None + sample_values: Optional[List[str]] = None + + @dataclass class Column: name: str @@ -96,6 +104,7 @@ class Column: columnType: Optional[str] = None expression: Optional[str] = None description: Optional[str] = None + measure_profile: Optional[MeasureProfile] = None @dataclass @@ -108,6 +117,7 @@ class Measure: BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, StringTypeClass ] = dataclasses.field(default_factory=NullTypeClass) description: Optional[str] = None + measure_profile: Optional[MeasureProfile] = None @dataclass @@ -117,6 +127,8 @@ class Table: expression: Optional[str] = None columns: Optional[List[Column]] = None measures: Optional[List[Measure]] = None + row_count: Optional[int] = None + column_count: Optional[int] = None # Pointer to the parent dataset. dataset: Optional["PowerBIDataset"] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index fadd7a48b62f70..b190cf065b6e33 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from datetime import datetime, timedelta from time import sleep -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import msal import requests @@ -11,18 +11,27 @@ from requests.adapters import HTTPAdapter from urllib3 import Retry -from datahub.configuration.common import ConfigurationError +from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.ingestion.source.powerbi.config import Constant from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import ( + Column, Dashboard, + Measure, + MeasureProfile, Page, PowerBIDataset, Report, + Table, Tile, User, Workspace, new_powerbi_dataset, ) +from datahub.ingestion.source.powerbi.rest_api_wrapper.profiling_utils import ( + process_column_result, + process_sample_result, +) +from datahub.ingestion.source.powerbi.rest_api_wrapper.query import DaxQuery # Logger instance logger = logging.getLogger(__name__) @@ -107,6 +116,16 @@ def get_tiles_endpoint(self, workspace: Workspace, dashboard_id: str) -> str: def _get_pages_by_report(self, workspace: Workspace, report_id: str) -> List[Page]: pass + @abstractmethod + def profile_dataset( + self, + dataset: PowerBIDataset, + table: Table, + workspace_name: str, + profile_pattern: Optional[AllowDenyPattern], + ) -> None: + pass + @abstractmethod def get_dataset( self, 
workspace_id: str, dataset_id: str @@ -387,6 +406,7 @@ class RegularAPIResolver(DataResolverBase): Constant.REPORT_GET: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports/{REPORT_ID}", Constant.REPORT_LIST: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports", Constant.PAGE_BY_REPORT: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports/{REPORT_ID}/pages", + Constant.DATASET_EXECUTE_QUERIES: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/datasets/{DATASET_ID}/executeQueries", } def get_dataset( @@ -515,6 +535,134 @@ def _get_pages_by_report(self, workspace: Workspace, report_id: str) -> List[Pag def get_users(self, workspace_id: str, entity: str, entity_id: str) -> List[User]: return [] # User list is not available in regular access + def _execute_profiling_query(self, dataset: PowerBIDataset, query: str) -> dict: + dataset_query_endpoint: str = self.API_ENDPOINTS[ + Constant.DATASET_EXECUTE_QUERIES + ] + # Replace place holders + dataset_query_endpoint = dataset_query_endpoint.format( + POWERBI_BASE_URL=self.BASE_URL, + WORKSPACE_ID=dataset.workspace_id, + DATASET_ID=dataset.id, + ) + # Hit PowerBi + logger.info(f"Request to query endpoint URL={dataset_query_endpoint}") + + # Serializer is configured to include nulls so that the queried fields + # exist in the returned payloads. Only failed queries will result in KeyError + payload = { + "queries": [ + { + "query": query, + } + ], + "serializerSettings": { + "includeNulls": True, + }, + } + response = self._request_session.post( + dataset_query_endpoint, + json=payload, + headers=self.get_authorization_header(), + ) + response.raise_for_status() + return response.json() + + def _get_row_count(self, dataset: PowerBIDataset, table: Table) -> int: + query = DaxQuery.row_count_query(table.name) + try: + data = self._execute_profiling_query(dataset, query) + rows = data["results"][0]["tables"][0]["rows"] + count = rows[0]["[count]"] + return count + except requests.exceptions.RequestException as ex: + logger.warning(getattr(ex.response, "text", "")) + logger.warning( + f"Profiling failed for getting row count for dataset {dataset.id}, with status code {getattr(ex.response, 'status_code', None)}", + ) + except (KeyError, IndexError) as ex: + logger.warning( + f"Profiling failed for getting row count for dataset {dataset.id}, with {ex}" + ) + return 0 + + def _get_data_sample(self, dataset: PowerBIDataset, table: Table) -> dict: + try: + query = DaxQuery.data_sample_query(table.name) + data = self._execute_profiling_query(dataset, query) + return process_sample_result(data) + except requests.exceptions.RequestException as ex: + logger.warning(getattr(ex.response, "text", "")) + logger.warning( + f"Getting sample with TopN failed for dataset {dataset.id}, with status code {getattr(ex.response, 'status_code', None)}", + ) + except (KeyError, IndexError) as ex: + logger.warning( + f"Getting sample with TopN failed for dataset {dataset.id}, with {ex}" + ) + return {} + + def _get_column_data( + self, dataset: PowerBIDataset, table: Table, column: Union[Column, Measure] + ) -> dict: + try: + logger.debug(f"Column data query for {dataset.name}, {column.name}") + query = DaxQuery.column_data_query(table.name, column.name) + data = self._execute_profiling_query(dataset, query) + return process_column_result(data) + except requests.exceptions.RequestException as ex: + logger.warning(getattr(ex.response, "text", "")) + logger.warning( + f"Getting column statistics failed for dataset {dataset.name}, {column.name}, with status code {getattr(ex.response, 'status_code', None)}", + ) + except 
(KeyError, IndexError) as ex: + logger.warning( + f"Getting column statistics failed for dataset {dataset.name}, {column.name}, with {ex}" + ) + return {} + + def profile_dataset( + self, + dataset: PowerBIDataset, + table: Table, + workspace_name: str, + profile_pattern: Optional[AllowDenyPattern], + ) -> None: + if not profile_pattern: + logger.info("Profile pattern not configured, not profiling") + return + + if not profile_pattern.allowed(f"{workspace_name}.{dataset.name}.{table.name}"): + logger.info( + f"Table {table.name} in {dataset.name}, not allowed for profiling" + ) + return + + logger.info(f"Profiling table: {table.name}") + row_count = self._get_row_count(dataset, table) + sample = self._get_data_sample(dataset, table) + + table.row_count = row_count + column_count = 0 + + columns: List[Union[Column, Measure]] = [ + *(table.columns or []), + *(table.measures or []), + ] + for column in columns: + if column.isHidden: + continue + + column_sample = sample.get(column.name, None) if sample else None + column_stats = self._get_column_data(dataset, table, column) + + column.measure_profile = MeasureProfile( + sample_values=column_sample, **column_stats + ) + column_count += 1 + + table.column_count = column_count + class AdminAPIResolver(DataResolverBase): # Admin access endpoints @@ -817,3 +965,13 @@ def get_dataset_parameters( ) -> Dict[str, str]: logger.debug("Get dataset parameter is unsupported in Admin API") return {} + + def profile_dataset( + self, + dataset: PowerBIDataset, + table: Table, + workspace_name: str, + profile_pattern: Optional[AllowDenyPattern], + ) -> None: + logger.debug("Profile dataset is unsupported in Admin API") + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py index d72624bd70512a..7ce0dcb2e3cf49 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py @@ -287,11 +287,12 @@ def _parse_endorsement(endorsements: Optional[dict]) -> List[str]: return [endorsement] - def _get_workspace_datasets(self, scan_result: Optional[dict]) -> dict: + def _get_workspace_datasets(self, workspace: Workspace) -> dict: """ Filter out "dataset" from scan_result and return Dataset instance set """ dataset_map: dict = {} + scan_result = workspace.scan_result if scan_result is None: return dataset_map @@ -345,30 +346,37 @@ def _get_workspace_datasets(self, scan_result: Optional[dict]) -> dict: and len(table[Constant.SOURCE]) > 0 else None ) - dataset_instance.tables.append( - Table( - name=table[Constant.NAME], - full_name="{}.{}".format( - dataset_name.replace(" ", "_"), - table[Constant.NAME].replace(" ", "_"), - ), - expression=expression, - columns=[ - Column( - **column, - datahubDataType=FIELD_TYPE_MAPPING.get( - column["dataType"], FIELD_TYPE_MAPPING["Null"] - ), - ) - for column in table.get("columns", []) - ], - measures=[ - Measure(**measure) for measure in table.get("measures", []) - ], - dataset=dataset_instance, - ) + table = Table( + name=table[Constant.NAME], + full_name="{}.{}".format( + dataset_name.replace(" ", "_"), + table[Constant.NAME].replace(" ", "_"), + ), + expression=expression, + columns=[ + Column( + **column, + datahubDataType=FIELD_TYPE_MAPPING.get( + column["dataType"], FIELD_TYPE_MAPPING["Null"] + ), + ) + for column in table.get("columns", []) + ], + 
measures=[ + Measure(**measure) for measure in table.get("measures", []) + ], + dataset=dataset_instance, + row_count=None, + column_count=None, ) - + if self.__config.profiling.enabled: + self._get_resolver().profile_dataset( + dataset_instance, + table, + workspace.name, + self.__config.profile_pattern, + ) + dataset_instance.tables.append(table) return dataset_map def _fill_metadata_from_scan_result( @@ -393,9 +401,7 @@ def _fill_metadata_from_scan_result( independent_datasets=[], ) cur_workspace.scan_result = workspace_metadata - cur_workspace.datasets = self._get_workspace_datasets( - cur_workspace.scan_result - ) + cur_workspace.datasets = self._get_workspace_datasets(cur_workspace) # Fetch endorsements tag if it is enabled from configuration if self.__config.extract_endorsements_to_tags: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py new file mode 100644 index 00000000000000..35e4cea41264ad --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py @@ -0,0 +1,46 @@ +import re +from typing import Dict, List, Optional + + +def get_column_name(table_and_col: str) -> Optional[str]: + regex = re.compile(".*\\[(.*)\\]$") + m = regex.match(table_and_col) + if m: + return m.group(1) + return None + + +def process_sample_result(result_data: dict) -> dict: + sample_data_by_column: Dict[str, List[str]] = {} + rows = result_data["results"][0]["tables"][0]["rows"] + for sample in rows: + for key, value in sample.items(): + if not value: + continue + column_name = get_column_name(key) + + if not column_name: + continue + + if column_name not in sample_data_by_column: + sample_data_by_column[column_name] = [] + sample_data_by_column[column_name].append(str(value)) + return sample_data_by_column + + +def process_column_result(result_data: dict) -> dict: + sample_data_by_column: Dict[str, str] = {} + rows = result_data["results"][0]["tables"][0]["rows"] + for sample in rows: + for key, value in sample.items(): + if not value: + continue + column_name = get_column_name(key) + + if not column_name: + continue + + if column_name != "unique_count": + value = str(value) + sample_data_by_column[column_name] = value + return sample_data_by_column diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/query.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/query.py new file mode 100644 index 00000000000000..cb66210efc8e56 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/query.py @@ -0,0 +1,17 @@ +class DaxQuery: + @staticmethod + def data_sample_query(table_name: str) -> str: + return f"EVALUATE TOPN(3, '{table_name}')" + + @staticmethod + def column_data_query(table_name: str, column_name: str) -> str: + return f""" + EVALUATE ROW( + "min", MIN('{table_name}'[{column_name}]), + "max", MAX('{table_name}'[{column_name}]), + "unique_count", COUNTROWS ( DISTINCT ( '{table_name}'[{column_name}] ) ) + )""" + + @staticmethod + def row_count_query(table_name: str) -> str: + return f"""EVALUATE ROW("count", COUNTROWS ( '{table_name}' ))""" diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_profiling.json b/metadata-ingestion/tests/integration/powerbi/golden_test_profiling.json new file mode 100644 index 00000000000000..580a8d1a1db119 --- /dev/null +++ 
b/metadata-ingestion/tests/integration/powerbi/golden_test_profiling.json @@ -0,0 +1,170 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.articles,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = PostgreSQL.Database(\"localhost\" , \"mics\" ),\n public_order_date = Source{[Schema=\"public\",Item=\"order_date\"]}[Data] \n in \n public_order_date", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1645599600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.articles,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "articles", + "description": "Library Dataset", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1645599600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.articles,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1645599600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 542300, + "columnCount": 4, + "fieldProfiles": [ + { + "fieldPath": "link", + "uniqueCount": 15, + "min": "3", + "max": "34333", + "sampleValues": [ + "http://example.org", + "http://example.org/111/22/foo", + "http://example.org/111/22" + ] + }, + { + "fieldPath": "description", + "uniqueCount": 15, + "min": "3", + "max": "34333", + "sampleValues": [ + "this is a sample", + "this describes content", + "sample, this is" + ] + }, + { + "fieldPath": "topic", + "uniqueCount": 15, + "min": "3", + "max": "34333", + "sampleValues": [ + "urgent matters", + "urgent matters", + "normal matters" + ] + }, + { + "fieldPath": "view_count", + "uniqueCount": 15, + "min": "3", + "max": "34333", + "sampleValues": [ + "123455", + "123455", + "123455" + ] + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1645599600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.articles,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1645599600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.articles,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1645599600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.articles,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + 
"time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1645599600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_profiling.py b/metadata-ingestion/tests/integration/powerbi/test_profiling.py new file mode 100644 index 00000000000000..7955386de8940b --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/test_profiling.py @@ -0,0 +1,338 @@ +import logging +import sys +from typing import Any, Dict +from unittest import mock + +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from tests.test_helpers import mce_helpers + +FROZEN_TIME = "2022-02-23 07:00:00" + + +def scan_init_response(request, context): + # Request mock is passing POST input in the form of workspaces= + workspace_id = request.text.split("=")[1] + + w_id_vs_response: Dict[str, Any] = { + "64ED5CAD-7C10-4684-8180-826122881108": { + "id": "4674efd1-603c-4129-8d82-03cf2be05aff" + } + } + + return w_id_vs_response[workspace_id] + + +def admin_datasets_response(request, context): + return { + "value": [ + { + "id": "05169CD2-E713-41E6-9600-1D8066D95445", + "name": "library-dataset", + "webUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445", + } + ] + } + + +def execute_queries_response(request, context): + query = request.json()["queries"][0]["query"] + if "unique_count" in query: + return { + "results": [ + { + "tables": [ + { + "rows": [ + { + "[min]": 3, + "[max]": 34333, + "[unique_count]": 15, + }, + ] + } + ] + } + ], + } + elif "COUNTROWS" in query: + return { + "results": [ + { + "tables": [ + { + "rows": [ + { + "[count]": 542300, + }, + ] + } + ] + } + ], + } + elif "TOPN" in query: + return { + "results": [ + { + "tables": [ + { + "rows": [ + { + "[link]": "http://example.org", + "[description]": "this is a sample", + "[topic]": "urgent matters", + "[view_count]": 123455, + }, + { + "[link]": "http://example.org/111/22/foo", + "[description]": "this describes content", + "[topic]": "urgent matters", + "[view_count]": 123455, + }, + { + "[link]": "http://example.org/111/22", + "[description]": "sample, this is", + "[topic]": "normal matters", + "[view_count]": 123455, + }, + ] + } + ] + } + ], + } + + +def register_mock_admin_api(request_mock: Any, override_data: dict = {}) -> None: + api_vs_response = { + "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets": { + "method": "GET", + "status_code": 200, + "json": admin_datasets_response, + }, + "https://api.powerbi.com/v1.0/myorg/groups?%24top=1000&%24skip=0&%24filter=type+eq+%27Workspace%27": { + "method": "GET", + "status_code": 200, + "json": { + "@odata.count": 3, + "value": [ + { + "id": "64ED5CAD-7C10-4684-8180-826122881108", + "isReadOnly": True, + "name": "demo-workspace", + "type": "Workspace", + } + ], + }, + }, + "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/dashboards": { + "method": "GET", + "status_code": 200, + "json": {"value": []}, + }, + "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/datasources": { + "method": "GET", + "status_code": 200, + "json": { + "value": [ + { + "datasourceId": 
"DCE90B40-84D6-467A-9A5C-648E830E72D3", + "datasourceType": "PostgreSql", + "connectionDetails": { + "database": "library_db", + "server": "foo", + }, + }, + ] + }, + }, + "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/executeQueries": { + "method": "POST", + "status_code": 200, + "json": execute_queries_response, + }, + "https://api.powerbi.com/v1.0/myorg/admin/workspaces/scanStatus/4674efd1-603c-4129-8d82-03cf2be05aff": { + "method": "GET", + "status_code": 200, + "json": { + "status": "SUCCEEDED", + }, + }, + "https://api.powerbi.com/v1.0/myorg/admin/workspaces/scanStatus/a674efd1-603c-4129-8d82-03cf2be05aff": { + "method": "GET", + "status_code": 200, + "json": { + "status": "SUCCEEDED", + }, + }, + "https://api.powerbi.com/v1.0/myorg/admin/workspaces/scanResult/4674efd1-603c-4129-8d82-03cf2be05aff": { + "method": "GET", + "status_code": 200, + "json": { + "workspaces": [ + { + "id": "64ED5CAD-7C10-4684-8180-826122881108", + "name": "demo-workspace", + "state": "Active", + "datasets": [ + { + "id": "05169CD2-E713-41E6-9600-1D8066D95445", + "endorsementDetails": {"endorsement": "Promoted"}, + "name": "test_sf_pbi_test", + "tables": [ + { + "name": "articles", + "source": [ + { + "expression": 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + "columns": [ + { + "name": "link", + "description": "column description", + "dataType": "String", + "columnType": "DATA", + "isHidden": False, + }, + { + "name": "description", + "description": "column description", + "dataType": "String", + "columnType": "DATA", + "isHidden": False, + }, + { + "name": "topic", + "description": "column description", + "dataType": "String", + "columnType": "DATA", + "isHidden": False, + }, + ], + "measures": [ + { + "name": "view_count", + "description": "column description", + "expression": "let\n x", + "isHidden": False, + } + ], + }, + ], + }, + ], + "dashboards": [], + "reports": [], + }, + ] + }, + }, + "https://api.powerbi.com/v1.0/myorg/admin/workspaces/getInfo": { + "method": "POST", + "status_code": 200, + "json": scan_init_response, + }, + "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445": { + "method": "GET", + "status_code": 200, + "json": { + "id": "05169CD2-E713-41E6-9600-1D8066D95445", + "name": "library-dataset", + "description": "Library Dataset", + "webUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445", + }, + }, + } + + api_vs_response.update(override_data) + + for url in api_vs_response.keys(): + request_mock.register_uri( + api_vs_response[url]["method"], + url, + json=api_vs_response[url]["json"], + status_code=api_vs_response[url]["status_code"], + ) + + +def enable_logging(): + # set logging to console + logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) + logging.getLogger().setLevel(logging.DEBUG) + + +def mock_msal_cca(*args, **kwargs): + class MsalClient: + def acquire_token_for_client(self, *args, **kwargs): + return { + "access_token": "dummy", + } + + return MsalClient() + + +def default_source_config(): + return { + "client_id": "foo", + "client_secret": "bar", + "tenant_id": "0B0C960B-FCDF-4D0F-8C45-2E03BB59DDEB", + "workspace_id": 
"64ED5CAD-7C10-4684-8180-826122881108", + "extract_lineage": True, + "extract_reports": False, + "admin_apis_only": False, + "extract_ownership": True, + "convert_lineage_urns_to_lowercase": False, + "extract_independent_datasets": True, + "workspace_id_pattern": {"allow": ["64ED5CAD-7C10-4684-8180-826122881108"]}, + "extract_workspaces_to_containers": False, + "profiling": { + "enabled": True, + }, + "profile_pattern": {"allow": [".*"]}, + } + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_profiling(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): + enable_logging() + + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" + + register_mock_admin_api(request_mock=requests_mock) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_profiling.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "golden_test_profiling.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{tmp_path}/powerbi_profiling.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) From 2453da83488b947884ac0c99717e1ec14fca168b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Fri, 28 Jun 2024 23:55:03 +0200 Subject: [PATCH 09/33] fix(ui): show external url also in entity profile of containers (#10390) Co-authored-by: Harshal Sheth --- datahub-web-react/src/graphql/container.graphql | 1 + 1 file changed, 1 insertion(+) diff --git a/datahub-web-react/src/graphql/container.graphql b/datahub-web-react/src/graphql/container.graphql index efeca6da5e6ddb..749c1c9172b6d5 100644 --- a/datahub-web-react/src/graphql/container.graphql +++ b/datahub-web-react/src/graphql/container.graphql @@ -9,6 +9,7 @@ query getContainer($urn: String!) 
{ properties { name description + externalUrl customProperties { key value From 0b09181bc173b77baa6dcb7233e194e47eb255bb Mon Sep 17 00:00:00 2001 From: Raj Tekal Date: Fri, 28 Jun 2024 18:01:33 -0400 Subject: [PATCH 10/33] fix(ERModelRelationship) UUID should mimic datahub_guid.py (#10355) --- .../CreateERModelRelationshipResolver.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/ermodelrelationship/CreateERModelRelationshipResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/ermodelrelationship/CreateERModelRelationshipResolver.java index 61896ed1a0659f..cafd0b5ab082b2 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/ermodelrelationship/CreateERModelRelationshipResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/ermodelrelationship/CreateERModelRelationshipResolver.java @@ -54,14 +54,15 @@ public CompletableFuture get(DataFetchingEnvironment enviro highDataset = source; } // The following sequence mimics datahub.emitter.mce_builder.datahub_guid + // Keys have to be in alphabetical order - Destination, ERModelRelationName and Source String ermodelrelationKey = - "{\"Source\":\"" + "{\"Destination\":\"" + lowDataset - + "\",\"Destination\":\"" - + highDataset + "\",\"ERModelRelationName\":\"" + ermodelrelationName + + "\",\"Source\":\"" + + highDataset + "\"}"; byte[] mybytes = ermodelrelationKey.getBytes(StandardCharsets.UTF_8); From 1fafc6c56c39e7025b974427ad4f18f0f22cd285 Mon Sep 17 00:00:00 2001 From: Sukeerthi Mandyam <69302927+Sukeerthi31@users.noreply.github.com> Date: Sat, 29 Jun 2024 03:37:26 +0530 Subject: [PATCH 11/33] chore(vulnerability): Inefficient Regular Expression - Potential high time complexity leading to ReDoS (#10315) --- datahub-web-react/src/app/ingest/source/utils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/source/utils.ts b/datahub-web-react/src/app/ingest/source/utils.ts index 43b03b82e4babd..49cc8a108e69e2 100644 --- a/datahub-web-react/src/app/ingest/source/utils.ts +++ b/datahub-web-react/src/app/ingest/source/utils.ts @@ -132,7 +132,7 @@ export const getExecutionRequestStatusDisplayColor = (status: string) => { export const validateURL = (fieldName: string) => { return { validator(_, value) { - const URLPattern = new RegExp(/^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w.-]+)+[\w\-._~:/?#[\]@!$&'()*+,;=.]+$/); + const URLPattern = new RegExp(/^(?:http(s)?:\/\/)?[\w.-]+(?:\.[a-zA-Z0-9.-]{2,})+[\w\-._~:/?#[\]@!$&'()*+,;=.]+$/); const isURLValid = URLPattern.test(value); if (!value || isURLValid) { return Promise.resolve(); From 19336fec50b1515e6a2ebb262e265245932186b7 Mon Sep 17 00:00:00 2001 From: Pinaki Bhattacharjee Date: Sat, 29 Jun 2024 03:38:16 +0530 Subject: [PATCH 12/33] chore(vulnerability): Bumped up reactour version to address high vulnerability (#10218) --- datahub-web-react/package.json | 2 +- datahub-web-react/yarn.lock | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/datahub-web-react/package.json b/datahub-web-react/package.json index e2bc48164e77da..ca53932eba5189 100644 --- a/datahub-web-react/package.json +++ b/datahub-web-react/package.json @@ -71,7 +71,7 @@ "react-router-dom": "^5.3", "react-syntax-highlighter": "^15.4.4", "react-visibility-sensor": "^5.1.1", - "reactour": "1.18.7", + "reactour": "^1.19.3", "remirror": "^2.0.23", "styled-components": "^5.2.1", 
"turndown-plugin-gfm": "^1.0.2", diff --git a/datahub-web-react/yarn.lock b/datahub-web-react/yarn.lock index 9b6503f1c4181e..4268acf155de1d 100644 --- a/datahub-web-react/yarn.lock +++ b/datahub-web-react/yarn.lock @@ -7332,11 +7332,6 @@ lodash.merge@^4.6.2: resolved "https://registry.yarnpkg.com/lodash.merge/-/lodash.merge-4.6.2.tgz#558aa53b43b661e1925a0afdfa36a9a1085fe57a" integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ== -lodash.pick@4.4.0: - version "4.4.0" - resolved "https://registry.npmjs.org/lodash.pick/-/lodash.pick-4.4.0.tgz#52f05610fff9ded422611441ed1fc123a03001b3" - integrity sha1-UvBWEP/53tQiYRRB7R/BI6AwAbM= - lodash@^4.0.1, lodash@^4.17.11, lodash@^4.17.15, lodash@^4.17.20, lodash@^4.17.21, lodash@~4.17.0: version "4.17.21" resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c" @@ -9163,16 +9158,15 @@ reactcss@^1.2.0: dependencies: lodash "^4.0.1" -reactour@1.18.7: - version "1.18.7" - resolved "https://registry.yarnpkg.com/reactour/-/reactour-1.18.7.tgz#86a61869890bc3444c2bf412d30833eb85352525" - integrity sha512-kkXy4h5+fieNPzrPYdWiLj6afl+xH2NQw4En9XJD9EwCIGsRmfcppyJ1xwqJDC6JRhPttf+5wUIsUoLSOlk/Ag== +reactour@^1.19.3: + version "1.19.3" + resolved "https://registry.yarnpkg.com/reactour/-/reactour-1.19.3.tgz#d83877496a3af211a881fba3709276accf2ac3e7" + integrity sha512-a2/j+xmbGlPMQhMy/zdpSmJN1STc0PcLUxhJbAC+bOcYdWHStFR02j6dhNuF/+IeqpX/d44PvlrRXSTt4LGk6w== dependencies: "@rooks/use-mutation-observer" "4.11.2" classnames "2.3.1" focus-outline-manager "^1.0.2" lodash.debounce "4.0.8" - lodash.pick "4.4.0" prop-types "15.7.2" react-focus-lock "2.5.2" scroll-smooth "1.1.1" From b4b8a23302c733d9944964ff151a432a18a760ae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 13/33] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From a35089048abfeef24aef30767e1d0b722b3786dd Mon Sep 17 00:00:00 2001 From: John Joyce Date: Fri, 28 Jun 2024 15:21:37 -0700 Subject: [PATCH 14/33] feat(backend): Add new PDL entities + models for persona capture (#9637) Co-authored-by: John Joyce --- .../linkedin/datahub/graphql/Constants.java | 10 ++++++ .../datahub/graphql/GmsGraphQLEngine.java | 14 +++++++++ .../graphql/types/corpuser/CorpUserType.java | 19 +++++++++++- .../mappers/CorpUserEditableInfoMapper.java | 19 ++++++++++++ .../src/main/resources/entity.graphql | 31 +++++++++++++++++++ .../identity/CorpUserEditableInfo.pdl | 21 +++++++++++++ .../metadata/key/DataHubPersonaKey.pdl | 14 +++++++++ .../linkedin/persona/DataHubPersonaInfo.pdl | 10 ++++++ .../src/main/resources/entity-registry.yml | 5 +++ .../com.linkedin.entity.aspects.snapshot.json | 23 ++++++++++++++ ...com.linkedin.entity.entities.snapshot.json | 23 ++++++++++++++ .../com.linkedin.entity.runs.snapshot.json | 23 ++++++++++++++ ...nkedin.operations.operations.snapshot.json | 23 ++++++++++++++ ...m.linkedin.platform.platform.snapshot.json | 23 ++++++++++++++ 14 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubPersonaKey.pdl create mode 100644 metadata-models/src/main/pegasus/com/linkedin/persona/DataHubPersonaInfo.pdl diff --git 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java index 6fc6edc66f3572..f70c46ba943a5a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java @@ -1,5 +1,8 @@ package com.linkedin.datahub.graphql; +import com.google.common.collect.ImmutableSet; +import java.util.Set; + /** Constants relating to GraphQL type system & execution. */ public class Constants { @@ -28,4 +31,11 @@ private Constants() {} public static final String BROWSE_PATH_V2_DELIMITER = "␟"; public static final String VERSION_STAMP_FIELD_NAME = "versionStamp"; public static final String ENTITY_FILTER_NAME = "_entityType"; + + public static final Set DEFAULT_PERSONA_URNS = + ImmutableSet.of( + "urn:li:dataHubPersona:technicalUser", + "urn:li:dataHubPersona:businessUser", + "urn:li:dataHubPersona:dataLeader", + "urn:li:dataHubPersona:dataSteward"); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index b17e4bd386bdac..6f2e250c17c34e 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -42,6 +42,7 @@ import com.linkedin.datahub.graphql.generated.CorpGroup; import com.linkedin.datahub.graphql.generated.CorpGroupInfo; import com.linkedin.datahub.graphql.generated.CorpUser; +import com.linkedin.datahub.graphql.generated.CorpUserEditableProperties; import com.linkedin.datahub.graphql.generated.CorpUserInfo; import com.linkedin.datahub.graphql.generated.CorpUserViewsSettings; import com.linkedin.datahub.graphql.generated.Dashboard; @@ -53,6 +54,7 @@ import com.linkedin.datahub.graphql.generated.DataHubView; import com.linkedin.datahub.graphql.generated.DataJob; import com.linkedin.datahub.graphql.generated.DataJobInputOutput; +import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.DataPlatformInstance; import com.linkedin.datahub.graphql.generated.DataQualityContract; import com.linkedin.datahub.graphql.generated.Dataset; @@ -1823,6 +1825,18 @@ private void configureCorpUserResolvers(final RuntimeWiring.Builder builder) { new LoadableTypeResolver<>( corpUserType, (env) -> ((CorpUserInfo) env.getSource()).getManager().getUrn()))); + builder.type( + "CorpUserEditableProperties", + typeWiring -> + typeWiring.dataFetcher( + "platforms", + new LoadableTypeBatchResolver<>( + dataPlatformType, + (env) -> + ((CorpUserEditableProperties) env.getSource()) + .getPlatforms().stream() + .map(DataPlatform::getUrn) + .collect(Collectors.toList())))); } /** diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/CorpUserType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/CorpUserType.java index b1ce42e72482a5..3c2bfd7225edf5 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/CorpUserType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/CorpUserType.java @@ -1,11 +1,13 @@ package com.linkedin.datahub.graphql.types.corpuser; +import static com.linkedin.datahub.graphql.Constants.DEFAULT_PERSONA_URNS; import static 
com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils.*; import static com.linkedin.metadata.Constants.*; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; import com.google.common.collect.ImmutableList; +import com.linkedin.common.UrnArray; import com.linkedin.common.url.Url; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; @@ -14,6 +16,8 @@ import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.exception.AuthorizationException; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode; +import com.linkedin.datahub.graphql.exception.DataHubGraphQLException; import com.linkedin.datahub.graphql.featureflags.FeatureFlags; import com.linkedin.datahub.graphql.generated.AutoCompleteResults; import com.linkedin.datahub.graphql.generated.CorpUser; @@ -246,7 +250,20 @@ private RecordTemplate mapCorpUserEditableInfo( if (input.getEmail() != null) { result.setEmail(input.getEmail()); } - + if (input.getPlatformUrns() != null) { + result.setPlatforms( + new UrnArray( + input.getPlatformUrns().stream().map(UrnUtils::getUrn).collect(Collectors.toList()))); + } + if (input.getPersonaUrn() != null) { + if (DEFAULT_PERSONA_URNS.contains(input.getPersonaUrn())) { + result.setPersona(UrnUtils.getUrn(input.getPersonaUrn())); + } else { + throw new DataHubGraphQLException( + String.format("Provided persona urn %s does not exist", input.getPersonaUrn()), + DataHubGraphQLErrorCode.NOT_FOUND); + } + } return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/mappers/CorpUserEditableInfoMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/mappers/CorpUserEditableInfoMapper.java index 1ff2f069b8112c..38f3c75d7a9fa8 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/mappers/CorpUserEditableInfoMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpuser/mappers/CorpUserEditableInfoMapper.java @@ -2,7 +2,10 @@ import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.CorpUserEditableProperties; +import com.linkedin.datahub.graphql.generated.DataHubPersona; +import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -38,6 +41,22 @@ public CorpUserEditableProperties apply( if (info.hasPictureLink()) { result.setPictureLink(info.getPictureLink().toString()); } + if (info.hasPlatforms()) { + result.setPlatforms( + info.getPlatforms().stream() + .map( + urn -> { + DataPlatform platform = new DataPlatform(); + platform.setUrn(urn.toString()); + return platform; + }) + .collect(Collectors.toList())); + } + if (info.hasPersona()) { + DataHubPersona persona = new DataHubPersona(); + persona.setUrn(info.getPersona().toString()); + result.setPersona(persona); + } return result; } } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index d48a9976e15d77..89c7b4a4cd0556 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -4139,6 +4139,16 @@ type CorpUserEditableProperties { Email address for 
the user """ email: String + + """ + User persona, if present + """ + persona: DataHubPersona + + """ + Platforms commonly used by the user, if present. + """ + platforms: [DataPlatform!] } """ @@ -4189,6 +4199,16 @@ input CorpUserUpdateInput { Email address for the user """ email: String + + """ + The platforms that the user frequently works with + """ + platformUrns: [String!] + + """ + The user's persona urn" + """ + personaUrn: String } """ @@ -12142,6 +12162,7 @@ input CreateDataProductPropertiesInput { description: String } + """ Input properties required for update a DataProduct """ @@ -12307,6 +12328,16 @@ input UpdateOwnershipTypeInput { description: String } +""" +A standardized type of a user +""" +type DataHubPersona { + """ + The urn of the persona type + """ + urn: String! +} + """ Describes a generic filter on a dataset """ diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl index 48ee53377e5820..9667c93c8b7709 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl @@ -1,6 +1,7 @@ namespace com.linkedin.identity import com.linkedin.common.Url +import com.linkedin.common.Urn /** * Linkedin corp user information that can be edited from UI @@ -56,6 +57,26 @@ record CorpUserEditableInfo { */ title: optional string + /** + * The platforms that the user commonly works with + */ + @Relationship = { + "/*": { + "name": "IsUserOf", + "entityTypes": ["dataPlatform"] + } + } + platforms: optional array[Urn] + + /** + * The user's persona type, based on their role + */ + @Relationship = { + "name": "IsPersona", + "entityTypes": ["dataHubPersona"] + } + persona: optional Urn + /** * Slack handle for the user */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubPersonaKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubPersonaKey.pdl new file mode 100644 index 00000000000000..296444221af3a5 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubPersonaKey.pdl @@ -0,0 +1,14 @@ +namespace com.linkedin.metadata.key + +/** + * Key for a persona type + */ +@Aspect = { + "name": "dataHubPersonaKey" +} +record DataHubPersonaKey { + /** + * A unique id for the persona type + */ + id: string +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/persona/DataHubPersonaInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/persona/DataHubPersonaInfo.pdl new file mode 100644 index 00000000000000..fb422993b27ddc --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/persona/DataHubPersonaInfo.pdl @@ -0,0 +1,10 @@ +namespace com.linkedin.persona + +/** + * Placeholder aspect for persona type info + */ +@Aspect = { + "name": "dataHubPersonaInfo" +} +record DataHubPersonaInfo { +} diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index ed19cd3a1d4860..6a6683418bf386 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -542,6 +542,11 @@ entities: - dataContractProperties - dataContractStatus - status + - name: dataHubPersona + category: internal + keyAspect: dataHubPersonaKey + aspects: + - dataHubPersonaInfo - name: entityType doc: A type of entity in the DataHub Metadata Model. 
category: core diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index bfa887ffda1175..c40137b265cff0 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -2353,6 +2353,29 @@ "type" : "string", "doc" : "DataHub-native Title, e.g. 'Software Engineer'", "optional" : true + }, { + "name" : "platforms", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "The platforms that the user commonly works with", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataPlatform" ], + "name" : "IsUserOf" + } + } + }, { + "name" : "persona", + "type" : "com.linkedin.common.Urn", + "doc" : "The user's persona type, based on their role", + "optional" : true, + "Relationship" : { + "entityTypes" : [ "dataHubPersona" ], + "name" : "IsPersona" + } }, { "name" : "slack", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 5dcedfecf99ca4..aeb5fbef5af2f2 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -2766,6 +2766,29 @@ "type" : "string", "doc" : "DataHub-native Title, e.g. 'Software Engineer'", "optional" : true + }, { + "name" : "platforms", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "The platforms that the user commonly works with", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataPlatform" ], + "name" : "IsUserOf" + } + } + }, { + "name" : "persona", + "type" : "com.linkedin.common.Urn", + "doc" : "The user's persona type, based on their role", + "optional" : true, + "Relationship" : { + "entityTypes" : [ "dataHubPersona" ], + "name" : "IsPersona" + } }, { "name" : "slack", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index a665548fcd078d..18ef55011ed5af 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -2086,6 +2086,29 @@ "type" : "string", "doc" : "DataHub-native Title, e.g. 
'Software Engineer'", "optional" : true + }, { + "name" : "platforms", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "The platforms that the user commonly works with", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataPlatform" ], + "name" : "IsUserOf" + } + } + }, { + "name" : "persona", + "type" : "com.linkedin.common.Urn", + "doc" : "The user's persona type, based on their role", + "optional" : true, + "Relationship" : { + "entityTypes" : [ "dataHubPersona" ], + "name" : "IsPersona" + } }, { "name" : "slack", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index e08a6eecd0e6e3..cf059788209119 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -2080,6 +2080,29 @@ "type" : "string", "doc" : "DataHub-native Title, e.g. 'Software Engineer'", "optional" : true + }, { + "name" : "platforms", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "The platforms that the user commonly works with", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataPlatform" ], + "name" : "IsUserOf" + } + } + }, { + "name" : "persona", + "type" : "com.linkedin.common.Urn", + "doc" : "The user's persona type, based on their role", + "optional" : true, + "Relationship" : { + "entityTypes" : [ "dataHubPersona" ], + "name" : "IsPersona" + } }, { "name" : "slack", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 8f4c871405e245..15f16dd2ea6cd0 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -2760,6 +2760,29 @@ "type" : "string", "doc" : "DataHub-native Title, e.g. 
'Software Engineer'", "optional" : true + }, { + "name" : "platforms", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "The platforms that the user commonly works with", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataPlatform" ], + "name" : "IsUserOf" + } + } + }, { + "name" : "persona", + "type" : "com.linkedin.common.Urn", + "doc" : "The user's persona type, based on their role", + "optional" : true, + "Relationship" : { + "entityTypes" : [ "dataHubPersona" ], + "name" : "IsPersona" + } }, { "name" : "slack", "type" : "string", From b4e05050d168e7326c8cc7fd549bc5c4a591c3ce Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Sat, 29 Jun 2024 02:50:10 -0500 Subject: [PATCH 15/33] feat(logging): unified request logging (graphql, openapi, restli) (#10802) --- metadata-operation-context/build.gradle | 2 + .../metadata/context/RequestContext.java | 119 ++++++++++++++++-- .../datahub/graphql/GraphQLController.java | 16 ++- .../datahub/graphql/SpringQueryContext.java | 27 ++-- .../openapi-analytics-servlet/build.gradle | 2 +- .../delegates/DatahubUsageEventsImpl.java | 6 +- .../v2/delegates/EntityApiDelegateImpl.java | 33 +++-- .../JavaSpring/apiController.mustache | 2 +- .../OpenAPIEntityTestConfiguration.java | 8 +- metadata-service/openapi-servlet/build.gradle | 1 + .../controller/GenericEntitiesController.java | 54 ++++++-- .../controller/LineageApiImpl.java | 7 +- .../elastic/OperationsController.java | 23 +++- .../v1/entities/EntitiesController.java | 10 ++ .../v2/controller/EntityController.java | 12 +- .../PlatformEntitiesController.java | 4 + .../v2/controller/TimeseriesController.java | 6 +- .../java/entities/EntitiesControllerTest.java | 2 +- .../resources/analytics/Analytics.java | 5 +- .../resources/entity/AspectResource.java | 10 +- .../entity/BatchIngestionRunResource.java | 4 +- .../resources/entity/EntityResource.java | 46 +++---- .../resources/entity/EntityV2Resource.java | 4 +- .../entity/EntityVersionedV2Resource.java | 2 +- .../operations/OperationsResource.java | 6 +- .../metadata/resources/operations/Utils.java | 6 +- .../metadata/resources/usage/UsageStats.java | 6 +- 27 files changed, 320 insertions(+), 103 deletions(-) diff --git a/metadata-operation-context/build.gradle b/metadata-operation-context/build.gradle index 1be98cb0140f3a..650082ef0d25e6 100644 --- a/metadata-operation-context/build.gradle +++ b/metadata-operation-context/build.gradle @@ -7,6 +7,8 @@ dependencies { api project(':metadata-auth:auth-api') implementation externalDependency.slf4jApi + implementation externalDependency.servletApi + implementation spec.product.pegasus.restliServer compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java index 83090b92357012..dcea185fcbc7ca 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java @@ -1,5 +1,8 @@ package io.datahubproject.metadata.context; +import com.google.common.net.HttpHeaders; +import com.linkedin.restli.server.ResourceContext; +import jakarta.servlet.http.HttpServletRequest; import java.util.Arrays; import java.util.Collection; import 
java.util.List; @@ -11,7 +14,9 @@ import javax.annotation.Nullable; import lombok.Builder; import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +@Slf4j @Getter @Builder public class RequestContext implements ContextInterface { @@ -19,6 +24,8 @@ public class RequestContext implements ContextInterface { public static final RequestContext TEST = RequestContext.builder().requestID("test").requestAPI(RequestAPI.TEST).build(); + @Nonnull private final String actorUrn; + @Nonnull private final String sourceIP; @Nonnull private final RequestAPI requestAPI; /** @@ -27,6 +34,23 @@ public class RequestContext implements ContextInterface { */ @Nonnull private final String requestID; + @Nonnull private final String userAgent; + + public RequestContext( + @Nonnull String actorUrn, + @Nonnull String sourceIP, + @Nonnull RequestAPI requestAPI, + @Nonnull String requestID, + @Nonnull String userAgent) { + this.actorUrn = actorUrn; + this.sourceIP = sourceIP; + this.requestAPI = requestAPI; + this.requestID = requestID; + this.userAgent = userAgent; + // Uniform common logging of requests across APIs + log.info(toString()); + } + @Override public Optional getCacheKeyComponent() { return Optional.empty(); @@ -34,39 +58,76 @@ public Optional getCacheKeyComponent() { public static class RequestContextBuilder { private RequestContext build() { - return new RequestContext(this.requestAPI, this.requestID); + return new RequestContext( + this.actorUrn, this.sourceIP, this.requestAPI, this.requestID, this.userAgent); } - public RequestContext buildGraphql(@Nonnull String queryName, Map variables) { + public RequestContext buildGraphql( + @Nonnull String actorUrn, + @Nonnull HttpServletRequest request, + @Nonnull String queryName, + Map variables) { + actorUrn(actorUrn); + sourceIP(extractSourceIP(request)); requestAPI(RequestAPI.GRAPHQL); requestID(buildRequestId(queryName, Set.of())); + userAgent(extractUserAgent(request)); return build(); } - public RequestContext buildRestli(String action, @Nullable String entityName) { - return buildRestli(action, entityName == null ? null : List.of(entityName)); + public RequestContext buildRestli( + @Nonnull String actorUrn, + @Nullable ResourceContext resourceContext, + String action, + @Nullable String entityName) { + return buildRestli( + actorUrn, resourceContext, action, entityName == null ? null : List.of(entityName)); } - public RequestContext buildRestli(@Nonnull String action, @Nullable String[] entityNames) { + public RequestContext buildRestli( + @Nonnull String actorUrn, + @Nullable ResourceContext resourceContext, + @Nonnull String action, + @Nullable String[] entityNames) { return buildRestli( + actorUrn, + resourceContext, action, entityNames == null ? null : Arrays.stream(entityNames).collect(Collectors.toList())); } - public RequestContext buildRestli(String action, @Nullable Collection entityNames) { + public RequestContext buildRestli( + @Nonnull String actorUrn, + @Nullable ResourceContext resourceContext, + String action, + @Nullable Collection entityNames) { + actorUrn(actorUrn); + sourceIP(resourceContext == null ? "" : extractSourceIP(resourceContext)); requestAPI(RequestAPI.RESTLI); requestID(buildRequestId(action, entityNames)); + userAgent(resourceContext == null ? "" : extractUserAgent(resourceContext)); return build(); } - public RequestContext buildOpenapi(@Nonnull String action, @Nullable String entityName) { - return buildOpenapi(action, entityName == null ? 
null : List.of(entityName)); + public RequestContext buildOpenapi( + @Nonnull String actorUrn, + @Nonnull HttpServletRequest request, + @Nonnull String action, + @Nullable String entityName) { + return buildOpenapi( + actorUrn, request, action, entityName == null ? null : List.of(entityName)); } public RequestContext buildOpenapi( - @Nonnull String action, @Nullable Collection entityNames) { + @Nonnull String actorUrn, + @Nullable HttpServletRequest request, + @Nonnull String action, + @Nullable Collection entityNames) { + actorUrn(actorUrn); + sourceIP(request == null ? "" : extractSourceIP(request)); requestAPI(RequestAPI.OPENAPI); requestID(buildRequestId(action, entityNames)); + userAgent(request == null ? "" : extractUserAgent(request)); return build(); } @@ -77,6 +138,46 @@ private static String buildRequestId( : String.format( "%s(%s)", action, entityNames.stream().distinct().collect(Collectors.toList())); } + + private static String extractUserAgent(@Nonnull HttpServletRequest request) { + return Optional.ofNullable(request.getHeader(HttpHeaders.USER_AGENT)).orElse(""); + } + + private static String extractUserAgent(@Nonnull ResourceContext resourceContext) { + return Optional.ofNullable(resourceContext.getRequestHeaders().get(HttpHeaders.USER_AGENT)) + .orElse(""); + } + + private static String extractSourceIP(@Nonnull HttpServletRequest request) { + return Optional.ofNullable(request.getHeader(HttpHeaders.X_FORWARDED_FOR)) + .orElse(request.getRemoteAddr()); + } + + private static String extractSourceIP(@Nonnull ResourceContext resourceContext) { + return Optional.ofNullable( + resourceContext.getRequestHeaders().get(HttpHeaders.X_FORWARDED_FOR)) + .orElse(resourceContext.getRawRequestContext().getLocalAttr("REMOTE_ADDR").toString()); + } + } + + @Override + public String toString() { + return "RequestContext{" + + "actorUrn='" + + actorUrn + + '\'' + + ", sourceIP='" + + sourceIP + + '\'' + + ", requestAPI=" + + requestAPI + + ", requestID='" + + requestID + + '\'' + + ", userAgent='" + + userAgent + + '\'' + + '}'; } public enum RequestAPI { diff --git a/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/GraphQLController.java b/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/GraphQLController.java index af69dce89041e6..2f383b1956313d 100644 --- a/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/GraphQLController.java +++ b/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/GraphQLController.java @@ -59,7 +59,8 @@ public GraphQLController() { private static final int MAX_LOG_WIDTH = 512; @PostMapping(value = "/graphql", produces = "application/json;charset=utf-8") - CompletableFuture> postGraphQL(HttpEntity httpEntity) { + CompletableFuture> postGraphQL( + HttpServletRequest request, HttpEntity httpEntity) { String jsonStr = httpEntity.getBody(); ObjectMapper mapper = new ObjectMapper(); @@ -117,13 +118,18 @@ CompletableFuture> postGraphQL(HttpEntity httpEnt SpringQueryContext context = new SpringQueryContext( - true, authentication, _authorizerChain, systemOperationContext, query, variables); + true, + authentication, + _authorizerChain, + systemOperationContext, + request, + operationName, + query, + variables); Span.current().setAttribute("actor.urn", context.getActorUrn()); - // operationName is an optional field only required if multiple operations are present - final String queryName = operationName != null ? 
operationName : context.getQueryName(); final String threadName = Thread.currentThread().getName(); - log.info("Processing request, operation: {}, actor urn: {}", queryName, context.getActorUrn()); + final String queryName = context.getQueryName(); log.debug("Query: {}, variables: {}", query, variables); return GraphQLConcurrencyUtils.supplyAsync( diff --git a/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/SpringQueryContext.java b/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/SpringQueryContext.java index 591e1158879d44..8aa22efb785e25 100644 --- a/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/SpringQueryContext.java +++ b/metadata-service/graphql-servlet-impl/src/main/java/com/datahub/graphql/SpringQueryContext.java @@ -7,8 +7,10 @@ import graphql.parser.Parser; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; +import jakarta.servlet.http.HttpServletRequest; import java.util.Map; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import lombok.Getter; @Getter @@ -25,26 +27,33 @@ public SpringQueryContext( final Authentication authentication, final Authorizer authorizer, @Nonnull final OperationContext systemOperationContext, + @Nonnull final HttpServletRequest request, + @Nullable final String operationName, String jsonQuery, Map variables) { this.isAuthenticated = isAuthenticated; this.authentication = authentication; this.authorizer = authorizer; + // operationName is an optional field only required if multiple operations are present this.queryName = - new Parser() - .parseDocument(jsonQuery).getDefinitions().stream() - .filter(def -> def instanceof OperationDefinition) - .map(def -> (OperationDefinition) def) - .filter(opDef -> opDef.getOperation().equals(OperationDefinition.Operation.QUERY)) - .findFirst() - .map(OperationDefinition::getName) - .orElse("graphql"); + operationName != null + ? 
operationName + : new Parser() + .parseDocument(jsonQuery).getDefinitions().stream() + .filter(def -> def instanceof OperationDefinition) + .map(def -> (OperationDefinition) def) + .filter( + opDef -> opDef.getOperation().equals(OperationDefinition.Operation.QUERY)) + .findFirst() + .map(OperationDefinition::getName) + .orElse("graphql"); this.operationContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildGraphql(queryName, variables), + RequestContext.builder() + .buildGraphql(authentication.getActor().toUrnStr(), request, queryName, variables), authorizer, authentication, true); diff --git a/metadata-service/openapi-analytics-servlet/build.gradle b/metadata-service/openapi-analytics-servlet/build.gradle index 3a879cb1b0071a..5d1372a293775b 100644 --- a/metadata-service/openapi-analytics-servlet/build.gradle +++ b/metadata-service/openapi-analytics-servlet/build.gradle @@ -19,7 +19,7 @@ dependencies { implementation externalDependency.springWebMVC implementation externalDependency.springBeans implementation externalDependency.springContext - + implementation externalDependency.servletApi implementation externalDependency.reflections implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-service/openapi-analytics-servlet/src/main/java/io/datahubproject/openapi/delegates/DatahubUsageEventsImpl.java b/metadata-service/openapi-analytics-servlet/src/main/java/io/datahubproject/openapi/delegates/DatahubUsageEventsImpl.java index 53df1e50ffbca7..dc6d4a33f936ea 100644 --- a/metadata-service/openapi-analytics-servlet/src/main/java/io/datahubproject/openapi/delegates/DatahubUsageEventsImpl.java +++ b/metadata-service/openapi-analytics-servlet/src/main/java/io/datahubproject/openapi/delegates/DatahubUsageEventsImpl.java @@ -12,6 +12,7 @@ import io.datahubproject.metadata.context.RequestContext; import io.datahubproject.openapi.exception.UnauthorizedException; import io.datahubproject.openapi.v2.generated.controller.DatahubUsageEventsApiDelegate; +import jakarta.servlet.http.HttpServletRequest; import java.util.List; import java.util.Objects; import org.springframework.beans.factory.annotation.Autowired; @@ -27,6 +28,8 @@ public class DatahubUsageEventsImpl implements DatahubUsageEventsApiDelegate { @Qualifier("systemOperationContext") OperationContext systemOperationContext; + @Autowired private HttpServletRequest request; + public static final String DATAHUB_USAGE_INDEX = "datahub_usage_event"; @Override @@ -36,7 +39,8 @@ public ResponseEntity raw(String body) { OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("raw", List.of()), + RequestContext.builder() + .buildOpenapi(authentication.getActor().toUrnStr(), request, "raw", List.of()), _authorizationChain, authentication, true); diff --git a/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/v2/delegates/EntityApiDelegateImpl.java b/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/v2/delegates/EntityApiDelegateImpl.java index bd4d68834b9e87..12caba3e39dd68 100644 --- a/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/v2/delegates/EntityApiDelegateImpl.java +++ b/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/v2/delegates/EntityApiDelegateImpl.java @@ -59,6 +59,7 @@ import io.datahubproject.openapi.generated.StatusAspectResponseV2; import 
io.datahubproject.openapi.util.OpenApiEntitiesUtil; import io.datahubproject.openapi.v1.entities.EntitiesController; +import jakarta.servlet.http.HttpServletRequest; import java.net.URISyntaxException; import java.util.List; import java.util.Map; @@ -85,6 +86,7 @@ public class EntityApiDelegateImpl { private final Class _reqClazz; private final Class _respClazz; private final Class _scrollRespClazz; + private final HttpServletRequest request; private static final String BUSINESS_ATTRIBUTE_ERROR_MESSAGE = "business attribute is disabled, enable it using featureflag : BUSINESS_ATTRIBUTE_ENTITY_ENABLED"; @@ -92,6 +94,7 @@ public class EntityApiDelegateImpl { public EntityApiDelegateImpl( OperationContext systemOperationContext, + HttpServletRequest request, EntityService entityService, SearchService searchService, EntitiesController entitiesController, @@ -100,6 +103,7 @@ public EntityApiDelegateImpl( Class respClazz, Class scrollRespClazz) { this.systemOperationContext = systemOperationContext; + this.request = request; this._entityService = entityService; this._searchService = searchService; this._entityRegistry = systemOperationContext.getEntityRegistry(); @@ -119,7 +123,7 @@ public ResponseEntity get(String urn, Boolean systemMetadata, List as .map(asp -> asp.stream().distinct().toArray(String[]::new)) .orElse(null); ResponseEntity result = - _v1Controller.getEntities(new String[] {urn}, requestedAspects); + _v1Controller.getEntities(request, new String[] {urn}, requestedAspects); return ResponseEntity.of( OpenApiEntitiesUtil.convertEntity( Optional.ofNullable(result).map(HttpEntity::getBody).orElse(null), @@ -146,7 +150,7 @@ public ResponseEntity> create( throw new UnsupportedOperationException(BUSINESS_ATTRIBUTE_ERROR_MESSAGE); } } - _v1Controller.postEntities(aspects, false, createIfNotExists, createEntityIfNotExists); + _v1Controller.postEntities(request, aspects, false, createIfNotExists, createEntityIfNotExists); List responses = body.stream() .map(req -> OpenApiEntitiesUtil.convertToResponse(req, _respClazz, _entityRegistry)) @@ -158,7 +162,7 @@ public ResponseEntity delete(String urn) { if (checkBusinessAttributeFlagFromUrn(urn)) { throw new UnsupportedOperationException(BUSINESS_ATTRIBUTE_ERROR_MESSAGE); } - _v1Controller.deleteEntities(new String[] {urn}, false, false); + _v1Controller.deleteEntities(request, new String[] {urn}, false, false); return new ResponseEntity<>(HttpStatus.OK); } @@ -177,7 +181,9 @@ public ResponseEntity head(String urn) { OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("head", entityUrn.getEntityType()), + RequestContext.builder() + .buildOpenapi( + auth.getActor().toUrnStr(), request, "head", entityUrn.getEntityType()), _authorizationChain, auth, true); @@ -200,7 +206,7 @@ public ResponseEntity getAspect( Class aspectRespClazz) { String[] requestedAspects = new String[] {aspect}; ResponseEntity result = - _v1Controller.getEntities(new String[] {urn}, requestedAspects); + _v1Controller.getEntities(request, new String[] {urn}, requestedAspects); return ResponseEntity.of( OpenApiEntitiesUtil.convertAspect( result.getBody(), aspect, entityRespClass, aspectRespClazz, systemMetadata)); @@ -217,6 +223,7 @@ public ResponseEntity createAspect( UpsertAspectRequest aspectUpsert = OpenApiEntitiesUtil.convertAspectToUpsert(urn, body, reqClazz); _v1Controller.postEntities( + request, Stream.of(aspectUpsert).filter(Objects::nonNull).collect(Collectors.toList()), false, createIfNotExists, @@ 
-238,7 +245,9 @@ public ResponseEntity headAspect(String urn, String aspect) { OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("headAspect", entityUrn.getEntityType()), + RequestContext.builder() + .buildOpenapi( + auth.getActor().toUrnStr(), request, "headAspect", entityUrn.getEntityType()), _authorizationChain, auth, true); @@ -259,12 +268,14 @@ public ResponseEntity deleteAspect(String urn, String aspect) { OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("deleteAspect", entityUrn.getEntityType()), + RequestContext.builder() + .buildOpenapi( + auth.getActor().toUrnStr(), request, "deleteAspect", entityUrn.getEntityType()), _authorizationChain, auth, true); _entityService.deleteAspect(opContext, urn, aspect, Map.of(), false); - _v1Controller.deleteEntities(new String[] {urn}, false, false); + _v1Controller.deleteEntities(request, new String[] {urn}, false, false); return new ResponseEntity<>(HttpStatus.OK); } @@ -606,7 +617,9 @@ public ResponseEntity scroll( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("scroll", entitySpec.getName()), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "scroll", entitySpec.getName()), _authorizationChain, authentication, true); @@ -644,7 +657,7 @@ public ResponseEntity scroll( .map(asp -> asp.stream().distinct().toArray(String[]::new)) .orElse(null); List entities = - Optional.ofNullable(_v1Controller.getEntities(urns, requestedAspects).getBody()) + Optional.ofNullable(_v1Controller.getEntities(request, urns, requestedAspects).getBody()) .map(body -> body.getResponses().entrySet()) .map( entries -> OpenApiEntitiesUtil.convertEntities(entries, _respClazz, systemMetadata)) diff --git a/metadata-service/openapi-entity-servlet/src/main/resources/JavaSpring/apiController.mustache b/metadata-service/openapi-entity-servlet/src/main/resources/JavaSpring/apiController.mustache index 04bc7e52c593b4..fbf354ff91688f 100644 --- a/metadata-service/openapi-entity-servlet/src/main/resources/JavaSpring/apiController.mustache +++ b/metadata-service/openapi-entity-servlet/src/main/resources/JavaSpring/apiController.mustache @@ -98,7 +98,7 @@ public class {{classname}}Controller implements {{classname}} { SearchService searchService, EntitiesController v1Controller, AuthorizerChain authorizationChain) { this.objectMapper = objectMapper; this.request = request; - this.delegate = new EntityApiDelegateImpl<{{requestClass}}, {{responseClass}}, {{scrollResponseClass}}>(systemOperationContext, entityService, searchService, v1Controller, + this.delegate = new EntityApiDelegateImpl<{{requestClass}}, {{responseClass}}, {{scrollResponseClass}}>(systemOperationContext, request, entityService, searchService, v1Controller, authorizationChain, {{requestClass}}.class, {{responseClass}}.class, {{scrollResponseClass}}.class); } {{#isJava8or11}} diff --git a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java index c1398e86167477..075501c1a10711 100644 --- a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java +++ 
b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java @@ -3,6 +3,7 @@ import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.nullable; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -33,6 +34,7 @@ import io.datahubproject.openapi.v1.relationships.RelationshipsController; import io.datahubproject.openapi.v2.controller.TimelineControllerV2; import io.datahubproject.test.metadata.context.TestOperationContexts; +import jakarta.servlet.http.HttpServletRequest; import java.util.Arrays; import java.util.Map; import java.util.stream.Collectors; @@ -109,11 +111,11 @@ public EntityRegistry entityRegistry() throws EntityRegistryException, Interrupt @Primary public EntitiesController entitiesController() { EntitiesController entitiesController = mock(EntitiesController.class); - when(entitiesController.getEntities(any(), any())) + when(entitiesController.getEntities(nullable(HttpServletRequest.class), any(), any())) .thenAnswer( params -> { - String[] urns = params.getArgument(0); - String[] aspects = params.getArgument(1); + String[] urns = params.getArgument(1); + String[] aspects = params.getArgument(2); return ResponseEntity.ok( UrnResponseMap.builder() .responses( diff --git a/metadata-service/openapi-servlet/build.gradle b/metadata-service/openapi-servlet/build.gradle index 7b6a3c7c89e31c..e26b1ceea1a3c6 100644 --- a/metadata-service/openapi-servlet/build.gradle +++ b/metadata-service/openapi-servlet/build.gradle @@ -21,6 +21,7 @@ dependencies { implementation externalDependency.springWebMVC implementation externalDependency.springBeans implementation externalDependency.springContext + implementation externalDependency.servletApi implementation externalDependency.slf4jApi compileOnly externalDependency.lombok implementation externalDependency.antlr4Runtime diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java index 6869f7bf58235c..30988d81db2f9a 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java @@ -52,6 +52,7 @@ import io.datahubproject.openapi.models.GenericEntityScrollResult; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; import java.lang.reflect.InvocationTargetException; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; @@ -133,6 +134,7 @@ protected abstract AspectsBatch toMCPBatch( @GetMapping(value = "/{entityName}", produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Scroll entities") public ResponseEntity getEntities( + HttpServletRequest request, @PathVariable("entityName") String entityName, @RequestParam(value = "aspectNames", defaultValue = "") Set aspects1, @RequestParam(value = "aspects", defaultValue = "") Set aspects2, @@ -159,7 +161,9 @@ public ResponseEntity getEntities( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("getEntities", 
entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "getEntities", entityName), authorizationChain, authentication, true); @@ -199,6 +203,7 @@ public ResponseEntity getEntities( value = "/{entityName}/{entityUrn:urn:li:.+}", produces = MediaType.APPLICATION_JSON_VALUE) public ResponseEntity getEntity( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn, @RequestParam(value = "aspectNames", defaultValue = "") Set aspects1, @@ -217,7 +222,9 @@ public ResponseEntity getEntity( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("getEntity", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "getEntity", entityName), authorizationChain, authentication, true); @@ -239,6 +246,7 @@ public ResponseEntity getEntity( method = {RequestMethod.HEAD}) @Operation(summary = "Entity exists") public ResponseEntity headEntity( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn, @PathVariable(value = "includeSoftDelete", required = false) Boolean includeSoftDelete) @@ -254,7 +262,9 @@ public ResponseEntity headEntity( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("headEntity", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "headEntity", entityName), authorizationChain, authentication, true); @@ -270,6 +280,7 @@ public ResponseEntity headEntity( produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Get an entity's generic aspect.") public ResponseEntity getAspect( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn, @PathVariable("aspectName") String aspectName, @@ -287,7 +298,9 @@ public ResponseEntity getAspect( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("getAspect", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "getAspect", entityName), authorizationChain, authentication, true); @@ -311,6 +324,7 @@ public ResponseEntity getAspect( method = {RequestMethod.HEAD}) @Operation(summary = "Whether an entity aspect exists.") public ResponseEntity headAspect( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn, @PathVariable("aspectName") String aspectName, @@ -327,7 +341,9 @@ public ResponseEntity headAspect( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("headAspect", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "headAspect", entityName), authorizationChain, authentication, true); @@ -341,7 +357,9 @@ public ResponseEntity headAspect( @DeleteMapping(value = "/{entityName}/{entityUrn:urn:li:.+}") @Operation(summary = "Delete an entity") public void deleteEntity( - @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn) + HttpServletRequest request, + @PathVariable("entityName") String entityName, + @PathVariable("entityUrn") String entityUrn) throws InvalidUrnException { EntitySpec entitySpec = 
entityRegistry.getEntitySpec(entityName); @@ -355,7 +373,9 @@ public void deleteEntity( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("deleteEntity", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "deleteEntity", entityName), authorizationChain, authentication, true); @@ -367,6 +387,7 @@ public void deleteEntity( @PostMapping(value = "/{entityName}", produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Create a batch of entities.") public ResponseEntity> createEntity( + HttpServletRequest request, @PathVariable("entityName") String entityName, @RequestParam(value = "async", required = false, defaultValue = "true") Boolean async, @RequestParam(value = "systemMetadata", required = false, defaultValue = "false") @@ -385,7 +406,9 @@ public ResponseEntity> createEntity( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("createEntity", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "createEntity", entityName), authorizationChain, authentication, true); @@ -404,6 +427,7 @@ public ResponseEntity> createEntity( @DeleteMapping(value = "/{entityName}/{entityUrn:urn:li:.+}/{aspectName}") @Operation(summary = "Delete an entity aspect.") public void deleteAspect( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn, @PathVariable("aspectName") String aspectName) @@ -419,7 +443,9 @@ public void deleteAspect( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("deleteAspect", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "deleteAspect", entityName), authorizationChain, authentication, true); @@ -434,6 +460,7 @@ public void deleteAspect( produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Create an entity aspect.") public ResponseEntity createAspect( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn, @PathVariable("aspectName") String aspectName, @@ -456,7 +483,9 @@ public ResponseEntity createAspect( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("createAspect", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "createAspect", entityName), authorizationChain, authentication, true); @@ -494,6 +523,7 @@ public ResponseEntity createAspect( produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Patch an entity aspect. 
(Experimental)") public ResponseEntity patchAspect( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("entityUrn") String entityUrn, @PathVariable("aspectName") String aspectName, @@ -517,7 +547,9 @@ public ResponseEntity patchAspect( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("patchAspect", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "patchAspect", entityName), authorizationChain, authentication, true); diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java index f483d6ded1f10f..f1da20fefe6bff 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java @@ -14,6 +14,7 @@ import io.datahubproject.openlineage.generated.controller.LineageApi; import io.openlineage.client.OpenLineage; import io.openlineage.client.OpenLineageClientUtils; +import jakarta.servlet.http.HttpServletRequest; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -50,6 +51,8 @@ public Optional getObjectMapper() { return Optional.of(OBJECT_MAPPER); } + @Autowired private HttpServletRequest request; + @Override public ResponseEntity postRunEventRaw(String body) { try { @@ -68,7 +71,9 @@ public ResponseEntity postRunEventRaw(OpenLineage.RunEvent openlineageRunE OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("postRunEventRaw", List.of()), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "postRunEventRaw", List.of()), _authorizerChain, authentication, true); diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java index 341f2a45197d18..ddbc8004081eb6 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java @@ -31,6 +31,7 @@ import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Parameter; import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; import java.net.URISyntaxException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; @@ -133,7 +134,7 @@ public ResponseEntity getTaskStatus(String task) { @Tag(name = "ElasticSearchOperations") @GetMapping(path = "/getIndexSizes", produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Get Index Sizes") - public ResponseEntity getIndexSizes() { + public ResponseEntity getIndexSizes(HttpServletRequest request) { Authentication authentication = AuthenticationContext.getAuthentication(); String actorUrnStr = authentication.getActor().toUrnStr(); @@ -145,7 +146,7 @@ public ResponseEntity getIndexSizes() { OperationContext opContext = OperationContext.asSession( systemOperationContext, - 
RequestContext.builder().buildOpenapi("getIndexSizes", List.of()), + RequestContext.builder().buildOpenapi(actorUrnStr, request, "getIndexSizes", List.of()), authorizerChain, authentication, true); @@ -171,6 +172,7 @@ public ResponseEntity getIndexSizes() { @GetMapping(path = "/explainSearchQuery", produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Explain Search Query") public ResponseEntity explainSearchQuery( + HttpServletRequest request, @Parameter( name = "query", required = true, @@ -229,7 +231,8 @@ public ResponseEntity explainSearchQuery( OperationContext opContext = systemOperationContext .asSession( - RequestContext.builder().buildOpenapi("explainSearchQuery", entityName), + RequestContext.builder() + .buildOpenapi(actorUrnStr, request, "explainSearchQuery", entityName), authorizerChain, authentication) .withSearchFlags( @@ -263,6 +266,7 @@ public ResponseEntity explainSearchQuery( @GetMapping(path = "/explainSearchQueryDiff", produces = MediaType.TEXT_PLAIN_VALUE) @Operation(summary = "Explain the differences in scoring for 2 documents") public ResponseEntity explainSearchQueryDiff( + HttpServletRequest request, @Parameter( name = "query", required = true, @@ -328,7 +332,8 @@ public ResponseEntity explainSearchQueryDiff( OperationContext opContext = systemOperationContext .asSession( - RequestContext.builder().buildOpenapi("explainSearchQuery", entityName), + RequestContext.builder() + .buildOpenapi(actorUrnStr, request, "explainSearchQuery", entityName), authorizerChain, authentication) .withSearchFlags( @@ -400,6 +405,7 @@ private static String encodeValue(String value) { @GetMapping(path = "/restoreIndices", produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Restore ElasticSearch indices from primary storage based on URNs.") public ResponseEntity> restoreIndices( + HttpServletRequest request, @RequestParam(required = false, name = "aspectName") @Nullable String aspectName, @RequestParam(required = false, name = "urn") @Nullable String urn, @RequestParam(required = false, name = "urnLike") @Nullable String urnLike, @@ -419,7 +425,9 @@ public ResponseEntity> restoreIndices( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("restoreIndices", List.of()), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "restoreIndices", List.of()), authorizerChain, authentication, true); @@ -445,6 +453,7 @@ public ResponseEntity> restoreIndices( @PostMapping(path = "/restoreIndices", produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Restore ElasticSearch indices from primary storage based on URNs.") public ResponseEntity> restoreIndices( + HttpServletRequest request, @RequestParam(required = false, name = "aspectNames") @Nullable Set aspectNames, @RequestParam(required = false, name = "batchSize", defaultValue = "100") @Nullable Integer batchSize, @@ -459,7 +468,9 @@ public ResponseEntity> restoreIndices( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("restoreIndices", List.of()), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "restoreIndices", List.of()), authorizerChain, authentication, true); diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java 
b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java index 8632cedbf8ffb1..99eede15629d20 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java @@ -30,6 +30,7 @@ import io.datahubproject.openapi.util.MappingUtil; import io.swagger.v3.oas.annotations.Parameter; import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; import java.net.URLDecoder; import java.util.Arrays; import java.util.Collections; @@ -90,6 +91,7 @@ public void initBinder(WebDataBinder binder) { @GetMapping(value = "/latest", produces = MediaType.APPLICATION_JSON_VALUE) public ResponseEntity getEntities( + HttpServletRequest request, @Parameter( name = "urns", required = true, @@ -122,6 +124,8 @@ public ResponseEntity getEntities( systemOperationContext, RequestContext.builder() .buildOpenapi( + actorUrnStr, + request, "getEntities", entityUrns.stream() .map(Urn::getEntityType) @@ -169,6 +173,7 @@ public ResponseEntity getEntities( @PostMapping(value = "/", produces = MediaType.APPLICATION_JSON_VALUE) public ResponseEntity> postEntities( + HttpServletRequest request, @RequestBody @Nonnull List aspectRequests, @RequestParam(required = false, name = "async") Boolean async, @RequestParam(required = false, name = "createIfNotExists") Boolean createIfNotExists, @@ -191,6 +196,8 @@ public ResponseEntity> postEntities( systemOperationContext, RequestContext.builder() .buildOpenapi( + actorUrnStr, + request, "postEntities", proposals.stream() .map(MetadataChangeProposal::getEntityType) @@ -241,6 +248,7 @@ public ResponseEntity> postEntities( @DeleteMapping(value = "/", produces = MediaType.APPLICATION_JSON_VALUE) public ResponseEntity> deleteEntities( + HttpServletRequest request, @Parameter( name = "urns", required = true, @@ -278,6 +286,8 @@ public ResponseEntity> deleteEntities( systemOperationContext, RequestContext.builder() .buildOpenapi( + actorUrnStr, + request, "deleteEntities", entityUrns.stream().map(Urn::getEntityType).collect(Collectors.toSet())), _authorizerChain, diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java index eeba41f9f819f5..b40aba2d2908fc 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java @@ -38,6 +38,7 @@ import io.datahubproject.openapi.v2.models.GenericEntityV2; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -87,7 +88,9 @@ public GenericEntityScrollResultV2 buildScrollResult( @PostMapping(value = "/batch/{entityName}", produces = MediaType.APPLICATION_JSON_VALUE) @Operation(summary = "Get a batch of entities") public ResponseEntity getEntityBatch( - @PathVariable("entityName") String entityName, @RequestBody BatchGetUrnRequest request) + HttpServletRequest httpServletRequest, + @PathVariable("entityName") String entityName, + @RequestBody BatchGetUrnRequest request) 
throws URISyntaxException { List urns = request.getUrns().stream().map(UrnUtils::getUrn).collect(Collectors.toList()); @@ -100,7 +103,12 @@ public ResponseEntity getEntityBatch( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("getEntityBatch", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), + httpServletRequest, + "getEntityBatch", + entityName), authorizationChain, authentication, true); diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/PlatformEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/PlatformEntitiesController.java index 1d0de44c0d83a9..87c72064ad7a77 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/PlatformEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/PlatformEntitiesController.java @@ -17,6 +17,7 @@ import io.datahubproject.openapi.generated.MetadataChangeProposal; import io.datahubproject.openapi.util.MappingUtil; import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; import java.util.Collections; import java.util.List; import java.util.Objects; @@ -58,6 +59,7 @@ public void initBinder(WebDataBinder binder) { @PostMapping(value = "/", produces = MediaType.APPLICATION_JSON_VALUE) public ResponseEntity> postEntities( + HttpServletRequest request, @RequestBody @Nonnull List metadataChangeProposals, @RequestParam(required = false, name = "async") Boolean async) { log.info("INGEST PROPOSAL proposal: {}", metadataChangeProposals); @@ -69,6 +71,8 @@ public ResponseEntity> postEntities( systemOperationContext, RequestContext.builder() .buildOpenapi( + actorUrnStr, + request, "postEntities", metadataChangeProposals.stream() .map(MetadataChangeProposal::getEntityType) diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/TimeseriesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/TimeseriesController.java index 1c404006d97a46..bb10719bacd3fa 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/TimeseriesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/TimeseriesController.java @@ -22,6 +22,7 @@ import io.datahubproject.openapi.models.GenericScrollResult; import io.datahubproject.openapi.v2.models.GenericTimeseriesAspect; import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; import java.net.URISyntaxException; import java.util.List; import java.util.stream.Collectors; @@ -56,6 +57,7 @@ public class TimeseriesController { @GetMapping(value = "/{entityName}/{aspectName}", produces = MediaType.APPLICATION_JSON_VALUE) public ResponseEntity> getAspects( + HttpServletRequest request, @PathVariable("entityName") String entityName, @PathVariable("aspectName") String aspectName, @RequestParam(value = "count", defaultValue = "10") Integer count, @@ -76,7 +78,9 @@ public ResponseEntity> getAspects( OperationContext opContext = OperationContext.asSession( systemOperationContext, - RequestContext.builder().buildOpenapi("getAspects", entityName), + RequestContext.builder() + .buildOpenapi( + authentication.getActor().toUrnStr(), request, "getAspects", entityName), 
authorizationChain, authentication, true); diff --git a/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java b/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java index 8e70bd507999fa..3e352403c88bca 100644 --- a/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java +++ b/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java @@ -214,7 +214,7 @@ public void testIngestDataset() { .build(); datasetAspects.add(glossaryTerms); - _entitiesController.postEntities(datasetAspects, false, false, false); + _entitiesController.postEntities(null, datasetAspects, false, false, false); } // @Test diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java index 7fcaf82e090966..753dd9b807fd12 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java @@ -14,6 +14,7 @@ import com.linkedin.restli.server.RestLiServiceException; import com.linkedin.restli.server.annotations.Action; import com.linkedin.restli.server.annotations.ActionParam; +import com.linkedin.restli.server.annotations.Context; import com.linkedin.restli.server.annotations.Optional; import com.linkedin.restli.server.annotations.RestLiSimpleResource; import com.linkedin.restli.server.resources.SimpleResourceTemplate; @@ -28,6 +29,7 @@ import javax.annotation.Nullable; import javax.inject.Inject; import javax.inject.Named; +import javax.servlet.http.HttpServletRequest; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; @@ -80,7 +82,8 @@ public Task getTimeseriesStats( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity " + entityName); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_GET_TIMESERIES_STATS, entityName), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), + ACTION_GET_TIMESERIES_STATS, entityName), authorizer, auth, true); log.info("Attempting to query timeseries stats"); GetTimeseriesAggregatedStatsResponse resp = new GetTimeseriesAggregatedStatsResponse(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 8dc73e45846edc..0d9a49d583b57a 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -148,7 +148,7 @@ public Task get( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get aspect for " + urn); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("authorizerChain", urn.getEntityType()), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "authorizerChain", urn.getEntityType()), _authorizer, auth, true); 
final VersionedAspect aspect = _entityService.getVersionedAspect(opContext, urn, aspectName, version); @@ -199,7 +199,7 @@ public Task getTimeseriesAspectValues( "User is unauthorized to get timeseries aspect for " + urn); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_GET_TIMESERIES_ASPECT, urn.getEntityType()), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_GET_TIMESERIES_ASPECT, urn.getEntityType()), _authorizer, auth, true); GetTimeseriesAspectValuesResponse response = new GetTimeseriesAspectValuesResponse(); response.setEntityName(entityName); @@ -280,7 +280,7 @@ private Task ingestProposals( .map(MetadataChangeProposal::getEntityType) .collect(Collectors.toSet()); final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_INGEST_PROPOSAL, entityTypes), _authorizer, authentication, true); + systemOperationContext, RequestContext.builder().buildRestli(authentication.getActor().toUrnStr(), getContext(), ACTION_INGEST_PROPOSAL, entityTypes), _authorizer, authentication, true); // Ingest Authorization Checks List> exceptions = isAPIAuthorized(authentication, _authorizer, ENTITY, @@ -344,7 +344,7 @@ public Task getCount( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get aspect counts."); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_GET_COUNT, List.of()), _authorizer, authentication, true); + systemOperationContext, RequestContext.builder().buildRestli(authentication.getActor().toUrnStr(), getContext(), ACTION_GET_COUNT, List.of()), _authorizer, authentication, true); return _entityService.getCountAspect(opContext, aspectName, urnLike); }, @@ -373,7 +373,7 @@ public Task restoreIndices( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to update entities."); } - return Utils.restoreIndices(systemOperationContext, + return Utils.restoreIndices(systemOperationContext, getContext(), aspectName, urn, urnLike, start, batchSize, limit, gePitEpochMs, lePitEpochMs, _authorizer, _entityService); }, MetricRegistry.name(this.getClass(), "restoreIndices")); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java index 970e18cb7bee07..599bbf9ce4df60 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java @@ -95,7 +95,7 @@ public Task rollback( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to update entity"); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("rollback", List.of()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "rollback", List.of()), authorizer, auth, true); log.info("ROLLBACK RUN runId: {} dry run: {}", runId, dryRun); @@ -171,7 +171,7 @@ public Task describe( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity"); } final OperationContext opContext = 
OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("describe", List.of()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "describe", List.of()), authorizer, auth, true); List summaries = systemMetadataService.findByRunId( diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index 4ad668d0b1054d..e79dda34256822 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -200,7 +200,7 @@ public Task get( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity " + urn); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("restrictedService", urn.getEntityType()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "restrictedService", urn.getEntityType()), authorizer, auth, true); return RestliUtil.toTask( () -> { @@ -240,7 +240,7 @@ public Task> batchGet( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entities: " + urnStrs); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("batchGet", urnStrs), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "batchGet", urnStrs), authorizer, auth, true); return RestliUtil.toTask( () -> { @@ -289,7 +289,7 @@ public Task ingest( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to edit entity " + urn); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_INGEST, urn.getEntityType()), authorizer, authentication, true); + systemOperationContext, RequestContext.builder().buildRestli(authentication.getActor().toUrnStr(), getContext(), ACTION_INGEST, urn.getEntityType()), authorizer, authentication, true); try { validateOrThrow(entity); @@ -334,7 +334,7 @@ public Task batchIngest( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to edit entities."); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_BATCH_INGEST, urns.stream() + systemOperationContext, RequestContext.builder().buildRestli(authentication.getActor().toUrnStr(), getContext(), ACTION_BATCH_INGEST, urns.stream() .map(Urn::getEntityType).collect(Collectors.toList())), authorizer, authentication, true); for (Entity entity : entities) { @@ -394,7 +394,7 @@ public Task search( } OperationContext opContext = OperationContext.asSession(systemOperationContext, - RequestContext.builder().buildRestli(ACTION_SEARCH, entityName), authorizer, auth, true) + RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_SEARCH, entityName), authorizer, auth, true) .withSearchFlags(flags -> searchFlags != null ? 
searchFlags : new SearchFlags().setFulltext(Boolean.TRUE.equals(fulltext))); log.info("GET SEARCH RESULTS for {} with query {}", entityName, input); @@ -434,7 +434,7 @@ public Task searchAcrossEntities( final Authentication auth = AuthenticationContext.getAuthentication(); OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_SEARCH_ACROSS_ENTITIES, entities), authorizer, auth, true) + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_SEARCH_ACROSS_ENTITIES, entities), authorizer, auth, true) .withSearchFlags(flags -> searchFlags != null ? searchFlags : new SearchFlags().setFulltext(true)); List entityList = searchService.getEntitiesToSearch(opContext, entities == null ? Collections.emptyList() : Arrays.asList(entities), count); @@ -478,7 +478,7 @@ public Task scrollAcrossEntities( final Authentication auth = AuthenticationContext.getAuthentication(); OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_SCROLL_ACROSS_ENTITIES, entities), authorizer, auth, true) + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_SCROLL_ACROSS_ENTITIES, entities), authorizer, auth, true) .withSearchFlags(flags -> searchFlags != null ? searchFlags : new SearchFlags().setFulltext(true)); List entityList = searchService.getEntitiesToSearch(opContext, entities == null ? Collections.emptyList() : Arrays.asList(entities), count); @@ -547,7 +547,7 @@ public Task searchAcrossLineage( } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_SEARCH_ACROSS_LINEAGE, entities), authorizer, auth, true) + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_SEARCH_ACROSS_LINEAGE, entities), authorizer, auth, true) .withSearchFlags(flags -> (searchFlags != null ? searchFlags : new SearchFlags().setFulltext(true)) .setIncludeRestricted(true)) .withLineageFlags(flags -> flags.setStartTimeMillis(startTimeMillis, SetMode.REMOVE_IF_NULL) @@ -606,7 +606,7 @@ public Task scrollAcrossLineage( } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_SCROLL_ACROSS_LINEAGE, entities), + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_SCROLL_ACROSS_LINEAGE, entities), authorizer, auth, true) .withSearchFlags(flags -> (searchFlags != null ? 
searchFlags : new SearchFlags().setSkipCache(true)) .setIncludeRestricted(true)) @@ -661,7 +661,7 @@ public Task list( } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_LIST, entityName), authorizer, auth, true) + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_LIST, entityName), authorizer, auth, true) .withSearchFlags(flags -> new SearchFlags().setFulltext(false)); log.info("GET LIST RESULTS for {} with filter {}", entityName, filter); @@ -702,7 +702,7 @@ public Task autocomplete( } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_AUTOCOMPLETE, entityName), authorizer, auth, true) + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_AUTOCOMPLETE, entityName), authorizer, auth, true) .withSearchFlags(flags -> searchFlags != null ? searchFlags : flags); return RestliUtil.toTask( @@ -740,7 +740,7 @@ public Task browse( } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_BROWSE, entityName), authorizer, auth, true) + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_BROWSE, entityName), authorizer, auth, true) .withSearchFlags(flags -> searchFlags != null ? searchFlags : flags); log.info("GET BROWSE RESULTS for {} at path {}", entityName, path); @@ -779,7 +779,7 @@ public Task getBrowsePaths( log.info("GET BROWSE PATHS for {}", urn); OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_GET_BROWSE_PATHS, urn.getEntityType()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_GET_BROWSE_PATHS, urn.getEntityType()), authorizer, auth, true); return RestliUtil.toTask( () -> new StringArray(entitySearchService.getBrowsePaths(opContext, urnToEntityName(urn), urn)), @@ -848,7 +848,7 @@ public Task deleteEntities( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to delete entities."); } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("deleteAll", urns), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "deleteAll", urns), authorizer, auth, true); response.setEntitiesAffected(urns.size()); response.setEntitiesDeleted( @@ -899,7 +899,7 @@ public Task deleteEntity( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to delete entity: " + urnStr); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_DELETE, urn.getEntityType()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_DELETE, urn.getEntityType()), authorizer, auth, true); return RestliUtil.toTask( () -> { @@ -961,7 +961,7 @@ private Long deleteTimeseriesAspects( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to delete entity " + urn); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("deleteTimeseriesAspects", urn.getEntityType()), authorizer, auth, true); + systemOperationContext, 
RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "deleteTimeseriesAspects", urn.getEntityType()), authorizer, auth, true); // Construct the filter. List criteria = new ArrayList<>(); @@ -1017,7 +1017,7 @@ public Task deleteReferencesTo( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to delete entity " + urnStr); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("deleteReferences", urn.getEntityType()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "deleteReferences", urn.getEntityType()), authorizer, auth, true); return RestliUtil.toTask( () -> deleteEntityService.deleteReferencesTo(opContext, urn, dryRun), @@ -1062,7 +1062,7 @@ public Task getTotalEntityCount(@ActionParam(PARAM_ENTITY) @Nonnull String HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity counts."); } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("getTotalEntityCount", entityName), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "getTotalEntityCount", entityName), authorizer, auth, true); return RestliUtil.toTask(() -> entitySearchService.docCount(opContext, entityName)); } @@ -1080,7 +1080,7 @@ public Task batchGetTotalEntityCount( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity counts."); } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("batchGetTotalEntityCount", entityNames), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "batchGetTotalEntityCount", entityNames), authorizer, auth, true); return RestliUtil.toTask( () -> new LongMap(searchService.docCountPerEntity(opContext, Arrays.asList(entityNames)))); } @@ -1103,7 +1103,7 @@ public Task listUrns( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to search."); } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_LIST_URNS, entityName), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_LIST_URNS, entityName), authorizer, auth, true); log.info("LIST URNS for {} with start {} and count {}", entityName, start, count); return RestliUtil.toTask(() -> { @@ -1145,7 +1145,7 @@ public Task applyRetention( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to apply retention."); } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_APPLY_RETENTION, resourceSpec.getType()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_APPLY_RETENTION, resourceSpec.getType()), authorizer, auth, true); return RestliUtil.toTask( () -> entityService.batchApplyRetention(opContext, start, count, attemptWithVersion, aspectName, urn), @@ -1171,7 +1171,7 @@ public Task filter( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to search."); } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_FILTER, entityName), authorizer, auth, true); + systemOperationContext, 
RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_FILTER, entityName), authorizer, auth, true); log.info("FILTER RESULTS for {} with filter {}", entityName, filter); return RestliUtil.toTask( () -> { @@ -1206,7 +1206,7 @@ public Task exists(@ActionParam(PARAM_URN) @Nonnull String urnStr, @Act HttpStatus.S_403_FORBIDDEN, "User is unauthorized check entity existence: " + urnStr); } OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_EXISTS, urn.getEntityType()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_EXISTS, urn.getEntityType()), authorizer, auth, true); log.info("EXISTS for {}", urnStr); final boolean includeRemoved = includeSoftDelete == null || includeSoftDelete; return RestliUtil.toTask( diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index af74040e774e5b..9052f0240266ad 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -79,7 +79,7 @@ public Task get( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity " + urn); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("getEntityV2", urn.getEntityType()), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "getEntityV2", urn.getEntityType()), _authorizer, auth, true); return RestliUtil.toTask( () -> { @@ -123,7 +123,7 @@ public Task> batchGet( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entities " + urnStrs); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("getEntityV2", urns.stream().map(Urn::getEntityType).collect(Collectors.toList())), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "getEntityV2", urns.stream().map(Urn::getEntityType).collect(Collectors.toList())), _authorizer, auth, true); if (urns.size() <= 0) { return Task.value(Collections.emptyMap()); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java index 794115ec17d1e4..d6c91ba7dcaa35 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java @@ -88,7 +88,7 @@ public Task> batchGetVersioned( "User is unauthorized to get entities " + versionedUrnStrs); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("authorizerChain", urns.stream() + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "authorizerChain", urns.stream() 
.map(Urn::getEntityType).collect(Collectors.toList())), _authorizer, auth, true); log.debug("BATCH GET VERSIONED V2 {}", versionedUrnStrs); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java index c92f7207aa6552..42d0bf11c505d8 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java @@ -107,7 +107,7 @@ public Task restoreIndices( @ActionParam("gePitEpochMs") @Optional @Nullable Long gePitEpochMs, @ActionParam("lePitEpochMs") @Optional @Nullable Long lePitEpochMs) { return RestliUtil.toTask( - () -> Utils.restoreIndices(systemOperationContext, + () -> Utils.restoreIndices(systemOperationContext, getContext(), aspectName, urn, urnLike, start, batchSize, limit, gePitEpochMs, lePitEpochMs, _authorizer, _entityService), MetricRegistry.name(this.getClass(), "restoreIndices")); } @@ -202,7 +202,7 @@ public Task getIndexSizes() { HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get index sizes."); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_GET_INDEX_SIZES, List.of()), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_GET_INDEX_SIZES, List.of()), _authorizer, auth, true); TimeseriesIndicesSizesResult result = new TimeseriesIndicesSizesResult(); result.setIndexSizes( @@ -232,7 +232,7 @@ String executeTruncateTimeseriesAspect( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to truncate timeseries index"); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("executeTruncateTimeseriesAspect", entityType), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), "executeTruncateTimeseriesAspect", entityType), _authorizer, auth, true); if (forceDeleteByQuery != null && forceDeleteByQuery.equals(forceReindex)) { return "please only set forceReindex OR forceDeleteByQuery flags"; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java index 2c411f9ad960ee..54c1029edcab04 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java @@ -13,6 +13,7 @@ import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesResult; import com.linkedin.restli.common.HttpStatus; +import com.linkedin.restli.server.ResourceContext; import com.linkedin.restli.server.RestLiServiceException; import java.util.HashMap; import java.util.List; @@ -32,7 +33,8 @@ public class Utils { private Utils() {} public static String restoreIndices( - @Nonnull OperationContext systemOperationContext, + @Nonnull OperationContext systemOperationContext, + @Nonnull ResourceContext resourceContext, @Nonnull String aspectName, 
@Nullable String urn, @Nullable String urnLike, @@ -59,7 +61,7 @@ public static String restoreIndices( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to restore indices."); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli("restoreIndices", List.of()), authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), resourceContext, "restoreIndices", List.of()), authorizer, auth, true); RestoreIndicesArgs args = new RestoreIndicesArgs() diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java index 82e0e000a31be7..518dfecd576808 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java @@ -141,7 +141,7 @@ public Task batchIngest(@ActionParam(PARAM_BUCKETS) @Nonnull UsageAggregat HttpStatus.S_403_FORBIDDEN, "User is unauthorized to edit entities."); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_BATCH_INGEST, urns.stream() + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_BATCH_INGEST, urns.stream() .map(Urn::getEntityType).collect(Collectors.toList())), _authorizer, auth, true); for (UsageAggregation agg : buckets) { @@ -180,7 +180,7 @@ public Task query( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to query usage."); } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_QUERY, resourceUrn.getEntityType()), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_QUERY, resourceUrn.getEntityType()), _authorizer, auth, true); return UsageServiceUtil.query(opContext, _timeseriesAspectService, resource, duration, startTime, endTime, maxBuckets); }, @@ -207,7 +207,7 @@ public Task queryRange( } final OperationContext opContext = OperationContext.asSession( - systemOperationContext, RequestContext.builder().buildRestli(ACTION_QUERY_RANGE, resourceUrn.getEntityType()), _authorizer, auth, true); + systemOperationContext, RequestContext.builder().buildRestli(auth.getActor().toUrnStr(), getContext(), ACTION_QUERY_RANGE, resourceUrn.getEntityType()), _authorizer, auth, true); return RestliUtil.toTask( () -> UsageServiceUtil.queryRange(opContext, _timeseriesAspectService, resource, duration, range), MetricRegistry.name(this.getClass(), "queryRange")); From 7fb56ef717e5ca6a3459b8bd9a7e4ba03ce7c402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fully=2Eis=28=ED=92=80=EB=A6=AC=29?= Date: Mon, 1 Jul 2024 15:23:18 +0900 Subject: [PATCH 16/33] docs: hivePlatformAlias is different (#10765) Co-authored-by: Harshal Sheth --- metadata-integration/java/spark-lineage-beta/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-integration/java/spark-lineage-beta/README.md b/metadata-integration/java/spark-lineage-beta/README.md index 90e96a7a7cd511..a4d90b25c27bf4 100644 --- a/metadata-integration/java/spark-lineage-beta/README.md +++ b/metadata-integration/java/spark-lineage-beta/README.md @@ -164,12 +164,12 
@@ information like tokens. | spark.datahub.rest.server | ✅ | | Datahub server url eg: | | spark.datahub.rest.token | | | Authentication token. | | spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! | -| spark.datahub.rest.rest.max_retries | | 0 | Number of times a request retried if failed | -| spark.datahub.rest.rest.retry_interval | | 10 | Number of seconds to wait between retries | +| spark.datahub.rest.max_retries | | 0 | Number of times a request retried if failed | +| spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries | | spark.datahub.metadata.pipeline.platformInstance | | | Pipeline level platform instance | | spark.datahub.metadata.dataset.platformInstance | | | dataset level platform instance (it is useful to set if you have it in your glue ingestion) | | spark.datahub.metadata.dataset.env | | PROD | [Supported values](https://datahubproject.io/docs/graphql/enums#fabrictype). In all other cases, will fallback to PROD | -| spark.datahub.metadata.table.hive_platform_alias | | hive | By default, datahub assigns Hive-like tables to the Hive platform. If you are using Glue as your Hive metastore, set this config flag to `glue` | +| spark.datahub.metadata.dataset.hivePlatformAlias | | hive | By default, datahub assigns Hive-like tables to the Hive platform. If you are using Glue as your Hive metastore, set this config flag to `glue` | | spark.datahub.metadata.include_scheme | | true | Include scheme from the path URI (e.g. hdfs://, s3://) in the dataset URN. We recommend setting this value to false, it is set to true for backwards compatibility with previous versions | | spark.datahub.metadata.remove_partition_pattern | | | Remove partition pattern. (e.g.
/partition=\d+) It change database/table/partition=123 to database/table | | spark.datahub.coalesce_jobs | | true | Only one datajob(task) will be emitted containing all input and output datasets for the spark application | From 93616f78690ef2b11224f5a5eb5e8b33c7f26c0c Mon Sep 17 00:00:00 2001 From: Tim Drahn Date: Mon, 1 Jul 2024 02:23:41 -0400 Subject: [PATCH 17/33] fix(ingestion): ingest emails as empty if no ldap attribute (#9433) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/ldap.py | 20 +++--- .../integration/ldap/ldap_mces_golden.json | 62 +++++++++++-------- ...ap_mces_golden_deleted_group_stateful.json | 1 - .../ldap_mces_golden_deleted_stateful.json | 1 - .../ldap/ldap_mces_golden_group_stateful.json | 2 - .../ldap/ldap_mces_golden_stateful.json | 1 - .../ldap/ldap_memberof_mces_golden.json | 14 +++-- 7 files changed, 56 insertions(+), 45 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 1368a5b83fe6f7..9c7fba68f263bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -334,10 +334,12 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn manager_ldap = guess_person_ldap(m_attrs, self.config, self.report) m_email = get_attr_or_none( - m_attrs, self.config.user_attrs_map["email"], manager_ldap + m_attrs, self.config.user_attrs_map["email"] ) make_manager_urn = ( - m_email if self.config.use_email_as_username else manager_ldap + m_email + if m_email and self.config.use_email_as_username + else manager_ldap ) except ldap.LDAPError as e: @@ -377,7 +379,7 @@ def build_corp_user_mce( last_name = attrs[self.config.user_attrs_map["lastName"]][0].decode() groups = parse_groups(attrs, self.config.user_attrs_map["memberOf"]) - email = get_attr_or_none(attrs, self.config.user_attrs_map["email"], ldap_user) + email = get_attr_or_none(attrs, self.config.user_attrs_map["email"]) display_name = get_attr_or_none( attrs, self.config.user_attrs_map["displayName"], full_name ) @@ -404,7 +406,9 @@ def build_corp_user_mce( manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None - make_user_urn = email if self.config.use_email_as_username else ldap_user + make_user_urn = ( + email if email and self.config.use_email_as_username else ldap_user + ) user_snapshot = CorpUserSnapshotClass( urn=f"urn:li:corpuser:{make_user_urn}", @@ -438,9 +442,7 @@ def build_corp_group_mce(self, attrs: dict) -> Optional[MetadataChangeEvent]: admins = parse_users(attrs, self.config.group_attrs_map["admins"]) members = parse_users(attrs, self.config.group_attrs_map["members"]) - email = get_attr_or_none( - attrs, self.config.group_attrs_map["email"], full_name - ) + email = get_attr_or_none(attrs, self.config.group_attrs_map["email"]) description = get_attr_or_none( attrs, self.config.group_attrs_map["description"] ) @@ -448,7 +450,9 @@ def build_corp_group_mce(self, attrs: dict) -> Optional[MetadataChangeEvent]: attrs, self.config.group_attrs_map["displayName"] ) - make_group_urn = email if self.config.use_email_as_username else full_name + make_group_urn = ( + email if email and self.config.use_email_as_username else full_name + ) group_snapshot = CorpGroupSnapshotClass( urn=f"urn:li:corpGroup:{make_group_urn}", diff --git a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json index 
90b3f0119fd48b..fa1a5eebe3e5b0 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden.json @@ -6,7 +6,6 @@ "aspects": [ { "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "simpons-group", "admins": [], "members": [ "urn:li:corpuser:hsimpson", @@ -20,7 +19,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -35,7 +35,6 @@ }, "active": true, "displayName": "Bart Simpson", - "email": "bsimpson", "title": "Mr. Boss", "firstName": "Bart", "lastName": "Simpson", @@ -52,7 +51,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -87,7 +87,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -102,7 +103,6 @@ }, "active": true, "displayName": "Lisa Simpson", - "email": "lsimpson", "firstName": "Lisa", "lastName": "Simpson", "fullName": "Lisa Simpson" @@ -118,7 +118,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -133,7 +134,6 @@ }, "active": true, "displayName": "Maggie Simpson", - "email": "msimpson", "firstName": "Maggie", "lastName": "Simpson", "fullName": "Maggie Simpson" @@ -149,7 +149,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -164,7 +165,6 @@ }, "active": true, "displayName": "Hester Bevan", - "email": "hbevan", "firstName": "Hester", "lastName": "Bevan", "fullName": "Hester Bevan" @@ -180,7 +180,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -195,7 +196,6 @@ }, "active": true, "displayName": "Evalyn Haas", - "email": "ehaas", "firstName": "Evalyn", "lastName": "Haas", "fullName": "Evalyn Haas" @@ -211,7 +211,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -221,7 +222,6 @@ "aspects": [ { "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "HR Department", "admins": [], "members": [], "groups": [] @@ -232,7 +232,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -242,7 +243,6 @@ "aspects": [ { "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "Finance Department", "admins": [], "members": [], "groups": [] @@ -253,7 +253,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -268,7 +269,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -283,7 +285,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -298,7 +301,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -313,7 +317,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": 
"ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +333,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -343,7 +349,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -358,7 +365,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -373,7 +381,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -388,7 +397,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_group_stateful.json b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_group_stateful.json index 2718fcf3c87a45..4abeec224c0d94 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_group_stateful.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_group_stateful.json @@ -6,7 +6,6 @@ "aspects": [ { "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "HR Department", "admins": [], "members": [], "groups": [] diff --git a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_stateful.json index 3b52dddad4ff09..b72e1da740be42 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_deleted_stateful.json @@ -9,7 +9,6 @@ "customProperties": {}, "active": true, "displayName": "Bart Simpson", - "email": "bsimpson", "title": "Mr. Boss", "firstName": "Bart", "lastName": "Simpson", diff --git a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_group_stateful.json b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_group_stateful.json index ddc1520c7d7487..c4f8e20662bec4 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_group_stateful.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_group_stateful.json @@ -6,7 +6,6 @@ "aspects": [ { "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "HR Department", "admins": [], "members": [], "groups": [] @@ -29,7 +28,6 @@ "aspects": [ { "com.linkedin.pegasus2avro.identity.CorpGroupInfo": { - "email": "Finance Department", "admins": [], "members": [], "groups": [] diff --git a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_stateful.json b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_stateful.json index 1a3e4f4a8262fd..2e7dd6710fe4f6 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_stateful.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_mces_golden_stateful.json @@ -9,7 +9,6 @@ "customProperties": {}, "active": true, "displayName": "Bart Simpson", - "email": "bsimpson", "title": "Mr. 
Boss", "firstName": "Bart", "lastName": "Simpson", diff --git a/metadata-ingestion/tests/integration/ldap/ldap_memberof_mces_golden.json b/metadata-ingestion/tests/integration/ldap/ldap_memberof_mces_golden.json index 016cbe49d77410..445c86b27222d5 100644 --- a/metadata-ingestion/tests/integration/ldap/ldap_memberof_mces_golden.json +++ b/metadata-ingestion/tests/integration/ldap/ldap_memberof_mces_golden.json @@ -9,7 +9,6 @@ "customProperties": {}, "active": true, "displayName": "Hester Bevan", - "email": "hbevan", "firstName": "Hester", "lastName": "Bevan", "fullName": "Hester Bevan" @@ -28,7 +27,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -41,7 +41,6 @@ "customProperties": {}, "active": true, "displayName": "Evalyn Haas", - "email": "ehaas", "firstName": "Evalyn", "lastName": "Haas", "fullName": "Evalyn Haas" @@ -60,7 +59,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -75,7 +75,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } }, { @@ -90,7 +91,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "ldap-test" + "runId": "ldap-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file From 8edc94d4d187480234e0b5adefdde7b4c4494a58 Mon Sep 17 00:00:00 2001 From: ipolding-cais <155455744+ipolding-cais@users.noreply.github.com> Date: Mon, 1 Jul 2024 07:37:39 +0100 Subject: [PATCH 18/33] fix(patch): consider escaped characters when applying JSON patches (#10717) Co-authored-by: John Joyce --- .../aspect/patch/template/TemplateUtil.java | 8 ++- .../template/UpstreamLineageTemplateTest.java | 60 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java index be3fc4c1fc9836..d4e94e1e82e8f6 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java @@ -85,7 +85,8 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch // Skip first as it will always be blank due to path starting with / for (int i = 1; i < endIdx; i++) { if (parent.get(keys[i]) == null) { - ((ObjectNode) parent).set(keys[i], instance.objectNode()); + String decodedKey = decodeValue(keys[i]); + ((ObjectNode) parent).set(decodedKey, instance.objectNode()); } parent = parent.get(keys[i]); } @@ -93,4 +94,9 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch return transformedNodeClone; } + + /** Simply decode a JSON-patch encoded value * */ + private static String decodeValue(String value) { + return value.replace("~1", "/").replace("~0", "~"); + } } diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java index 8c7bfc98b2673f..5042c35d2f5d47 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java +++ 
b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java @@ -4,6 +4,7 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.linkedin.common.UrnArray; import com.linkedin.common.urn.Urn; @@ -254,4 +255,63 @@ public void testLargePatchStandard() throws Exception { assertEquals(result.getUpstreams().size(), 187, "Expected 1 less upstream"); assertEquals(result.getFineGrainedLineages().size(), 607); } + + @Test + public void testPatchWithFieldWithForwardSlash() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"; + String unescapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash/column)"; + String escapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash~1column)"; + String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn; + + UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate(); + UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build()); + + // Initial population test + UpstreamLineage result = + upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatchBuilder.build()); + + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), + unescapedUpstreamUrn); + } + + @Test + public void testPatchWithFieldWithTilde() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"; + String unescapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~column)"; + String escapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~0column)"; + String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn; + + UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate(); + UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build()); + + // Initial population test + UpstreamLineage result = + upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatchBuilder.build()); + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), + unescapedUpstreamUrn); + } } From ae4ca4b6eb16d6710ef911e68f44c2bb48e28236 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Mon, 1 Jul 2024 10:32:01 -0500 Subject: [PATCH 19/33] fix(plugin): include ancestors 
when loading Spring custom plugin (#10809) --- .../datahub/plugins/metadata/aspect/SpringPluginFactory.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java b/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java index dcedbec50b7938..043b0016abaaae 100644 --- a/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java +++ b/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java @@ -14,6 +14,7 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.BeanFactoryUtils; import org.springframework.context.ApplicationContext; import org.springframework.context.annotation.AnnotationConfigApplicationContext; @@ -109,7 +110,9 @@ protected List build( final List plugins; if (config.getSpring().getName() == null) { plugins = - springApplicationContext.getBeansOfType(clazz).values().stream() + BeanFactoryUtils.beansOfTypeIncludingAncestors(springApplicationContext, clazz) + .values() + .stream() .map(plugin -> (T) plugin) .collect(Collectors.toList()); } else { From 9c5bea34cdc43ff9ef2299a69f5b30fc821634f1 Mon Sep 17 00:00:00 2001 From: Brandon Brown Date: Mon, 1 Jul 2024 12:11:50 -0400 Subject: [PATCH 20/33] =?UTF-8?q?feat(docker/quickstart):=20Adding=20in=20?= =?UTF-8?q?support=20for=20overriding=20the=20conflue=E2=80=A6=20(#10533)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker/quickstart/docker-compose-m1.quickstart.yml | 6 +++--- .../docker-compose-without-neo4j-m1.quickstart.yml | 6 +++--- .../quickstart/docker-compose-without-neo4j.quickstart.yml | 6 +++--- docker/quickstart/docker-compose.quickstart.yml | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 25779d7f058ea1..834d55096468f6 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -24,7 +24,7 @@ services: test: nc -z broker $${DATAHUB_KAFKA_BROKER_PORT:-9092} timeout: 5s hostname: broker - image: confluentinc/cp-kafka:7.4.0 + image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092 volumes: @@ -271,7 +271,7 @@ services: test: nc -z schema-registry ${DATAHUB_SCHEMA_REGISTRY_PORT:-8081} timeout: 5s hostname: schema-registry - image: confluentinc/cp-schema-registry:7.4.0 + image: ${DATAHUB_CONFLUENT_SCHEMA_REGISTRY_IMAGE:-confluentinc/cp-schema-registry}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081 zookeeper: @@ -285,7 +285,7 @@ services: test: echo srvr | nc zookeeper $${DATAHUB_ZK_PORT:-2181} timeout: 5s hostname: zookeeper - image: confluentinc/cp-zookeeper:7.4.0 + image: ${DATAHUB_CONFLUENT_ZOOKEEPER_IMAGE:-confluentinc/cp-zookeeper}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 8e3b4b166ce9c1..47fb50f78e4f0c 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ 
b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -24,7 +24,7 @@ services: test: nc -z broker $${DATAHUB_KAFKA_BROKER_PORT:-9092} timeout: 5s hostname: broker - image: confluentinc/cp-kafka:7.4.0 + image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092 volumes: @@ -245,7 +245,7 @@ services: test: nc -z schema-registry ${DATAHUB_SCHEMA_REGISTRY_PORT:-8081} timeout: 5s hostname: schema-registry - image: confluentinc/cp-schema-registry:7.4.0 + image: ${DATAHUB_CONFLUENT_SCHEMA_REGISTRY_IMAGE:-confluentinc/cp-schema-registry}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081 zookeeper: @@ -259,7 +259,7 @@ services: test: echo srvr | nc zookeeper $${DATAHUB_ZK_PORT:-2181} timeout: 5s hostname: zookeeper - image: confluentinc/cp-zookeeper:7.4.0 + image: ${DATAHUB_CONFLUENT_ZOOKEEPER_IMAGE:-confluentinc/cp-zookeeper}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index d2c2be7e8948d2..3fa13a9e56b421 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -24,7 +24,7 @@ services: test: nc -z broker $${DATAHUB_KAFKA_BROKER_PORT:-9092} timeout: 5s hostname: broker - image: confluentinc/cp-kafka:7.4.0 + image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092 volumes: @@ -245,7 +245,7 @@ services: test: nc -z schema-registry ${DATAHUB_SCHEMA_REGISTRY_PORT:-8081} timeout: 5s hostname: schema-registry - image: confluentinc/cp-schema-registry:7.4.0 + image: ${DATAHUB_CONFLUENT_SCHEMA_REGISTRY_IMAGE:-confluentinc/cp-schema-registry}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081 zookeeper: @@ -259,7 +259,7 @@ services: test: echo srvr | nc zookeeper $${DATAHUB_ZK_PORT:-2181} timeout: 5s hostname: zookeeper - image: confluentinc/cp-zookeeper:7.4.0 + image: ${DATAHUB_CONFLUENT_ZOOKEEPER_IMAGE:-confluentinc/cp-zookeeper}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index dfe3c5f4f905a6..c63b6d1d61b030 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -24,7 +24,7 @@ services: test: nc -z broker $${DATAHUB_KAFKA_BROKER_PORT:-9092} timeout: 5s hostname: broker - image: confluentinc/cp-kafka:7.4.0 + image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092 volumes: @@ -271,7 +271,7 @@ services: test: nc -z schema-registry ${DATAHUB_SCHEMA_REGISTRY_PORT:-8081} timeout: 5s hostname: schema-registry - image: confluentinc/cp-schema-registry:7.4.0 + image: ${DATAHUB_CONFLUENT_SCHEMA_REGISTRY_IMAGE:-confluentinc/cp-schema-registry}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081 zookeeper: @@ -285,7 +285,7 @@ services: test: echo srvr | nc zookeeper $${DATAHUB_ZK_PORT:-2181} timeout: 5s hostname: zookeeper - image: confluentinc/cp-zookeeper:7.4.0 + image: 
${DATAHUB_CONFLUENT_ZOOKEEPER_IMAGE:-confluentinc/cp-zookeeper}:${DATAHUB_CONFLUENT_VERSION:-7.4.0} ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: From c7f83a3df403a90f2f4118404f01006b23654de0 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Mon, 1 Jul 2024 09:59:44 -0700 Subject: [PATCH 21/33] feat(ui): Add support for structured reporting of warnings and failures in the UI ingestion flow (ingest uplift 2/2) (#10790) Co-authored-by: John Joyce Co-authored-by: John Joyce Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../ingest/source/IngestionSourceTable.tsx | 4 +- .../ExecutionRequestDetailsModal.tsx | 35 ++-- .../executions/IngestionExecutionTable.tsx | 4 +- .../executions/reporting/StructuredReport.tsx | 44 ++++ .../reporting/StructuredReportItem.tsx | 81 ++++++++ .../reporting/StructuredReportItemContext.tsx | 44 ++++ .../reporting/StructuredReportItemList.tsx | 42 ++++ .../src/app/ingest/source/types.ts | 195 ++++++++++++++++++ .../src/app/ingest/source/utils.ts | 172 +++++++++++++-- .../src/app/shared/ShowMoreSection.tsx | 31 +++ .../src/graphql/ingestion.graphql | 8 + 11 files changed, 629 insertions(+), 31 deletions(-) create mode 100644 datahub-web-react/src/app/ingest/source/executions/reporting/StructuredReport.tsx create mode 100644 datahub-web-react/src/app/ingest/source/executions/reporting/StructuredReportItem.tsx create mode 100644 datahub-web-react/src/app/ingest/source/executions/reporting/StructuredReportItemContext.tsx create mode 100644 datahub-web-react/src/app/ingest/source/executions/reporting/StructuredReportItemList.tsx create mode 100644 datahub-web-react/src/app/ingest/source/types.ts create mode 100644 datahub-web-react/src/app/shared/ShowMoreSection.tsx diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceTable.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceTable.tsx index ad1e7f6425062a..00d04ed245edfa 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceTable.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceTable.tsx @@ -3,7 +3,7 @@ import React from 'react'; import styled from 'styled-components/macro'; import { StyledTable } from '../../entity/shared/components/styled/StyledTable'; import { ANTD_GRAY } from '../../entity/shared/constants'; -import { CLI_EXECUTOR_ID } from './utils'; +import { CLI_EXECUTOR_ID, getIngestionSourceStatus } from './utils'; import { LastStatusColumn, TypeColumn, @@ -123,7 +123,7 @@ function IngestionSourceTable({ lastExecStatus: source.executions && source.executions?.executionRequests.length > 0 && - source.executions?.executionRequests[0].result?.status, + getIngestionSourceStatus(source.executions?.executionRequests[0].result), cliIngestion: source.config?.executorId === CLI_EXECUTOR_ID, })); diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx index 0799f8af1173dc..6711f0ad12b03c 100644 --- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx +++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx @@ -13,9 +13,13 @@ import { getExecutionRequestStatusDisplayText, getExecutionRequestStatusIcon, getExecutionRequestSummaryText, + getIngestionSourceStatus, + getStructuredReport, RUNNING, SUCCESS, } from '../utils'; +import { ExecutionRequestResult } from '../../../../types.generated'; +import { StructuredReport } from 
'./reporting/StructuredReport'; const StyledTitle = styled(Typography.Title)` padding: 0px; @@ -125,26 +129,30 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => { }; const logs = (showExpandedLogs && output) || output?.split('\n').slice(0, 5).join('\n'); - const result = data?.executionRequest?.result?.status; + const result = data?.executionRequest?.result as Partial; + const status = getIngestionSourceStatus(result); useEffect(() => { const interval = setInterval(() => { - if (result === RUNNING) refetch(); + if (status === RUNNING) refetch(); }, 2000); return () => clearInterval(interval); }); - const ResultIcon = result && getExecutionRequestStatusIcon(result); - const resultColor = result && getExecutionRequestStatusDisplayColor(result); - const resultText = result && ( + const ResultIcon = status && getExecutionRequestStatusIcon(status); + const resultColor = status && getExecutionRequestStatusDisplayColor(status); + const resultText = status && ( {ResultIcon && } - {getExecutionRequestStatusDisplayText(result)} + {getExecutionRequestStatusDisplayText(status)} ); + + const structuredReport = result && getStructuredReport(result); + const resultSummaryText = - (result && {getExecutionRequestSummaryText(result)}) || + (status && {getExecutionRequestSummaryText(status)}) || undefined; const recipeJson = data?.executionRequest?.input.arguments?.find((arg) => arg.key === 'recipe')?.value; @@ -167,21 +175,22 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => { bodyStyle={modalBodyStyle} title={ - Ingestion Run Details + Sync Details } visible={visible} onCancel={onClose} > - {!data && loading && } - {error && message.error('Failed to load execution details :(')} + {!data && loading && } + {error && message.error('Failed to load sync details :(')}
Status {resultText} {resultSummaryText} + {structuredReport ? : null} - {result === SUCCESS && ( + {status === SUCCESS && ( {data?.executionRequest?.id && } @@ -190,7 +199,7 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => { Logs - View logs that were collected during the ingestion run. + View logs that were collected during the sync.
@@ -191,6 +209,7 @@ export const CreateScheduleStep = ({ state, updateState, goTo, prev }: StepProps data-testid="ingestion-schedule-next-button" disabled={!interval || interval.length === 0 || cronAsText.error} onClick={onClickNext} + type="primary" > Next diff --git a/datahub-web-react/src/app/ingest/source/builder/DataPlatformCard.tsx b/datahub-web-react/src/app/ingest/source/builder/DataPlatformCard.tsx new file mode 100644 index 00000000000000..34efbb30008294 --- /dev/null +++ b/datahub-web-react/src/app/ingest/source/builder/DataPlatformCard.tsx @@ -0,0 +1,68 @@ +import React from 'react'; +import { Button, Image } from 'antd'; +import styled from 'styled-components'; + +import { REDESIGN_COLORS } from '../../../entity/shared/constants'; + +const Container = styled(Button)` + padding: 32px; + height: 200px; + display: flex; + justify-content: center; + border-radius: 8px; + align-items: start; + flex-direction: column; + border: 1px solid #e0e0e0; + background-color: #ffffff; + &&:hover { + border: 1px solid ${REDESIGN_COLORS.BLUE}; + background-color: #ffffff; + } + white-space: unset; +`; + +const PlatformLogo = styled(Image)` + max-height: 32px; + height: 32px; + width: auto; + object-fit: contain; + background-color: transparent; +`; + +const LogoContainer = styled.div` + margin-bottom: 14px; +`; + +const Title = styled.div` + word-break: break-word; + color: #464646; + font-weight: bold; + font-size: 16px; + margin-bottom: 8px; +`; + +const Description = styled.div` + word-break: break-word; + text-align: left; + color: #7c7c7c; +`; + +type Props = { + logoUrl?: string; + logoComponent?: React.ReactNode; + name: string; + description?: string; + onClick?: () => void; +}; + +export const DataPlatformCard = ({ logoUrl, logoComponent, name, description, onClick }: Props) => { + return ( + + + {(logoUrl && ) || logoComponent} + + {name} + {description} + + ); +}; diff --git a/datahub-web-react/src/app/ingest/source/builder/DefineRecipeStep.tsx b/datahub-web-react/src/app/ingest/source/builder/DefineRecipeStep.tsx index 4ff4623b548c92..c16193b061b793 100644 --- a/datahub-web-react/src/app/ingest/source/builder/DefineRecipeStep.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/DefineRecipeStep.tsx @@ -164,7 +164,7 @@ export const DefineRecipeStep = ({ state, updateState, goTo, prev, ingestionSour - diff --git a/datahub-web-react/src/app/ingest/source/builder/IngestionDocumentationHint.tsx b/datahub-web-react/src/app/ingest/source/builder/IngestionDocumentationHint.tsx new file mode 100644 index 00000000000000..bda3d7f7424afd --- /dev/null +++ b/datahub-web-react/src/app/ingest/source/builder/IngestionDocumentationHint.tsx @@ -0,0 +1,68 @@ +import React from 'react'; +import styled from 'styled-components'; +import { Button, Tooltip } from 'antd'; +import { CloseOutlined } from '@ant-design/icons'; + +import { SourceConfig } from './types'; +import { ANTD_GRAY } from '../../../entity/shared/constants'; + +const Container = styled.div` + background-color: #ffffff; + border-radius: 8px; + padding: 12px 12px 16px 24px; + border: 1px solid #e0e0e0; + margin-bottom: 20px; +`; + +const Header = styled.div` + display: flex; + align-items: center; + justify-content: space-between; + margin-bottom: 12px; +`; + +const Title = styled.div` + font-size: 16px; + font-weight: bold; +`; + +const Description = styled.div` + font-size: 14px; + max-width: 90%; +`; + +const StyledCloseOutlined = styled(CloseOutlined)` + color: ${ANTD_GRAY[6]}; +`; + +interface Props { + sourceConfigs: 
SourceConfig; + onHide: () => void; +} + +export const IngestionDocumentationHint = ({ sourceConfigs, onHide }: Props) => { + const { displayName, docsUrl } = sourceConfigs; + return ( + +
+ Let's get connected! 🎉 + +
+ +
+ To import from {displayName}, we'll need some more information to connect to your instance. +
+
+ Check out the{' '} + + {displayName} Guide + {' '} + to understand the prerequisites, learn about available settings, and view examples to help connect + to the data source. +
+
+
+ ); +}; diff --git a/datahub-web-react/src/app/ingest/source/builder/IngestionSourceBuilderModal.tsx b/datahub-web-react/src/app/ingest/source/builder/IngestionSourceBuilderModal.tsx index 5a623b58af5c96..a41a8ec0f12ab9 100644 --- a/datahub-web-react/src/app/ingest/source/builder/IngestionSourceBuilderModal.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/IngestionSourceBuilderModal.tsx @@ -1,8 +1,7 @@ -import { Button, Modal, Steps, Typography } from 'antd'; +import { Modal, Steps, Typography } from 'antd'; import React, { useEffect, useRef, useState } from 'react'; import styled from 'styled-components'; import { isEqual } from 'lodash'; -import { ExpandAltOutlined, ShrinkOutlined } from '@ant-design/icons'; import { SourceBuilderState, StepProps } from './types'; import { CreateScheduleStep } from './CreateScheduleStep'; import { DefineRecipeStep } from './DefineRecipeStep'; @@ -10,15 +9,18 @@ import { NameSourceStep } from './NameSourceStep'; import { SelectTemplateStep } from './SelectTemplateStep'; import sourcesJson from './sources.json'; -const ExpandButton = styled(Button)` - && { - margin-right: 32px; +const StyledModal = styled(Modal)` + && .ant-modal-content { + border-radius: 16px; + overflow: hidden; + min-width: 400px; } `; const TitleContainer = styled.div` display: flex; justify-content: space-between; + border-radius: 12px; `; const StepsContainer = styled.div` @@ -31,9 +33,9 @@ const StepsContainer = styled.div` * Mapping from the step type to the title for the step */ export enum IngestionSourceBuilderStepTitles { - SELECT_TEMPLATE = 'Choose Type', - DEFINE_RECIPE = 'Configure Recipe', - CREATE_SCHEDULE = 'Schedule Ingestion', + SELECT_TEMPLATE = 'Choose Data Source', + DEFINE_RECIPE = 'Configure Connection', + CREATE_SCHEDULE = 'Sync Schedule', NAME_SOURCE = 'Finish up', } @@ -57,6 +59,8 @@ export enum IngestionSourceBuilderStep { NAME_SOURCE = 'NAME_SOURCE', } +const modalBodyStyle = { padding: '16px 24px 16px 24px', backgroundColor: '#F6F6F6' }; + type Props = { initialState?: SourceBuilderState; visible: boolean; @@ -66,14 +70,17 @@ type Props = { export const IngestionSourceBuilderModal = ({ initialState, visible, onSubmit, onCancel }: Props) => { const isEditing = initialState !== undefined; - const titleText = isEditing ? 'Edit Ingestion Source' : 'New Ingestion Source'; + const titleText = isEditing ? 'Edit Data Source' : 'Connect Data Source'; const initialStep = isEditing ? IngestionSourceBuilderStep.DEFINE_RECIPE : IngestionSourceBuilderStep.SELECT_TEMPLATE; const [stepStack, setStepStack] = useState([initialStep]); - const [modalExpanded, setModalExpanded] = useState(false); - const [ingestionBuilderState, setIngestionBuilderState] = useState({}); + const [ingestionBuilderState, setIngestionBuilderState] = useState({ + schedule: { + interval: '0 0 * * *', + }, + }); const ingestionSources = JSON.parse(JSON.stringify(sourcesJson)); // TODO: replace with call to server once we have access to dynamic list of sources @@ -122,28 +129,28 @@ export const IngestionSourceBuilderModal = ({ initialState, visible, onSubmit, o const StepComponent: React.FC = IngestionSourceBuilderStepComponent[currentStep]; return ( - {titleText} - setModalExpanded(!modalExpanded)}> - {(modalExpanded && ) || } - } style={{ top: 40 }} + bodyStyle={modalBodyStyle} visible={visible} onCancel={onCancel} > - - - {Object.keys(IngestionSourceBuilderStep).map((item) => ( - - ))} - - + {currentStepIndex > 0 ? 
( + + + {Object.keys(IngestionSourceBuilderStep).map((item) => ( + + ))} + + + ) : null} - + ); }; diff --git a/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx b/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx index 5573e5a3e39040..898fbd6a6d9268 100644 --- a/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx @@ -1,7 +1,8 @@ -import { Button, Checkbox, Collapse, Form, Input, Typography } from 'antd'; +import { Button, Checkbox, Collapse, Form, Input, Tooltip, Typography } from 'antd'; import React from 'react'; import styled from 'styled-components'; import { SourceBuilderState, StepProps, StringMapEntryInput } from './types'; +import { RequiredFieldForm } from '../../../shared/form/RequiredFieldForm'; const ControlsContainer = styled.div` display: flex; @@ -156,7 +157,7 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) return ( <> -
+ - Give this ingestion source a name. + Give this data source a name - +
@@ -263,13 +264,15 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) > Save - + + +
diff --git a/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx b/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx index 880420386fa67e..2d0bfe340c5065 100644 --- a/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx @@ -10,6 +10,7 @@ import { SourceBuilderState, SourceConfig } from './types'; import { CSV, LOOKER, LOOK_ML } from './constants'; import { LookerWarning } from './LookerWarning'; import { CSVInfo } from './CSVInfo'; +import { IngestionDocumentationHint } from './IngestionDocumentationHint'; export const ControlsContainer = styled.div` display: flex; @@ -66,6 +67,7 @@ function RecipeBuilder(props: Props) { const { state, isEditing, displayRecipe, sourceConfigs, setStagedRecipe, onClickNext, goToPrevious } = props; const { type } = state; const [isViewingForm, setIsViewingForm] = useState(true); + const [hideDocsHint, setHideDocsHint] = useState(false); function switchViews(isFormView: boolean) { try { @@ -81,12 +83,14 @@ function RecipeBuilder(props: Props) { return (
+ {!hideDocsHint && isViewingForm && sourceConfigs ? ( + setHideDocsHint(true)} sourceConfigs={sourceConfigs} /> + ) : null} {(type === LOOKER || type === LOOK_ML) && } {type === CSV && } - - {sourceConfigs?.displayName} Recipe + {sourceConfigs?.displayName} Details Previous - diff --git a/datahub-web-react/src/app/ingest/source/builder/RecipeForm/RecipeForm.tsx b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/RecipeForm.tsx index bdee01d6498ee7..4199658568b9a6 100644 --- a/datahub-web-react/src/app/ingest/source/builder/RecipeForm/RecipeForm.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/RecipeForm.tsx @@ -1,9 +1,11 @@ -import { Button, Collapse, Form, message, Tooltip, Typography } from 'antd'; import React, { Fragment } from 'react'; + +import { Button, Collapse, Form, message, Tooltip, Typography } from 'antd'; import { get } from 'lodash'; import YAML from 'yamljs'; import { ApiOutlined, FilterOutlined, QuestionCircleOutlined, SettingOutlined } from '@ant-design/icons'; import styled from 'styled-components/macro'; + import { jsonToYaml } from '../../utils'; import { CONNECTORS_WITH_TEST_CONNECTION, RecipeSections, RECIPE_FIELDS } from './constants'; import FormField from './FormField'; @@ -11,6 +13,7 @@ import TestConnectionButton from './TestConnection/TestConnectionButton'; import { useListSecretsQuery } from '../../../../../graphql/ingestion.generated'; import { RecipeField, setFieldValueOnRecipe } from './common'; import { SourceBuilderState, SourceConfig } from '../types'; +import { RequiredFieldForm } from '../../../../shared/form/RequiredFieldForm'; export const ControlsContainer = styled.div` display: flex; @@ -140,7 +143,7 @@ function RecipeForm(props: Props) { } return ( -
} - text="Advanced" + text="Settings" sectionTooltip={advancedSectionTooltip} /> } @@ -230,9 +233,11 @@ function RecipeForm(props: Props) { - + -
+ ); } diff --git a/datahub-web-react/src/app/ingest/source/builder/RecipeForm/common.tsx b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/common.tsx index 43d899301c2fc7..cbaf2f4d87991f 100644 --- a/datahub-web-react/src/app/ingest/source/builder/RecipeForm/common.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/common.tsx @@ -276,7 +276,7 @@ export const INCLUDE_LINEAGE: RecipeField = { export const INCLUDE_TABLE_LINEAGE: RecipeField = { name: 'include_table_lineage', label: 'Include Table Lineage', - tooltip: 'Extract Tabel-Level lineage metadata. Enabling this may increase the duration of the extraction process.', + tooltip: 'Extract Table-Level lineage metadata. Enabling this may increase the duration of the sync.', type: FieldType.BOOLEAN, fieldPath: 'source.config.include_table_lineage', rules: null, @@ -286,8 +286,7 @@ const isProfilingEnabledFieldPath = 'source.config.profiling.enabled'; export const TABLE_PROFILING_ENABLED: RecipeField = { name: 'profiling.enabled', label: 'Enable Table Profiling', - tooltip: - 'Generate Data Profiles for extracted Tables. Enabling this may increase the duration of the extraction process.', + tooltip: 'Generate Data Profiles for extracted Tables. Enabling this may increase the duration of the sync.', type: FieldType.BOOLEAN, fieldPath: isProfilingEnabledFieldPath, rules: null, @@ -298,7 +297,7 @@ export const COLUMN_PROFILING_ENABLED: RecipeField = { name: 'column_profiling.enabled', label: 'Enable Column Profiling', tooltip: - 'Generate Data Profiles for the Columns in extracted Tables. Enabling this may increase the duration of the extraction process.', + 'Generate Data Profiles for the Columns in extracted Tables. Enabling this may increase the duration of the sync.', type: FieldType.BOOLEAN, fieldPath: isTableProfilingOnlyFieldPath, rules: null, @@ -466,7 +465,7 @@ export const START_TIME: RecipeField = { name: 'start_time', label: 'Start Time', tooltip: - 'Earliest date used when processing audit logs for lineage, usage, and more. Default: Last full day in UTC or last time DataHub ingested usage (if stateful ingestion is enabled). Tip: Set this to an older date (e.g. 1 month ago) to bootstrap your first ingestion run, and then reduce for subsequent runs. Changing this may increase the duration of the extraction process.', + 'Earliest date used when processing audit logs for lineage, usage, and more. Default: Last full day in UTC or last time DataHub ingested usage (if stateful ingestion is enabled). Tip: Set this to an older date (e.g. 1 month ago) to bootstrap your first ingestion run, and then reduce for subsequent runs. 
Changing this may increase the duration of the sync.', placeholder: 'Select date and time', type: FieldType.DATE, fieldPath: startTimeFieldPath, diff --git a/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx b/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx index 6b771d459c4ef9..3998915e07a2ce 100644 --- a/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx @@ -1,39 +1,60 @@ +import React, { useState } from 'react'; + import { Button, Input } from 'antd'; import { FormOutlined, SearchOutlined } from '@ant-design/icons'; -import React, { useState } from 'react'; import styled from 'styled-components'; -import { LogoCountCard } from '../../../shared/LogoCountCard'; import { SourceConfig, SourceBuilderState, StepProps } from './types'; import { IngestionSourceBuilderStep } from './steps'; import useGetSourceLogoUrl from './useGetSourceLogoUrl'; import { CUSTOM } from './constants'; import { ANTD_GRAY } from '../../../entity/shared/constants'; +import { DataPlatformCard } from './DataPlatformCard'; -const Section = styled.div` +const Container = styled.div` + max-height: 82vh; display: flex; flex-direction: column; - padding-bottom: 12px; `; -const PlatformListContainer = styled.div` +const Section = styled.div` display: flex; - justify-content: left; - align-items: center; - flex-wrap: wrap; + flex-direction: column; + padding-bottom: 12px; + overflow: hidden; `; const CancelButton = styled(Button)` - && { - margin-left: 12px; - } + max-width: 120px; +`; + +const SearchBarContainer = styled.div` + display: flex; + justify-content: end; + width: auto; + padding-right: 12px; `; const StyledSearchBar = styled(Input)` background-color: white; - border-radius: 70px; + border-radius: 8px; box-shadow: 0px 0px 30px 0px rgb(239 239 239); - width: 45%; - margin: 0 0 15px 12px; + border: 1px solid #e0e0e0; + margin: 0 0 15px 0px; + max-width: 300px; + font-size: 16px; +`; + +const StyledSearchOutlined = styled(SearchOutlined)` + color: #a9adbd; +`; + +const PlatformListContainer = styled.div` + display: grid; + grid-template-columns: repeat(auto-fill, minmax(min(100%, 31%), 1fr)); + gap: 10px; + height: 100%; + overflow-y: auto; + padding-right: 12px; `; interface SourceOptionProps { @@ -42,7 +63,7 @@ interface SourceOptionProps { } function SourceOption({ source, onClick }: SourceOptionProps) { - const { name, displayName } = source; + const { name, displayName, description } = source; const logoUrl = useGetSourceLogoUrl(name); let logoComponent; @@ -50,7 +71,15 @@ function SourceOption({ source, onClick }: SourceOptionProps) { logoComponent = ; } - return ; + return ( + + ); } /** @@ -76,22 +105,24 @@ export const SelectTemplateStep = ({ state, updateState, goTo, cancel, ingestion ); return ( - <> +
- setSearchFilter(e.target.value)} - allowClear - prefix={} - /> - + + setSearchFilter(e.target.value)} + allowClear + prefix={} + /> + + {filteredSources.map((source) => ( onSelectTemplate(source.name)} /> ))}
Cancel - +
); }; diff --git a/datahub-web-react/src/app/ingest/source/builder/TimezoneSelect.tsx b/datahub-web-react/src/app/ingest/source/builder/TimezoneSelect.tsx index d9f3df1fc99299..21731b69cf46b6 100644 --- a/datahub-web-react/src/app/ingest/source/builder/TimezoneSelect.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/TimezoneSelect.tsx @@ -1,21 +1,28 @@ import { Select } from 'antd'; import React from 'react'; import moment from 'moment-timezone'; +import styled from 'styled-components'; + +const StyledSelect = styled(Select)` + max-width: 300px; +`; type Props = { value: string; - onChange: (newTimezone: string) => void; + onChange: (newTimezone: any) => void; }; export const TimezoneSelect = ({ value, onChange }: Props) => { const timezones = moment.tz.names(); return ( <> - + ); }; diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index d4faf82a20605a..c35a7a033a8ab3 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -3,55 +3,63 @@ "urn": "urn:li:dataPlatform:bigquery", "name": "bigquery", "displayName": "BigQuery", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/bigquery/", + "description": "Import Projects, Datasets, Tables, Views, lineage, queries, and statistics from BigQuery.", + "docsUrl": "https://datahubproject.io/docs/quick-ingestion-guides/bigquery/overview", "recipe": "source:\n type: bigquery\n config:\n include_table_lineage: true\n include_usage_statistics: true\n include_tables: true\n include_views: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, { "urn": "urn:li:dataPlatform:redshift", "name": "redshift", "displayName": "Redshift", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/redshift/", + "description": "Import Tables, Views, Databases, Schemas, lineage, queries, and statistics from Redshift.", + "docsUrl": "https://datahubproject.io/docs/quick-ingestion-guides/redshift/overview", "recipe": "source: \n type: redshift\n config:\n # Coordinates\n host_port: # Your Redshift host and post, e.g. example.something.us-west-2.redshift.amazonaws.com:5439\n database: # Your Redshift database, e.g. SampleDatabase\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: null # Your Redshift username, e.g. 
admin\n\n table_lineage_mode: stl_scan_based\n include_table_lineage: true\n include_tables: true\n include_views: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, { "urn": "urn:li:dataPlatform:snowflake", "name": "snowflake", "displayName": "Snowflake", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/snowflake/", + "description": "Import Tables, Views, Databases, Schemas, lineage, queries, and statistics from Snowflake.", + "docsUrl": "https://datahubproject.io/docs/quick-ingestion-guides/snowflake/overview", "recipe": "source: \n type: snowflake\n config:\n account_id: null\n include_table_lineage: true\n include_view_lineage: true\n include_tables: true\n include_views: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, { - "urn": "urn:li:dataPlatform:kafka", - "name": "kafka", - "displayName": "Kafka", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/kafka/", - "recipe": "source:\n type: kafka\n config:\n connection:\n consumer_config:\n security.protocol: \"PLAINTEXT\"\n stateful_ingestion:\n enabled: false" + "urn": "urn:li:dataPlatform:unity-catalog", + "name": "unity-catalog", + "displayName": "Databricks", + "description": "Import Metastores, Schemas, Tables, lineage, queries, and statistics from Databricks Unity Catalog.", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/databricks/#module-unity-catalog", + "recipe": "source:\n type: unity-catalog\n config:\n # Coordinates\n workspace_url: null\n include_table_lineage: true\n include_column_lineage: false\n stateful_ingestion:\n enabled: true" }, { "urn": "urn:li:dataPlatform:looker", "name": "looker", "displayName": "Looker", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/looker/", - "recipe": "source:\n type: looker\n config:\n # Coordinates\n base_url: # Your Looker instance URL, e.g. https://company.looker.com:19999\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n client_id: null # Your Looker client id, e.g. admin\n stateful_ingestion:\n enabled: true" + "description": "Import Models, Explores, Views, Looks, Dashboards, and lineage from Looker.", + "docsUrl": "https://datahubproject.io/docs/quick-ingestion-guides/looker/overview#looker", + "recipe": "source:\n type: looker\n config:\n # Coosrdinates\n base_url: # Your Looker instance URL, e.g. https://company.looker.com:19999\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n client_id: null # Your Looker client id, e.g. 
admin\n stateful_ingestion:\n enabled: true" }, { "urn": "urn:li:dataPlatform:lookml", "name": "lookml", "displayName": "LookML", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/looker/#module-lookml", + "description": "Import Models, Explores, Views, Looks, Dashboards, and lineage from LookML files.", + "docsUrl": "https://datahubproject.io/docs/quick-ingestion-guides/looker/overview#lookml", "recipe": "source:\n type: lookml\n config:\n parse_table_names_from_sql: true\n stateful_ingestion:\n enabled: true" }, { "urn": "urn:li:dataPlatform:tableau", "name": "tableau", "displayName": "Tableau", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/tableau/", + "description": "Import Data Sources, Workbooks, Worksheets, Tags, Dashboards, and lineage from Tableau.", + "docsUrl": "https://datahubproject.io/docs/quick-ingestion-guides/tableau/overview", "recipe": "source:\n type: tableau\n config:\n # Coordinates\n connect_uri: null\n stateful_ingestion:\n enabled: true" }, { "urn": "urn:li:dataPlatform:powerbi", "name": "powerbi", "displayName": "PowerBI", + "description": "Import Dashboards, Tiles, Datasets, and lineage from PowerBI.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/powerbi/", "recipe": "source:\n type: \"powerbi\"\n config:\n # Your Power BI tenant identifier\n tenant_id: null\n # Your Power BI client id\n client_id: null\n # Your Power BI client secret\n client_secret: null\n stateful_ingestion:\n enabled: true" }, @@ -59,6 +67,7 @@ "urn": "urn:li:dataPlatform:dbt", "name": "dbt-cloud", "displayName": "dbt Cloud", + "description": "Import Sources, Seeds, Models, Snapshots, Tests, and lineage from dbt cloud.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/dbt/#module-dbt-cloud", "recipe": "source:\n type: dbt-cloud\n config:\n account_id: null\n project_id: null\n job_id: null\n target_platform: null\n stateful_ingestion:\n enabled: true" }, @@ -66,6 +75,7 @@ "urn": "urn:li:dataPlatform:mysql", "name": "mysql", "displayName": "MySQL", + "description": "Import Tables, Views, Databases, Schemas, and statistics from MySQL.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mysql/", "recipe": "source: \n type: mysql\n config: \n # Coordinates\n host_port: # Your MySQL host and post, e.g. mysql:3306\n database: # Your MySQL database name, e.g. datahub\n \n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: null # Your MySQL username, e.g. admin\n\n # Options\n include_tables: true\n include_views: true\n\n # Profiling\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, @@ -73,13 +83,23 @@ "urn": "urn:li:dataPlatform:postgres", "name": "postgres", "displayName": "Postgres", + "description": "Import Tables, Views, Databases, Schemas, and statistics from Postgres.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/postgres/", "recipe": "source: \n type: postgres\n config:\n # Coordinates\n host_port: # Your Postgres host and port, e.g. postgres:5432\n database: # Your Postgres Database, e.g. sample_db\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: null # Your Postgres username, e.g. 
admin\n\n # Options\n include_tables: true\n include_views: true\n\n # Profiling\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, + { + "urn": "urn:li:dataPlatform:kafka", + "name": "kafka", + "displayName": "Kafka", + "description": "Import streaming topics from Kafka.", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/kafka/", + "recipe": "source:\n type: kafka\n config:\n connection:\n consumer_config:\n security.protocol: \"PLAINTEXT\"\n stateful_ingestion:\n enabled: false" + }, { "urn": "urn:li:dataPlatform:hive", "name": "hive", "displayName": "Hive", + "description": "Import Tables, Views, Databases, Schemas, and statistics from Hive.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/hive/", "recipe": "source: \n type: hive\n config:\n # Coordinates\n host_port: # Your Hive host and port, e.g. hive:10000\n database: # Your Hive database name, e.g. SampleDatabase (Optional, if not specified, ingests from all databases)\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: null # Your Hive username, e.g. admin\n stateful_ingestion:\n enabled: true" }, @@ -87,6 +107,7 @@ "urn": "urn:li:dataPlatform:presto", "name": "presto", "displayName": "Presto", + "description": "Import Tables, Databases, Schemas, and statistics from Presto.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/presto/", "recipe": "source:\n type: presto\n config:\n # Coordinates\n host_port: null\n # The name of the catalog from getting the usage\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, @@ -94,13 +115,23 @@ "urn": "urn:li:dataPlatform:trino", "name": "trino", "displayName": "Trino", + "description": "Import Tables, Databases, Schemas, and statistics from Trino.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/trino/", "recipe": "source:\n type: trino\n config:\n # Coordinates\n host_port: null\n # The name of the catalog from getting the usage\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, + { + "urn": "urn:li:dataPlatform:glue", + "name": "glue", + "displayName": "Glue", + "description": "Import Tables, Databases, Jobs, statistics, and lineage to S3 from AWS Glue.", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/glue/", + "recipe": "source:\n type: glue\n config:\n # AWS credentials. \n aws_region: # The region for your AWS Glue instance. \n # Add secret in Secrets Tab with relevant names for each variable\n # The access key for your AWS account.\n aws_access_key_id: \"${AWS_ACCESS_KEY_ID}\"\n # The secret key for your AWS account.\n aws_secret_access_key: \"${AWS_SECRET_KEY}\"\n aws_session_token: # The session key for your AWS account. 
This is only needed when you are using temporary credentials.\n # aws_role: # (Optional) The role to assume (Role chaining supported by using a sorted list).\n\n # Allow / Deny specific databases & tables\n # database_pattern:\n # allow:\n # - \"flights-database\"\n # table_pattern:\n # allow:\n # - \"avro\"" + }, { "urn": "urn:li:dataPlatform:mssql", "name": "mssql", "displayName": "Microsoft SQL Server", + "description": "Import Tables, Views, Databases, Schemas, and statistics from SQL Server.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mssql/", "recipe": "source:\n type: mssql\n config:\n # Coordinates\n host_port: null\n # The name\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, @@ -108,20 +139,15 @@ "urn": "urn:li:dataPlatform:mariadb", "name": "mariadb", "displayName": "MariaDB", + "description": "Import Tables, Views, Databases, Schemas, and statistics from MariaDB.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mariadb/", "recipe": "source:\n type: mariadb\n config:\n # Coordinates\n host_port: null\n # The name\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, - { - "urn": "urn:li:dataPlatform:unity-catalog", - "name": "unity-catalog", - "displayName": "Databricks Unity Catalog", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/databricks/#module-unity-catalog", - "recipe": "source:\n type: unity-catalog\n config:\n # Coordinates\n workspace_url: null\n include_table_lineage: true\n include_column_lineage: false\n stateful_ingestion:\n enabled: true" - }, { "urn": "urn:li:dataPlatform:mongodb", "name": "mongodb", "displayName": "MongoDB", + "description": "Import Databases and Collections from MongoDB.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mongodb/", "recipe": "source:\n type: mongodb\n config:\n # Coordinates\n connect_uri: # Your MongoDB connect URI, e.g. \"mongodb://localhost\"\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: \"${MONGO_USERNAME}\" # Your MongoDB username, e.g. admin\n password: \"${MONGO_PASSWORD}\" # Your MongoDB password, e.g. 
password_01\n\n # Options (recommended)\n enableSchemaInference: True\n useRandomSampling: True\n maxSchemaSize: 300" }, @@ -129,20 +155,15 @@ "urn": "urn:li:dataPlatform:dynamodb", "name": "dynamodb", "displayName": "DynamoDB", + "description": "Import Tables from DynamoDB.", "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", "recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # If there are items that have most representative fields of the table, users could use the\n # `include_table_item` option to provide a list of primary keys of the table in dynamodb format.\n # For each `region.table`, the list of primary keys can be at most 100.\n # We include these items in addition to the first 100 items in the table when we scan it.\n # include_table_item:\n # region.table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]" }, - { - "urn": "urn:li:dataPlatform:glue", - "name": "glue", - "displayName": "Glue", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/glue/", - "recipe": "source:\n type: glue\n config:\n # AWS credentials. \n aws_region: # The region for your AWS Glue instance. \n # Add secret in Secrets Tab with relevant names for each variable\n # The access key for your AWS account.\n aws_access_key_id: \"${AWS_ACCESS_KEY_ID}\"\n # The secret key for your AWS account.\n aws_secret_access_key: \"${AWS_SECRET_KEY}\"\n aws_session_token: # The session key for your AWS account. This is only needed when you are using temporary credentials.\n # aws_role: # (Optional) The role to assume (Role chaining supported by using a sorted list).\n\n # Allow / Deny specific databases & tables\n # database_pattern:\n # allow:\n # - \"flights-database\"\n # table_pattern:\n # allow:\n # - \"avro\"" - }, { "urn": "urn:li:dataPlatform:oracle", "name": "oracle", "displayName": "Oracle", + "description": "Import Databases, Schemas, Tables, Views, statistics, and lineage from Oracle.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/oracle/", "recipe": "source: \n type: oracle\n config:\n # Coordinates\n host_port: # Your Oracle host and port, e.g. oracle:5432\n database: # Your Oracle database name, e.g. sample_db\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: \"${ORACLE_USERNAME}\" # Your Oracle username, e.g. admin\n password: \"${ORACLE_PASSWORD}\" # Your Oracle password, e.g. password_01\n\n # Optional service name\n # service_name: # Your service name, e.g. 
svc # omit database if using this option" }, @@ -150,6 +171,7 @@ "urn": "urn:li:dataPlatform:superset", "name": "superset", "displayName": "Superset", + "description": "Import Charts and Dashboards from Superset", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/superset/", "recipe": "source:\n type: superset\n config:\n # Coordinates\n connect_uri: http://localhost:8088\n\n # Credentials\n username: user\n password: pass\n provider: ldap" }, @@ -157,6 +179,7 @@ "urn": "urn:li:dataPlatform:athena", "name": "athena", "displayName": "Athena", + "description": "Import Schemas, Tables, Views, and lineage to S3 from Athena.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/athena/", "recipe": "source:\n type: athena\n config:\n # Coordinates\n aws_region: my_aws_region\n work_group: primary\n\n # Options\n s3_staging_dir: \"s3://my_staging_athena_results_bucket/results/\"" }, @@ -164,6 +187,7 @@ "urn": "urn:li:dataPlatform:clickhouse", "name": "clickhouse", "displayName": "ClickHouse", + "description": "Import Tables, Views, Materialized Views, Dictionaries, statistics, queries, and lineage from ClickHouse.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/clickhouse/", "recipe": "source:\n type: clickhouse\n config:\n # Coordinates\n host_port: localhost:9000\n\n # Credentials\n username: user\n password: pass\n\n # Options\n platform_instance: DatabaseNameToBeIngested\n\n include_views: true # whether to include views, defaults to True\n include_tables: true # whether to include views, defaults to True\n\nsink:\n # sink configs\n\n#---------------------------------------------------------------------------\n# For the HTTP interface:\n#---------------------------------------------------------------------------\nsource:\n type: clickhouse\n config:\n host_port: localhost:8443\n protocol: https\n\n#---------------------------------------------------------------------------\n# For the Native interface:\n#---------------------------------------------------------------------------\n\nsource:\n type: clickhouse\n config:\n host_port: localhost:9440\n scheme: clickhouse+native\n secure: True" }, @@ -171,13 +195,23 @@ "urn": "urn:li:dataPlatform:druid", "name": "druid", "displayName": "Druid", + "description": "Import Databases, Schemas, Tables, statistics, and lineage from Druid.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/druid/", "recipe": "source:\n type: druid\n config:\n # Coordinates\n host_port: \"localhost:8082\"\n\n # Credentials\n username: admin\n password: password" }, + { + "urn": "urn:li:dataPlatform:mode", + "name": "mode", + "displayName": "Mode", + "description": "Import Reports, Charts, and lineage from Mode.", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mode/", + "recipe": "source:\n type: mode\n config:\n # Coordinates\n connect_uri: http://app.mode.com\n\n # Credentials\n token: token\n password: pass\n\n # Options\n workspace: \"datahub\"\n default_schema: \"public\"\n owner_username_instead_of_email: False\n api_options:\n retry_backoff_multiplier: 2\n max_retry_interval: 10\n max_attempts: 5" + }, { "urn": "urn:li:dataPlatform:metabase", "name": "metabase", "displayName": "Metabase", + "description": "Import Collections, Dashboards, and Charts from Metabase.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/metabase/", "recipe": "source:\n type: metabase\n config:\n # Coordinates\n connect_uri:\n\n # Credentials\n username: root\n 
password: example" }, @@ -185,20 +219,15 @@ "urn": "urn:li:dataPlatform:mlflow", "name": "mlflow", "displayName": "MLflow", + "description": "Import Registered Models, Model Versions, and Model Stages from MLflow.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mlflow/", "recipe": "source:\n type: mlflow\n config:\n tracking_uri: tracking_uri" }, - { - "urn": "urn:li:dataPlatform:mode", - "name": "mode", - "displayName": "Mode", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/mode/", - "recipe": "source:\n type: mode\n config:\n # Coordinates\n connect_uri: http://app.mode.com\n\n # Credentials\n token: token\n password: pass\n\n # Options\n workspace: \"datahub\"\n default_schema: \"public\"\n owner_username_instead_of_email: False\n api_options:\n retry_backoff_multiplier: 2\n max_retry_interval: 10\n max_attempts: 5" - }, { "urn": "urn:li:dataPlatform:azure-ad", "name": "azure-ad", "displayName": "Azure AD", + "description": "Import Users and Groups from Azure Active Directory.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/azure-ad/", "recipe": "source:\n type: azure-ad\n config:\n client_id: # Your Azure Client ID, e.g. \"00000000-0000-0000-0000-000000000000\"\n tenant_id: # Your Azure Tenant ID, e.g. \"00000000-0000-0000-0000-000000000000\"\n # Add secret in Secrets Tab with this name\n client_secret: \n redirect: # Your Redirect URL, e.g. \"https://login.microsoftonline.com/common/oauth2/nativeclient\"\n authority: # Your Authority URL, e.g. \"https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000\"\n token_url: # Your Token URL, e.g. \"https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000/oauth2/token\"\n graph_url: # The Graph URL, e.g. \"https://graph.microsoft.com/v1.0\"\n \n # Optional flags to ingest users, groups, or both\n ingest_users: True\n ingest_groups: True\n \n # Optional Allow / Deny extraction of particular Groups\n # groups_pattern:\n # allow:\n # - \".*\"\n\n # Optional Allow / Deny extraction of particular Users.\n # users_pattern:\n # allow:\n # - \".*\"" }, @@ -206,6 +235,7 @@ "urn": "urn:li:dataPlatform:okta", "name": "okta", "displayName": "Okta", + "description": "Import Users and Groups from Okta.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/okta/", "recipe": "source:\n type: okta\n config:\n # Coordinates\n okta_domain: # Your Okta Domain, e.g. \"dev-35531955.okta.com\"\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n okta_api_token: # Your Okta API Token, e.g. \"11be4R_M2MzDqXawbTHfKGpKee0kuEOfX1RCQSRx99\"\n\n # Optional flags to ingest users, groups, or both\n ingest_users: True\n ingest_groups: True\n\n # Optional: Customize the mapping to DataHub Username from an attribute appearing in the Okta User\n # profile. Reference: https://developer.okta.com/docs/reference/api/users/\n # okta_profile_to_username_attr: str = \"login\"\n # okta_profile_to_username_regex: str = \"([^@]+)\"\n \n # Optional: Customize the mapping to DataHub Group from an attribute appearing in the Okta Group\n # profile. 
Reference: https://developer.okta.com/docs/reference/api/groups/\n # okta_profile_to_group_name_attr: str = \"name\"\n # okta_profile_to_group_name_regex: str = \"(.*)\"\n \n # Optional: Include deprovisioned or suspended Okta users in the ingestion.\n # include_deprovisioned_users = False\n # include_suspended_users = False" }, @@ -213,6 +243,7 @@ "urn": "urn:li:dataPlatform:vertica", "name": "vertica", "displayName": "Vertica", + "description": "Import Databases, Schemas, Tables, Views, Projections, statistics, and lineage from Vertica.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/vertica/", "recipe": "source:\n type: vertica\n config:\n # Coordinates\n host_port: localhost:5433\n # The name of the vertica database\n database: Database_Name\n # Credentials\n username: Vertica_User\n password: Vertica_Password\n\n include_tables: true\n include_views: true\n include_projections: true\n include_models: true\n include_view_lineage: true\n include_projection_lineage: true\n profiling:\n enabled: false\n stateful_ingestion:\n enabled: true " }, @@ -220,42 +251,48 @@ "urn": "urn:li:dataPlatform:fivetran", "name": "fivetran", "displayName": "Fivetran", + "description": "Import Connectors, Destinations, Sync Histor, Users, and lineage from FiveTran.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/fivetran/", "recipe": "source:\n type: fivetran\n config:\n # Fivetran log connector destination server configurations\n fivetran_log_config:\n destination_platform: snowflake\n snowflake_destination_config:\n # Coordinates\n account_id: snowflake_account_id\n warehouse: warehouse_name\n database: snowflake_db\n log_schema: fivetran_log_schema\n\n # Credentials\n username: ${SNOWFLAKE_USER}\n password: ${SNOWFLAKE_PASS}\n role: snowflake_role\n\n # Optional - filter for certain connector names instead of ingesting everything.\n # connector_patterns:\n # allow:\n # - connector_name\n\n # Optional -- This mapping is optional and only required to configure platform-instance for source\n # A mapping of Fivetran connector id to data platform instance\n # sources_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV\n\n # Optional -- This mapping is optional and only required to configure platform-instance for destination.\n # A mapping of Fivetran destination id to data platform instance\n # destination_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV" }, { - "urn": "urn:li:dataPlatform:csv-enricher", - "name": "csv-enricher", - "displayName": "CSV", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv'", - "recipe": "source: \n type: csv-enricher \n config: \n # URL of your csv file to ingest \n filename: \n array_delimiter: '|' \n delimiter: ',' \n write_semantics: PATCH" - }, - { - "urn": "urn:li:dataPlatform:custom", - "name": "custom", - "displayName": "Other", - "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", - "recipe": "source:\n type: \n config:\n # Source-type specifics config\n " + "urn": "urn:li:dataPlatform:sigma", + "name": "sigma", + "displayName": "Sigma", + "description": "Import Workspaces, Workbooks, Pages, Elements, and lineage from Sigma Computing.", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/sigma/", + "recipe": "source:\n type: sigma\n config:\n # Coordinates\n api_url: https://aws-api.sigmacomputing.com/v2\n # Coordinates\n client_id: CLIENT_ID\n 
client_secret: CLIENT_SECRET\n\n # Optional - filter for certain workspace names instead of ingesting everything.\n # workspace_pattern:\n\n # allow:\n # - workspace_name\n ingest_owner: true" }, { "urn": "urn:li:dataPlatform:qlik-sense", "name": "qlik-sense", "displayName": "Qlik Sense", + "description": "Import Spaces, Apps, Sheets, Charts, and Datasets from Qlik Sense.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/qlik-sense/", "recipe": "source:\n type: qlik-sense\n config:\n # Coordinates\n tenant_hostname: https://xyz12xz.us.qlikcloud.com\n # Coordinates\n api_key: QLIK_API_KEY\n\n # Optional - filter for certain space names instead of ingesting everything.\n # space_pattern:\n\n # allow:\n # - space_name\n ingest_owner: true" }, - { - "urn": "urn:li:dataPlatform:sigma", - "name": "sigma", - "displayName": "Sigma", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/sigma/", - "recipe": "source:\n type: sigma\n config:\n # Coordinates\n api_url: https://aws-api.sigmacomputing.com/v2\n # Coordinates\n client_id: CLIENT_ID\n client_secret: CLIENT_SECRET\n\n # Optional - filter for certain workspace names instead of ingesting everything.\n # workspace_pattern:\n\n # allow:\n # - workspace_name\n ingest_owner: true" - }, { "urn": "urn:li:dataPlatform:cockroachdb", "name": "cockroachdb", "displayName": "CockroachDb", + "description": "Import Databases, Schemas, Tables, Views, statistics and lineage from CockroachDB.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cockroachdb/", "recipe": "source: \n type: cockroachdb\n config:\n # Coordinates\n host_port: # Your CockroachDb host and port, e.g. cockroachdb:5432\n database: # Your CockroachDb Database, e.g. sample_db\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: null # Your CockroachDb username, e.g. 
admin\n\n # Options\n include_tables: true\n include_views: true\n\n # Profiling\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" + }, + { + "urn": "urn:li:dataPlatform:csv-enricher", + "name": "csv-enricher", + "displayName": "CSV", + "description": "Import metadata from a formatted CSV.", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv'", + "recipe": "source: \n type: csv-enricher \n config: \n # URL of your csv file to ingest \n filename: \n array_delimiter: '|' \n delimiter: ',' \n write_semantics: PATCH" + }, + { + "urn": "urn:li:dataPlatform:custom", + "name": "custom", + "displayName": "Other", + "description": "Configure a custom recipe using YAML.", + "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", + "recipe": "source:\n type: \n config:\n # Source-type specifics config\n " } ] diff --git a/datahub-web-react/src/app/ingest/source/builder/types.ts b/datahub-web-react/src/app/ingest/source/builder/types.ts index 2df467b7beba1f..e42bd0b790b2c8 100644 --- a/datahub-web-react/src/app/ingest/source/builder/types.ts +++ b/datahub-web-react/src/app/ingest/source/builder/types.ts @@ -18,6 +18,7 @@ export interface SourceConfig { name: string; displayName: string; docsUrl: string; + description?: string; recipe: string; } diff --git a/datahub-web-react/src/app/shared/form/RequiredFieldForm.tsx b/datahub-web-react/src/app/shared/form/RequiredFieldForm.tsx new file mode 100644 index 00000000000000..d35af17e1b1ce6 --- /dev/null +++ b/datahub-web-react/src/app/shared/form/RequiredFieldForm.tsx @@ -0,0 +1,14 @@ +import { Form } from 'antd'; +import styled from 'styled-components'; + +const DEFAULT_ASTERICK_COLOR = '#F5222D'; + +export const RequiredFieldForm = styled(Form)<{ requiredColor?: string }>` + && { + .ant-form-item-label > label.ant-form-item-required::before { + color: ${(props) => + props.requiredColor || DEFAULT_ASTERICK_COLOR}; /* Change 'red' to any color you prefer */ + content: '*'; /* Ensure the asterisk is always used */ + } + } +`; diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js b/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js index 8f50262b41d2c2..470f9e2eec4617 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js @@ -13,7 +13,7 @@ describe("ingestion source creation flow", () => { cy.goToIngestionPage(); cy.clickOptionWithTestId("create-ingestion-source-button"); cy.clickOptionWithText("Snowflake"); - cy.waitTextVisible("Snowflake Recipe"); + cy.waitTextVisible("Snowflake Details"); cy.get("#account_id").type(accound_id); cy.get("#warehouse").type(warehouse_id); cy.get("#username").type(username); @@ -34,7 +34,7 @@ describe("ingestion source creation flow", () => { cy.clickOptionWithTestId("recipe-builder-next-button"); cy.waitTextVisible("Configure an Ingestion Schedule"); cy.clickOptionWithTestId("ingestion-schedule-next-button"); - cy.waitTextVisible("Give this ingestion source a name."); + cy.waitTextVisible("Give this data source a name"); cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); cy.clickOptionWithTestId("ingestion-source-save-button"); cy.waitTextVisible("Successfully created ingestion source!").wait(5000); @@ -47,7 +47,7 @@ describe("ingestion source creation flow", () => { cy.get('[data-testid="ingestion-source-table-edit-button"]') .first() .click(); - cy.waitTextVisible("Edit Ingestion 
Source"); + cy.waitTextVisible("Edit Data Source"); cy.get("#account_id").should("have.value", accound_id); cy.get("#warehouse").should("have.value", warehouse_id); cy.get("#username").should("have.value", username); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js index d23b0ca7523b8a..d01c762401c2e6 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js @@ -1,40 +1,44 @@ -function readyToTypeEditor() { - return cy.get(".monaco-editor textarea:first").click().focused(); -} - -describe("run managed ingestion", () => { - it("create run managed ingestion source", () => { - const number = Math.floor(Math.random() * 100000); - const testName = `cypress test source ${number}`; - const cli_version = "0.12.0"; - cy.login(); - cy.goToIngestionPage(); - cy.clickOptionWithText("Create new source"); - cy.clickOptionWithTextToScrollintoView("Other"); - - cy.waitTextVisible("source-type"); - readyToTypeEditor().type("{ctrl}a").clear(); - readyToTypeEditor().type("source:{enter}"); - readyToTypeEditor().type(" type: demo-data"); - readyToTypeEditor().type("{enter}"); - // no space because the editor starts new line at same indentation - readyToTypeEditor().type("config: {}"); - cy.clickOptionWithText("Next"); - cy.clickOptionWithText("Next"); - - cy.enterTextInTestId("source-name-input", testName); - cy.clickOptionWithText("Advanced"); - cy.enterTextInTestId("cli-version-input", cli_version); - cy.clickOptionWithTextToScrollintoView("Save & Run"); - cy.waitTextVisible(testName); - - cy.contains(testName) - .parent() - .within(() => { - cy.contains("Succeeded", { timeout: 180000 }); - cy.clickOptionWithTestId("delete-button"); - }); - cy.clickOptionWithText("Yes"); - cy.ensureTextNotPresent(testName); - }); -}); +// TODO: Investigate why this test can never pass on CI, but passes locally after PR #21465 +// +// function readyToTypeEditor() { +// return cy.get(".monaco-editor textarea:first").click().focused(); +// } +// +// describe("run managed ingestion", () => { +// it("create run managed ingestion source", () => { +// const number = Math.floor(Math.random() * 100000); +// const testName = `cypress test source ${number}`; +// const cli_version = "0.12.0"; +// cy.login(); +// cy.goToIngestionPage(); +// cy.clickOptionWithText("Create new source"); +// cy.clickOptionWithTextToScrollintoView("Other"); +// +// cy.waitTextVisible("source-type"); +// cy.wait(10000); // waits for 5 seconds +// +// readyToTypeEditor().type("{ctrl}a").clear({ force: true }); +// readyToTypeEditor().type("source:{enter}", { force: true }); +// readyToTypeEditor().type(" type: demo-data", { force: true }); +// readyToTypeEditor().type("{enter}", { force: true }); +// // no space because the editor starts new line at same indentation +// readyToTypeEditor().type("config: {}", { force: true }); +// cy.clickOptionWithText("Next"); +// cy.clickOptionWithText("Next"); +// +// cy.enterTextInTestId("source-name-input", testName); +// cy.clickOptionWithText("Advanced"); +// cy.enterTextInTestId("cli-version-input", cli_version); +// cy.clickOptionWithTextToScrollintoView("Save & Run"); +// cy.waitTextVisible(testName); +// +// cy.contains(testName) +// .parent() +// .within(() => { +// cy.contains("Succeeded", { timeout: 180000 }); +// cy.clickOptionWithTestId("delete-button"); +// }); +// cy.clickOptionWithText("Yes"); +// 
cy.ensureTextNotPresent(testName); +// }); +// }); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js index 57eccc32110966..1d95c1533c93c2 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js @@ -30,7 +30,7 @@ describe("managing secrets for ingestion creation", () => { cy.goToIngestionPage(); cy.get("#ingestion-create-source").click(); cy.clickOptionWithText("Snowflake"); - cy.waitTextVisible("Snowflake Recipe"); + cy.waitTextVisible("Snowflake Details"); cy.get("#account_id").type(accound_id); cy.get("#warehouse").type(warehouse_id); cy.get("#username").type(username); @@ -41,7 +41,7 @@ describe("managing secrets for ingestion creation", () => { cy.get("button").contains("Next").click(); cy.waitTextVisible("Configure an Ingestion Schedule"); cy.get("button").contains("Next").click(); - cy.waitTextVisible("Give this ingestion source a name."); + cy.waitTextVisible("Give this data source a name"); cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); cy.get("button").contains("Save").click(); cy.waitTextVisible("Successfully created ingestion source!").wait(5000); @@ -69,7 +69,7 @@ describe("managing secrets for ingestion creation", () => { // Verify secret is not present during ingestion source creation for password dropdown cy.clickOptionWithText("Create new source"); cy.clickOptionWithText("Snowflake"); - cy.waitTextVisible("Snowflake Recipe"); + cy.waitTextVisible("Snowflake Details"); cy.get("#account_id").type(accound_id); cy.get("#warehouse").type(warehouse_id); cy.get("#username").type(username); @@ -90,7 +90,7 @@ describe("managing secrets for ingestion creation", () => { cy.get("button").contains("Next").click(); cy.waitTextVisible("Configure an Ingestion Schedule"); cy.get("button").contains("Next").click(); - cy.waitTextVisible("Give this ingestion source a name."); + cy.waitTextVisible("Give this data source a name"); cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); cy.get("button").contains("Save").click(); cy.waitTextVisible("Successfully created ingestion source!").wait(5000); // prevent issue with missing form data diff --git a/smoke-test/tests/cypress/cypress/support/commands.js b/smoke-test/tests/cypress/cypress/support/commands.js index b6aeccfeb81a51..9b5065a7bdccfc 100644 --- a/smoke-test/tests/cypress/cypress/support/commands.js +++ b/smoke-test/tests/cypress/cypress/support/commands.js @@ -107,7 +107,7 @@ Cypress.Commands.add("goToAccessTokenSettings", () => { Cypress.Commands.add("goToIngestionPage", () => { cy.visit("/ingestion"); - cy.waitTextVisible("Manage Ingestion"); + cy.waitTextVisible("Manage Data Sources"); }); Cypress.Commands.add("goToDataset", (urn, dataset_name) => { @@ -198,6 +198,21 @@ Cypress.Commands.add("clickOptionWithTextToScrollintoView", (text) => { cy.contains(text).scrollIntoView().click(); }); +Cypress.Commands.add("clickOptionInScrollView", (text, selector) => { + cy.get(selector).within(() => { + cy.contains(text).then((el) => { + // Scroll the element into view with options for better alignment + el[0].scrollIntoView({ block: "center", inline: "nearest" }); + + // Wrap the element for further chaining with Cypress commands + cy.wrap(el) + .should("be.visible") // Wait until the element is visible + .should("not.be.disabled") // Ensure the element is not disabled + .click({ force: true }); // Force 
click if necessary + }); + }); +}); + Cypress.Commands.add("deleteFromDropdown", () => { cy.openThreeDotDropdown(); cy.clickOptionWithText("Delete"); From 2e496d599af4254b7d3ebbf08143a73e48e53839 Mon Sep 17 00:00:00 2001 From: dushayntAW <158567391+dushayntAW@users.noreply.github.com> Date: Tue, 2 Jul 2024 11:59:16 +0200 Subject: [PATCH 31/33] fix(ingestion/airflow-plugin): pipeline tasks discoverable in search (#10819) --- .../datahub_listener.py | 17 +- .../integration/goldens/v2_basic_iolets.json | 22 ++ .../v2_basic_iolets_no_dag_listener.json | 22 ++ .../integration/goldens/v2_simple_dag.json | 33 +++ .../v2_simple_dag_no_dag_listener.json | 33 +++ .../goldens/v2_snowflake_operator.json | 22 ++ .../goldens/v2_sqlite_operator.json | 66 +++++ .../v2_sqlite_operator_no_dag_listener.json | 267 +++++++++++------- 8 files changed, 373 insertions(+), 109 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py index 53f1dde48cb3fd..6ef4f831522cb9 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -536,14 +536,27 @@ def on_dag_start(self, dag_run: "DagRun") -> None: ) dataflow.emit(self.emitter, callback=self._make_emit_callback()) + event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper( + entityUrn=str(dataflow.urn), aspect=StatusClass(removed=False) + ) + self.emitter.emit(event) + + for task in dag.tasks: + task_urn = builder.make_data_job_urn_with_flow( + str(dataflow.urn), task.task_id + ) + event = MetadataChangeProposalWrapper( + entityUrn=task_urn, aspect=StatusClass(removed=False) + ) + self.emitter.emit(event) + # emit tags for tag in dataflow.tags: tag_urn = builder.make_tag_urn(tag) - event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper( + event = MetadataChangeProposalWrapper( entityUrn=tag_urn, aspect=StatusClass(removed=False) ) - self.emitter.emit(event) browse_path_v2_event: MetadataChangeProposalWrapper = ( diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json index 1ff53a45abf399..e2f738d19d89a9 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json @@ -57,6 +57,28 @@ } } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json index ef55850894bf61..7b1591bdf7308b 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json +++ 
b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json @@ -57,6 +57,17 @@ } } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", @@ -72,6 +83,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json index 48f8872d3831fa..0828c5e5aa1f7c 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json @@ -58,6 +58,28 @@ } } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", @@ -102,6 +124,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json index 434f4b6eb7a09c..c7cd245cc0f02a 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json @@ -58,6 +58,17 @@ } } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", @@ -73,6 +84,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", @@ -102,6 +124,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { 
"entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json index 89464448032b12..b819379395a040 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json @@ -57,6 +57,28 @@ } } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json index b4b97bbb74fae8..e7902d165051b1 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json @@ -57,6 +57,28 @@ } } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", @@ -102,6 +124,39 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", @@ -155,6 +210,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json 
b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json index d09d4f76e3f02d..a9af068e2e4e93 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json @@ -57,6 +57,28 @@ } } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", @@ -72,6 +94,28 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", @@ -154,19 +198,6 @@ } } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetKey", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:sqlite", - "name": "public.costs", - "origin": "PROD" - } - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", @@ -191,6 +222,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", @@ -204,6 +246,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", @@ -261,23 +314,6 @@ } } }, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", - "changeType": "UPSERT", - "aspectName": "dataProcessInstanceRunEvent", - "aspect": { - "json": { - "timestampMillis": 1717180072004, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, - "status": "STARTED", - "attempt": 1 - } - } -}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", @@ -292,20 +328,19 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", "changeType": "UPSERT", - "aspectName": "operation", + "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1718780495946, + "timestampMillis": 1717180072004, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" }, - "actor": "urn:li:corpuser:airflow", - "operationType": "CREATE", - "lastUpdatedTimestamp": 1718780495946 + "status": "STARTED", + "attempt": 1 } } }, @@ -338,6 +373,24 @@ } } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "operation", + "aspect": { + "json": { + "timestampMillis": 1719864194882, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "actor": "urn:li:corpuser:airflow", + "operationType": "CREATE", + "lastUpdatedTimestamp": 1719864194882 + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", @@ -680,19 +733,6 @@ } } }, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", - "changeType": "UPSERT", - "aspectName": "dataProcessInstanceOutput", - "aspect": { - "json": { - "outputs": [ - "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" - ] - } - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", @@ -892,6 +932,19 @@ } } }, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", @@ -1039,19 +1092,6 @@ } } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetKey", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:sqlite", - "name": "public.processed_costs", - "origin": "PROD" - } - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", @@ -1192,32 +1232,6 @@ } } }, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", - "changeType": "UPSERT", - "aspectName": "dataProcessInstanceInput", - "aspect": { - "json": { - "inputs": [ - "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" - ] - } - } -}, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", - "changeType": "UPSERT", - "aspectName": "dataProcessInstanceOutput", - "aspect": { - "json": { - "outputs": [ - "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" - ] - } - } -}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", @@ -1281,24 +1295,6 @@ } } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", - "changeType": "UPSERT", - "aspectName": "operation", - "aspect": { - "json": { - "timestampMillis": 1718780501750, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, - "actor": 
"urn:li:corpuser:airflow", - "operationType": "CREATE", - "lastUpdatedTimestamp": 1718780501750 - } - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", @@ -1383,6 +1379,19 @@ } } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetKey", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:sqlite", + "name": "public.processed_costs", + "origin": "PROD" + } + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", @@ -1470,6 +1479,19 @@ } } }, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", @@ -1488,6 +1510,19 @@ } } }, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", @@ -1586,6 +1621,24 @@ } } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "operation", + "aspect": { + "json": { + "timestampMillis": 1719864203487, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "actor": "urn:li:corpuser:airflow", + "operationType": "CREATE", + "lastUpdatedTimestamp": 1719864203487 + } + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", From 640d42dc65b8a6be497af5eec12b810b36a73135 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Tue, 2 Jul 2024 15:30:05 +0530 Subject: [PATCH 32/33] feat(ingest/transformer): tags to terms transformer (#10758) Co-authored-by: Aseem Bansal --- .../docs/transformer/dataset_transformer.md | 53 ++++- metadata-ingestion/setup.py | 1 + .../src/datahub/ingestion/graph/client.py | 14 ++ .../transformer/dataset_transformer.py | 15 ++ .../ingestion/transformer/tags_to_terms.py | 145 ++++++++++++ .../tests/unit/test_transform_dataset.py | 222 +++++++++++++++++- 6 files changed, 446 insertions(+), 4 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index 772a638b6a9487..773a7e8554832d 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -10,7 +10,7 @@ The below table shows transformer which can transform aspects of entity [Dataset | `ownership` | - [Simple Add Dataset ownership](#simple-add-dataset-ownership)
- [Pattern Add Dataset ownership](#pattern-add-dataset-ownership)
- [Simple Remove Dataset Ownership](#simple-remove-dataset-ownership)
- [Extract Ownership from Tags](#extract-ownership-from-tags)
- [Clean suffix prefix from Ownership](#clean-suffix-prefix-from-ownership) | | `globalTags` | - [Simple Add Dataset globalTags ](#simple-add-dataset-globaltags)
- [Pattern Add Dataset globalTags](#pattern-add-dataset-globaltags)
- [Add Dataset globalTags](#add-dataset-globaltags) | | `browsePaths` | - [Set Dataset browsePath](#set-dataset-browsepath) | -| `glossaryTerms` | - [Simple Add Dataset glossaryTerms ](#simple-add-dataset-glossaryterms)
- [Pattern Add Dataset glossaryTerms](#pattern-add-dataset-glossaryterms) | +| `glossaryTerms` | - [Simple Add Dataset glossaryTerms ](#simple-add-dataset-glossaryterms)
- [Pattern Add Dataset glossaryTerms](#pattern-add-dataset-glossaryterms)
- [Tags to Term Mapping](#tags-to-term-mapping) | | `schemaMetadata` | - [Pattern Add Dataset Schema Field glossaryTerms](#pattern-add-dataset-schema-field-glossaryterms)
- [Pattern Add Dataset Schema Field globalTags](#pattern-add-dataset-schema-field-globaltags) | | `datasetProperties` | - [Simple Add Dataset datasetProperties](#simple-add-dataset-datasetproperties)
- [Add Dataset datasetProperties](#add-dataset-datasetproperties) | | `domains` | - [Simple Add Dataset domains](#simple-add-dataset-domains)
- [Pattern Add Dataset domains](#pattern-add-dataset-domains)
- [Domain Mapping Based on Tags](#domain-mapping-based-on-tags) | @@ -668,6 +668,57 @@ We can add glossary terms to datasets based on a regex filter. ".*example1.*": ["urn:li:glossaryTerm:Email", "urn:li:glossaryTerm:Address"] ".*example2.*": ["urn:li:glossaryTerm:PostalCode"] ``` + +## Tags to Term Mapping +### Config Details + +| Field | Required | Type | Default | Description | +|---------------|----------|--------------------|-------------|-------------------------------------------------------------------------------------------------------| +| `tags` | ✅ | List[str] | | List of tag names based on which terms will be created and associated with the dataset. | +| `semantics` | | enum | "OVERWRITE" | Determines whether to OVERWRITE or PATCH the terms associated with the dataset on DataHub GMS. | + +
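The recipe examples below list simple tag names rather than full tag URNs. As a rough sketch of why, the following snippet mirrors how the mapping logic in `tags_to_terms.py` reduces a full tag URN to the simple name matched against `tags`, and how a match becomes a glossary term URN; the local `simple_tag_name` and `make_term_urn` helpers are illustrative stand-ins for the transformer's internal string handling and `datahub.emitter.mce_builder.make_term_urn`, not a definitive implementation.

```python
def simple_tag_name(full_tag_urn: str) -> str:
    """Reduce a full tag URN to the simple name compared against the `tags` config."""
    # e.g. "urn:li:tag:snowflakedb.snowflakeschema.tag_name:tag_value" -> "tag_name"
    return full_tag_urn.split("urn:li:tag:")[-1].split(".")[-1].split(":")[0]


def make_term_urn(name: str) -> str:
    """Stand-in for datahub.emitter.mce_builder.make_term_urn."""
    return f"urn:li:glossaryTerm:{name}"


configured_tags = {"tag_name"}  # the `tags` list from the recipe
entity_tags = [
    "urn:li:tag:snowflakedb.snowflakeschema.tag_name:tag_value",  # matches
    "urn:li:tag:some_other_tag",  # does not match, so it is left untouched
]

for full_tag in entity_tags:
    name = simple_tag_name(full_tag)
    if name in configured_tags:
        print(f"{full_tag} -> {make_term_urn(name)}")
# prints: urn:li:tag:snowflakedb.snowflakeschema.tag_name:tag_value -> urn:li:glossaryTerm:tag_name
```

In the transformer itself, each matched tag is also detached from the entity via the `DataHubGraph.remove_tag` helper added in this PR.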
+ +The `tags_to_term` transformer is designed to map specific tags to glossary terms within DataHub. It takes a configuration of tags that should be translated into corresponding glossary terms. This transformer can apply these mappings to any tags found either at the column level of a dataset or at the dataset top level. + +When specifying tags in the configuration, use the tag's simple name rather than the full tag URN. + +For example, instead of using the tag URN `urn:li:tag:snowflakedb.snowflakeschema.tag_name:tag_value`, you should specify just the tag name `tag_name` in the mapping configuration. + +```yaml +transformers: + - type: "tags_to_term" + config: + semantics: OVERWRITE # OVERWRITE is the default behavior + tags: + - "tag_name" +``` + +The `tags_to_term` transformer can be configured in the following ways: + +- Add terms based on tags, however overwrite the terms available for the dataset on DataHub GMS +```yaml + transformers: + - type: "tags_to_term" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + tags: + - "example1" + - "example2" + - "example3" + ``` +- Add terms based on tags, however keep the terms available for the dataset on DataHub GMS +```yaml + transformers: + - type: "tags_to_term" + config: + semantics: PATCH + tags: + - "example1" + - "example2" + - "example3" + ``` + ## Pattern Add Dataset Schema Field glossaryTerms ### Config Details | Field | Required | Type | Default | Description | diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index e8508a6e7c827c..cd4ed37d110bd2 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -715,6 +715,7 @@ "replace_external_url = datahub.ingestion.transformer.replace_external_url:ReplaceExternalUrl", "pattern_cleanup_dataset_usage_user = datahub.ingestion.transformer.pattern_cleanup_dataset_usage_user:PatternCleanupDatasetUsageUser", "domain_mapping_based_on_tags = datahub.ingestion.transformer.dataset_domain_based_on_tags:DatasetTagDomainMapper", + "tags_to_term = datahub.ingestion.transformer.tags_to_terms:TagsToTermMapper", ], "datahub.ingestion.sink.plugins": [ "file = datahub.ingestion.sink.file:FileSink", diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 252846326b49e1..7ba412b3e772c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -1278,6 +1278,20 @@ def create_tag(self, tag_name: str) -> str: # return urn return res["createTag"] + def remove_tag(self, tag_urn: str, resource_urn: str) -> bool: + graph_query = f""" + mutation removeTag {{ + removeTag( + input: {{ + tagUrn: "{tag_urn}", + resourceUrn: "{resource_urn}" + }}) + }} + """ + + res = self.execute_graphql(query=graph_query) + return res["removeTag"] + def _assertion_result_shared(self) -> str: fragment: str = """ fragment assertionResult on AssertionResult { diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py index a78a79141e8e42..3e313ddd356be7 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py @@ -27,6 +27,16 @@ def entity_types(self) -> List[str]: return ["dataset"] +class TagTransformer(BaseTransformer, SingleAspectTransformer, metaclass=ABCMeta): + """Transformer that does transform 
sequentially on each tag.""" + + def __init__(self): + super().__init__() + + def entity_types(self) -> List[str]: + return ["dataset", "container"] + + class DatasetOwnershipTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "ownership" @@ -128,3 +138,8 @@ def aspect_name(self) -> str: class DatasetUsageStatisticsTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "datasetUsageStatistics" + + +class TagsToTermTransformer(TagTransformer, metaclass=ABCMeta): + def aspect_name(self) -> str: + return "glossaryTerms" diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py new file mode 100644 index 00000000000000..338f191c0829df --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py @@ -0,0 +1,145 @@ +from typing import List, Optional, Set, cast + +import datahub.emitter.mce_builder as builder +from datahub.configuration.common import ( + TransformerSemantics, + TransformerSemanticsConfigModel, +) +from datahub.emitter.mce_builder import Aspect, make_term_urn +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.transformer.dataset_transformer import TagsToTermTransformer +from datahub.metadata.schema_classes import ( + AuditStampClass, + GlobalTagsClass, + GlossaryTermAssociationClass, + GlossaryTermsClass, + SchemaMetadataClass, +) + + +class TagsToTermMapperConfig(TransformerSemanticsConfigModel): + tags: List[str] + + +class TagsToTermMapper(TagsToTermTransformer): + """This transformer maps specified tags to corresponding glossary terms for a dataset.""" + + def __init__(self, config: TagsToTermMapperConfig, ctx: PipelineContext): + super().__init__() + self.ctx: PipelineContext = ctx + self.config: TagsToTermMapperConfig = config + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> "TagsToTermMapper": + config = TagsToTermMapperConfig.parse_obj(config_dict) + return cls(config, ctx) + + @staticmethod + def _merge_with_server_glossary_terms( + graph: DataHubGraph, + urn: str, + glossary_terms_aspect: Optional[GlossaryTermsClass], + ) -> Optional[GlossaryTermsClass]: + if not glossary_terms_aspect or not glossary_terms_aspect.terms: + # nothing to add, no need to consult server + return None + + # Merge the transformed terms with existing server terms. + # The transformed terms takes precedence, which may change the term context. 
+ server_glossary_terms_aspect = graph.get_glossary_terms(entity_urn=urn) + if server_glossary_terms_aspect is not None: + glossary_terms_aspect.terms = list( + { + **{term.urn: term for term in server_glossary_terms_aspect.terms}, + **{term.urn: term for term in glossary_terms_aspect.terms}, + }.values() + ) + + return glossary_terms_aspect + + @staticmethod + def get_tags_from_global_tags(global_tags: Optional[GlobalTagsClass]) -> Set[str]: + """Extracts tags urn from GlobalTagsClass.""" + if not global_tags or not global_tags.tags: + return set() + + return {tag_assoc.tag for tag_assoc in global_tags.tags} + + @staticmethod + def get_tags_from_schema_metadata( + schema_metadata: Optional[SchemaMetadataClass], + ) -> Set[str]: + """Extracts globalTags from all fields in SchemaMetadataClass.""" + if not schema_metadata or not schema_metadata.fields: + return set() + tags = set() + for field in schema_metadata.fields: + if field.globalTags: + tags.update( + TagsToTermMapper.get_tags_from_global_tags(field.globalTags) + ) + return tags + + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: + + in_glossary_terms: Optional[GlossaryTermsClass] = cast( + Optional[GlossaryTermsClass], aspect + ) + + assert self.ctx.graph + in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags( + entity_urn + ) + in_schema_metadata_aspect: Optional[ + SchemaMetadataClass + ] = self.ctx.graph.get_schema_metadata(entity_urn) + + if in_global_tags_aspect is None and in_schema_metadata_aspect is None: + return cast(Aspect, in_glossary_terms) + + global_tags = TagsToTermMapper.get_tags_from_global_tags(in_global_tags_aspect) + schema_metadata_tags = TagsToTermMapper.get_tags_from_schema_metadata( + in_schema_metadata_aspect + ) + + # Combine tags from both global and schema level + combined_tags = global_tags.union(schema_metadata_tags) + + tag_set = set(self.config.tags) + terms_to_add = set() + tags_to_delete = set() + + # Check each global tag against the configured tag list and prepare terms + for full_tag in combined_tags: + tag_name = full_tag.split("urn:li:tag:")[-1].split(".")[-1].split(":")[0] + if tag_name in tag_set: + term_urn = make_term_urn(tag_name) + terms_to_add.add(term_urn) + tags_to_delete.add(full_tag) # Full URN for deletion + + if not terms_to_add: + return cast(Aspect, in_glossary_terms) # No new terms to add + + for tag_urn in tags_to_delete: + self.ctx.graph.remove_tag(tag_urn=tag_urn, resource_urn=entity_urn) + + # Initialize the Glossary Terms properly + out_glossary_terms = GlossaryTermsClass( + terms=[GlossaryTermAssociationClass(urn=term) for term in terms_to_add], + auditStamp=AuditStampClass( + time=builder.get_sys_time(), actor="urn:li:corpUser:restEmitter" + ), + ) + + if self.config.semantics == TransformerSemantics.PATCH: + patch_glossary_terms: Optional[ + GlossaryTermsClass + ] = TagsToTermMapper._merge_with_server_glossary_terms( + self.ctx.graph, entity_urn, out_glossary_terms + ) + return cast(Optional[Aspect], patch_glossary_terms) + else: + return cast(Aspect, out_glossary_terms) diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index a0deae972badb4..4170fb5bf8b678 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -70,7 +70,10 @@ from datahub.ingestion.transformer.dataset_domain_based_on_tags import ( DatasetTagDomainMapper, ) -from 
datahub.ingestion.transformer.dataset_transformer import DatasetTransformer +from datahub.ingestion.transformer.dataset_transformer import ( + DatasetTransformer, + TagTransformer, +) from datahub.ingestion.transformer.extract_dataset_tags import ExtractDatasetTags from datahub.ingestion.transformer.extract_ownership_from_tags import ( ExtractOwnersFromTagsTransformer, @@ -86,6 +89,7 @@ SimpleRemoveDatasetOwnership, ) from datahub.ingestion.transformer.replace_external_url import ReplaceExternalUrl +from datahub.ingestion.transformer.tags_to_terms import TagsToTermMapper from datahub.metadata.schema_classes import ( BrowsePathsClass, DatasetPropertiesClass, @@ -1891,12 +1895,14 @@ def test_pattern_dataset_schema_tags_transformation(mock_time): def run_dataset_transformer_pipeline( - transformer_type: Type[DatasetTransformer], + transformer_type: Type[Union[DatasetTransformer, TagTransformer]], aspect: Optional[builder.Aspect], config: dict, - pipeline_context: PipelineContext = PipelineContext(run_id="transformer_pipe_line"), + pipeline_context: Optional[PipelineContext] = None, use_mce: bool = False, ) -> List[RecordEnvelope]: + if pipeline_context is None: + pipeline_context = PipelineContext(run_id="transformer_pipe_line") transformer: DatasetTransformer = cast( DatasetTransformer, transformer_type.create(config, pipeline_context) ) @@ -3651,3 +3657,213 @@ def fake_get_tags(entity_urn: str) -> Optional[models.GlobalTagsClass]: assert len(transformed_aspect.domains) == 1 assert acryl_domain in transformed_aspect.domains assert server_domain not in transformed_aspect.domains + + +def test_tags_to_terms_transformation(mock_datahub_graph): + # Create domain URNs for the test + term_urn_example1 = builder.make_term_urn("example1") + term_urn_example2 = builder.make_term_urn("example2") + + def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: + return models.GlobalTagsClass( + tags=[ + TagAssociationClass(tag=builder.make_tag_urn("example1")), + TagAssociationClass(tag=builder.make_tag_urn("example2")), + ] + ) + + # fake the server response + def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: + return models.SchemaMetadataClass( + schemaName="customer", # not used + platform=builder.make_data_platform_urn( + "hive" + ), # important <- platform must be an urn + version=0, + # when the source system has a notion of versioning of schemas, insert this in, otherwise leave as 0 + hash="", + # when the source system has a notion of unique schemas identified via hash, include a hash, else leave it as empty string + platformSchema=models.OtherSchemaClass( + rawSchema="__insert raw schema here__" + ), + fields=[ + models.SchemaFieldClass( + fieldPath="first_name", + globalTags=models.GlobalTagsClass( + tags=[ + models.TagAssociationClass( + tag=builder.make_tag_urn("example2") + ) + ], + ), + glossaryTerms=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass( + urn=builder.make_term_urn("pii") + ) + ], + auditStamp=models.AuditStampClass._construct_with_defaults(), + ), + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="mobile_number", + glossaryTerms=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass( + urn=builder.make_term_urn("pii") + ) + ], + auditStamp=models.AuditStampClass._construct_with_defaults(), + ), + 
type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + ], + ) + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph.get_tags = fake_get_tags # type: ignore + pipeline_context.graph.get_schema_metadata = fake_schema_metadata # type: ignore + + # Configuring the transformer + config = {"tags": ["example1", "example2"]} + + # Running the transformer within a test pipeline + output = run_dataset_transformer_pipeline( + transformer_type=TagsToTermMapper, + aspect=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii")) + ], + auditStamp=models.AuditStampClass._construct_with_defaults(), + ), + config=config, + pipeline_context=pipeline_context, + ) + + # Expected results + expected_terms = [term_urn_example2, term_urn_example1] + + # Verify the output + assert len(output) == 2 # One for result and one for end of stream + terms_aspect = output[0].record.aspect + assert isinstance(terms_aspect, models.GlossaryTermsClass) + assert len(terms_aspect.terms) == len(expected_terms) + assert set(term.urn for term in terms_aspect.terms) == { + "urn:li:glossaryTerm:example1", + "urn:li:glossaryTerm:example2", + } + + +def test_tags_to_terms_with_no_matching_terms(mock_datahub_graph): + # Setup for test where no tags match the provided term mappings + def fake_get_tags_no_match(entity_urn: str) -> models.GlobalTagsClass: + return models.GlobalTagsClass( + tags=[ + TagAssociationClass(tag=builder.make_tag_urn("nonMatchingTag1")), + TagAssociationClass(tag=builder.make_tag_urn("nonMatchingTag2")), + ] + ) + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph.get_tags = fake_get_tags_no_match # type: ignore + + # No matching terms in config + config = {"tags": ["example1", "example2"]} + + # Running the transformer within a test pipeline + output = run_dataset_transformer_pipeline( + transformer_type=TagsToTermMapper, + aspect=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii")) + ], + auditStamp=models.AuditStampClass._construct_with_defaults(), + ), + config=config, + pipeline_context=pipeline_context, + ) + + # Verify the output + assert len(output) == 2 # One for result and one for end of stream + terms_aspect = output[0].record.aspect + assert isinstance(terms_aspect, models.GlossaryTermsClass) + assert len(terms_aspect.terms) == 1 + + +def test_tags_to_terms_with_missing_tags(mock_datahub_graph): + # Setup for test where no tags are present + def fake_get_no_tags(entity_urn: str) -> models.GlobalTagsClass: + return models.GlobalTagsClass(tags=[]) + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph.get_tags = fake_get_no_tags # type: ignore + + config = {"tags": ["example1", "example2"]} + + # Running the transformer with no tags + output = run_dataset_transformer_pipeline( + transformer_type=TagsToTermMapper, + aspect=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii")) + ], + auditStamp=models.AuditStampClass._construct_with_defaults(), + ), + config=config, + 
pipeline_context=pipeline_context, + ) + + # Verify that no terms are added when there are no tags + assert len(output) == 2 + terms_aspect = output[0].record.aspect + assert isinstance(terms_aspect, models.GlossaryTermsClass) + assert len(terms_aspect.terms) == 1 + + +def test_tags_to_terms_with_partial_match(mock_datahub_graph): + # Setup for partial match scenario + def fake_get_partial_match_tags(entity_urn: str) -> models.GlobalTagsClass: + return models.GlobalTagsClass( + tags=[ + TagAssociationClass( + tag=builder.make_tag_urn("example1") + ), # Should match + TagAssociationClass( + tag=builder.make_tag_urn("nonMatchingTag") + ), # No match + ] + ) + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph.get_tags = fake_get_partial_match_tags # type: ignore + + config = {"tags": ["example1"]} # Only 'example1' has a term mapped + + # Running the transformer with partial matching tags + output = run_dataset_transformer_pipeline( + transformer_type=TagsToTermMapper, + aspect=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass(urn=builder.make_term_urn("pii")) + ], + auditStamp=models.AuditStampClass._construct_with_defaults(), + ), + config=config, + pipeline_context=pipeline_context, + ) + + # Verify that only matched term is added + assert len(output) == 2 + terms_aspect = output[0].record.aspect + assert isinstance(terms_aspect, models.GlossaryTermsClass) + assert len(terms_aspect.terms) == 1 + assert terms_aspect.terms[0].urn == "urn:li:glossaryTerm:example1" From 40418d9184b0377bacf063de51f029bd91c6e17a Mon Sep 17 00:00:00 2001 From: dushayntAW <158567391+dushayntAW@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:01:43 +0200 Subject: [PATCH 33/33] fix(ingestion/unity-catalog): fixed issue with profiling with GE turned on (#10752) Co-authored-by: Aseem Bansal --- .../datahub/ingestion/source/unity/source.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 42ca9af7e8459a..b29170cb2d705d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -262,7 +262,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.report_ingestion_stage_start("Ingestion Setup") wait_on_warehouse = None - if self.config.is_profiling_enabled() or self.config.include_hive_metastore: + if self.config.include_hive_metastore: self.report.report_ingestion_stage_start("Start warehouse") # Can take several minutes, so start now and wait later wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() @@ -309,9 +309,20 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) if self.config.is_profiling_enabled(): - self.report.report_ingestion_stage_start("Wait on warehouse") - assert wait_on_warehouse - wait_on_warehouse.result() + self.report.report_ingestion_stage_start("Start warehouse") + # Need to start the warehouse again for profiling, + # as it may have been stopped after ingestion might take + # longer time to complete + wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() + if wait_on_warehouse is None: + self.report.report_failure( + "initialization", + f"SQL warehouse 
{self.config.profiling.warehouse_id} not found", + ) + return + else: + # wait until warehouse is started + wait_on_warehouse.result() self.report.report_ingestion_stage_start("Profiling") if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):