From a92230b32162dc26776210a3278eadaafaa6e08e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EA=B0=80=EC=9C=A4?= <60080153+KaYunKIM@users.noreply.github.com> Date: Thu, 14 Dec 2023 02:30:18 +0900 Subject: [PATCH 01/17] docs(ingest/tableau): add token to sink config in sample recipe (#9411) Co-authored-by: KaYunKIM Co-authored-by: Harshal Sheth --- metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml b/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml index ed6567b5889df1..a9db27bb52a233 100644 --- a/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml +++ b/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml @@ -18,3 +18,4 @@ sink: type: "datahub-rest" config: server: "http://localhost:8080" + token: token_value # optional From 3cde9549a290d2560d9eebaa4fc5a3521266a841 Mon Sep 17 00:00:00 2001 From: allizex <150264485+allizex@users.noreply.github.com> Date: Wed, 13 Dec 2023 20:26:45 +0100 Subject: [PATCH 02/17] feat(glossary): add ability to clone glossary term(name and documentation) from term profile menu (#9445) Co-authored-by: Olga Dimova <38855943+olgadimova@users.noreply.github.com> --- .../glossaryTerm/GlossaryTermEntity.tsx | 7 +++- .../CreateGlossaryEntityModal.tsx | 34 ++++++++++++++++--- .../shared/EntityDropdown/EntityDropdown.tsx | 22 ++++++++++++ .../src/app/entity/shared/types.ts | 1 + 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx index 080ee5889aec92..a6f6d9b0e28671 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx @@ -65,7 +65,12 @@ export class GlossaryTermEntity implements Entity { useEntityQuery={useGetGlossaryTermQuery as any} headerActionItems={new Set([EntityActionItem.BATCH_ADD_GLOSSARY_TERM])} headerDropdownItems={ - new Set([EntityMenuItems.UPDATE_DEPRECATION, EntityMenuItems.MOVE, EntityMenuItems.DELETE]) + new Set([ + EntityMenuItems.UPDATE_DEPRECATION, + EntityMenuItems.CLONE, + EntityMenuItems.MOVE, + EntityMenuItems.DELETE, + ]) } isNameEditable hideBrowseBar diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx index 9788d36af2c65a..d60e86b0af8ca4 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx @@ -1,8 +1,9 @@ -import React, { useState } from 'react'; +import React, { useEffect, useState } from 'react'; import styled from 'styled-components/macro'; import { EditOutlined } from '@ant-design/icons'; import { message, Button, Input, Modal, Typography, Form, Collapse } from 'antd'; import DOMPurify from 'dompurify'; +import { useHistory } from 'react-router'; import { useCreateGlossaryTermMutation, useCreateGlossaryNodeMutation, @@ -16,6 +17,7 @@ import DescriptionModal from '../components/legacy/DescriptionModal'; import { validateCustomUrnId } from '../../../shared/textUtil'; import { useGlossaryEntityData } from '../GlossaryEntityContext'; import { getGlossaryRootToUpdate, updateGlossarySidebar } from '../../../glossary/utils'; +import { getEntityPath } from 
'../containers/profile/utils'; const StyledItem = styled(Form.Item)` margin-bottom: 0; @@ -33,6 +35,7 @@ interface Props { entityType: EntityType; onClose: () => void; refetchData?: () => void; + isCloning?: boolean; } function CreateGlossaryEntityModal(props: Props) { @@ -43,15 +46,31 @@ function CreateGlossaryEntityModal(props: Props) { const entityRegistry = useEntityRegistry(); const [stagedId, setStagedId] = useState(undefined); const [stagedName, setStagedName] = useState(''); - const [selectedParentUrn, setSelectedParentUrn] = useState(entityData.urn); + const [selectedParentUrn, setSelectedParentUrn] = useState(props.isCloning ? '' : entityData.urn); const [documentation, setDocumentation] = useState(''); const [isDocumentationModalVisible, setIsDocumentationModalVisible] = useState(false); const [createButtonDisabled, setCreateButtonDisabled] = useState(true); const refetch = useRefetch(); + const history = useHistory(); const [createGlossaryTermMutation] = useCreateGlossaryTermMutation(); const [createGlossaryNodeMutation] = useCreateGlossaryNodeMutation(); + useEffect(() => { + if (props.isCloning && entityData.entityData) { + const { properties } = entityData.entityData; + + if (properties?.name) { + setStagedName(properties.name); + form.setFieldValue('name', properties.name); + } + + if (properties?.description) { + setDocumentation(properties.description); + } + } + }, [props.isCloning, entityData.entityData, form]); + function createGlossaryEntity() { const mutation = entityType === EntityType.GlossaryTerm ? createGlossaryTermMutation : createGlossaryNodeMutation; @@ -67,7 +86,7 @@ function CreateGlossaryEntityModal(props: Props) { }, }, }) - .then(() => { + .then((res) => { message.loading({ content: 'Updating...', duration: 2 }); setTimeout(() => { analytics.event({ @@ -82,12 +101,19 @@ function CreateGlossaryEntityModal(props: Props) { refetch(); if (isInGlossaryContext) { // either refresh this current glossary node or the root nodes or root terms - const nodeToUpdate = entityData?.urn || getGlossaryRootToUpdate(entityType); + const nodeToUpdate = selectedParentUrn || getGlossaryRootToUpdate(entityType); updateGlossarySidebar([nodeToUpdate], urnsToUpdate, setUrnsToUpdate); } if (refetchData) { refetchData(); } + if (props.isCloning) { + const redirectUrn = + entityType === EntityType.GlossaryTerm + ? 
res.data?.createGlossaryTerm
+                            : res.data?.createGlossaryNode;
+                    history.push(getEntityPath(entityType, redirectUrn, entityRegistry, false, false));
+                }
             }, 2000);
         })
         .catch((e) => {
diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
index 5d4f9d9f875cfe..8d7f1cca9c1cbd 100644
--- a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
+++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
@@ -9,6 +9,7 @@ import {
     LinkOutlined,
     MoreOutlined,
     PlusOutlined,
+    CopyOutlined,
 } from '@ant-design/icons';
 import { Redirect } from 'react-router';
 import { EntityType } from '../../../../types.generated';
@@ -32,6 +33,7 @@ export enum EntityMenuItems {
     ADD_TERM_GROUP,
     DELETE,
     MOVE,
+    CLONE,
 }
 
 export const MenuIcon = styled(MoreOutlined)<{ fontSize?: number }>`
@@ -107,6 +109,7 @@ function EntityDropdown(props: Props) {
 
     const [isCreateTermModalVisible, setIsCreateTermModalVisible] = useState(false);
     const [isCreateNodeModalVisible, setIsCreateNodeModalVisible] = useState(false);
+    const [isCloneEntityModalVisible, setIsCloneEntityModalVisible] = useState(false);
    const [isDeprecationModalVisible, setIsDeprecationModalVisible] = useState(false);
     const [isMoveModalVisible, setIsMoveModalVisible] = useState(false);
 
@@ -230,6 +233,17 @@ function EntityDropdown(props: Props) {
                         </StyledMenuItem>
                     )}
+                    {menuItems.has(EntityMenuItems.CLONE) && (
+                        <StyledMenuItem
+                            key="clone"
+                            onClick={() => setIsCloneEntityModalVisible(true)}
+                        >
+                            <CopyOutlined />
+                            &nbsp;Clone
+                        </StyledMenuItem>
+                    )}
                 </Menu>
             }
             trigger={['click']}
@@ -250,6 +264,14 @@ function EntityDropdown(props: Props) {
                     refetchData={refetchForNodes}
                 />
             )}
+            {isCloneEntityModalVisible && (
+                <CreateGlossaryEntityModal
+                    entityType={entityType}
+                    onClose={() => setIsCloneEntityModalVisible(false)}
+                    refetchData={entityType === EntityType.GlossaryTerm ? refetchForTerms : refetchForNodes}
+                    isCloning
+                />
+            )}
             {isDeprecationModalVisible && (
diff --git a/datahub-web-react/src/app/entity/shared/types.ts b/datahub-web-react/src/app/entity/shared/types.ts
--- a/datahub-web-react/src/app/entity/shared/types.ts
+++ b/datahub-web-react/src/app/entity/shared/types.ts
     properties?: Maybe<{
+        name?: Maybe<string>;
         description?: Maybe<string>;
         qualifiedName?: Maybe<string>;
         sourceUrl?: Maybe<string>;

From a495d652e0e08885ce35eb3110a27853c2c05071 Mon Sep 17 00:00:00 2001
From: skrydal
Date: Wed, 13 Dec 2023 20:34:20 +0100
Subject: [PATCH 03/17] feat(ingestion): Add typeUrn handling to ownership transformers (#9370)

---
 .../docs/transformer/dataset_transformer.md   | 32 +++++++-------
 .../src/datahub/emitter/mce_builder.py        | 31 ++++++-------
 .../transformer/add_dataset_ownership.py      | 34 +++++---------
 .../tests/unit/test_pipeline.py               |  5 ++-
 .../tests/unit/test_transform_dataset.py      | 44 ++++++++++++++++++-
 5 files changed, 86 insertions(+), 60 deletions(-)

diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md
index d1a1555a3ca022..1c84a2759d23e6 100644
--- a/metadata-ingestion/docs/transformer/dataset_transformer.md
+++ b/metadata-ingestion/docs/transformer/dataset_transformer.md
@@ -55,12 +55,12 @@ transformers:
 ```
 ## Simple Add Dataset ownership
 ### Config Details
-| Field | Required | Type | Default | Description |
-|-----------------------------|----------|--------------|---------------|------------------------------------------------------------------|
-| `owner_urns` | ✅ | list[string] | | List of owner urns. |
-| `ownership_type` | | string | `DATAOWNER` | ownership type of the owners. |
-| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. |
-| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. 
| +| Field | Required | Type | Default | Description | +|--------------------|----------|--------------|-------------|---------------------------------------------------------------------| +| `owner_urns` | ✅ | list[string] | | List of owner urns. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). @@ -95,7 +95,7 @@ transformers: - "urn:li:corpuser:username1" - "urn:li:corpuser:username2" - "urn:li:corpGroup:groupname" - ownership_type: "PRODUCER" + ownership_type: "urn:li:ownershipType:__system__producer" ``` - Add owners, however overwrite the owners available for the dataset on DataHub GMS ```yaml @@ -107,7 +107,7 @@ transformers: - "urn:li:corpuser:username1" - "urn:li:corpuser:username2" - "urn:li:corpGroup:groupname" - ownership_type: "PRODUCER" + ownership_type: "urn:li:ownershipType:__system__producer" ``` - Add owners, however keep the owners available for the dataset on DataHub GMS ```yaml @@ -124,12 +124,12 @@ transformers: ## Pattern Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|-----------------------------|--------- |-----------------------|------------------|-----------------------------------------------------------------------------------------| -| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | -| `ownership_type` | | string | `DATAOWNER` | ownership type of the owners. | -| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| Field | Required | Type | Default | Description | +|--------------------|----------|----------------------|-------------|-----------------------------------------------------------------------------------------| +| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. 
@@ -158,7 +158,7 @@ The config, which we’d append to our ingestion recipe YAML, would look like th rules: ".*example1.*": ["urn:li:corpuser:username1"] ".*example2.*": ["urn:li:corpuser:username2"] - ownership_type: "PRODUCER" + ownership_type: "urn:li:ownershipType:__system__producer" ``` - Add owner, however overwrite the owners available for the dataset on DataHub GMS ```yaml @@ -170,7 +170,7 @@ The config, which we’d append to our ingestion recipe YAML, would look like th rules: ".*example1.*": ["urn:li:corpuser:username1"] ".*example2.*": ["urn:li:corpuser:username2"] - ownership_type: "PRODUCER" + ownership_type: "urn:li:ownershipType:__system__producer" ``` - Add owner, however keep the owners available for the dataset on DataHub GMS ```yaml diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 64c9ec1bb5704d..3b2c87ea25a314 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -9,12 +9,13 @@ from typing import ( TYPE_CHECKING, Any, + Iterable, List, Optional, + Tuple, Type, TypeVar, Union, - cast, get_type_hints, ) @@ -342,26 +343,20 @@ def make_ml_model_group_urn(platform: str, group_name: str, env: str) -> str: ) -def is_valid_ownership_type(ownership_type: Optional[str]) -> bool: - return ownership_type is not None and ownership_type in [ - OwnershipTypeClass.TECHNICAL_OWNER, - OwnershipTypeClass.BUSINESS_OWNER, - OwnershipTypeClass.DATA_STEWARD, - OwnershipTypeClass.NONE, - OwnershipTypeClass.DEVELOPER, - OwnershipTypeClass.DATAOWNER, - OwnershipTypeClass.DELEGATE, - OwnershipTypeClass.PRODUCER, - OwnershipTypeClass.CONSUMER, - OwnershipTypeClass.STAKEHOLDER, +def get_class_fields(_class: Type[object]) -> Iterable[str]: + return [ + f + for f in dir(_class) + if not callable(getattr(_class, f)) and not f.startswith("_") ] -def validate_ownership_type(ownership_type: Optional[str]) -> str: - if is_valid_ownership_type(ownership_type): - return cast(str, ownership_type) - else: - raise ValueError(f"Unexpected ownership type: {ownership_type}") +def validate_ownership_type(ownership_type: str) -> Tuple[str, Optional[str]]: + if ownership_type.startswith("urn:li:"): + return OwnershipTypeClass.CUSTOM, ownership_type + if ownership_type in get_class_fields(OwnershipTypeClass): + return ownership_type, None + raise ValueError(f"Unexpected ownership type: {ownership_type}") def make_lineage_mce( diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py index 71cf6cfa7e92bf..73cb8e4d6739bd 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py @@ -14,11 +14,8 @@ from datahub.ingestion.transformer.dataset_transformer import ( DatasetOwnershipTransformer, ) -from datahub.metadata.schema_classes import ( - OwnerClass, - OwnershipClass, - OwnershipTypeClass, -) +from datahub.metadata._schema_classes import OwnershipTypeClass +from datahub.metadata.schema_classes import OwnerClass, OwnershipClass class AddDatasetOwnershipConfig(TransformerSemanticsConfigModel): @@ -102,7 +99,7 @@ def transform_aspect( class DatasetOwnershipBaseConfig(TransformerSemanticsConfigModel): - ownership_type: Optional[str] = OwnershipTypeClass.DATAOWNER + ownership_type: str = OwnershipTypeClass.DATAOWNER class 
SimpleDatasetOwnershipConfig(DatasetOwnershipBaseConfig): @@ -114,11 +111,14 @@ class SimpleAddDatasetOwnership(AddDatasetOwnership): """Transformer that adds a specified set of owners to each dataset.""" def __init__(self, config: SimpleDatasetOwnershipConfig, ctx: PipelineContext): - ownership_type = builder.validate_ownership_type(config.ownership_type) + ownership_type, ownership_type_urn = builder.validate_ownership_type( + config.ownership_type + ) owners = [ OwnerClass( owner=owner, type=ownership_type, + typeUrn=ownership_type_urn, ) for owner in config.owner_urns ] @@ -147,29 +147,17 @@ class PatternDatasetOwnershipConfig(DatasetOwnershipBaseConfig): class PatternAddDatasetOwnership(AddDatasetOwnership): """Transformer that adds a specified set of owners to each dataset.""" - def getOwners( - self, - key: str, - owner_pattern: KeyValuePattern, - ownership_type: Optional[str] = None, - ) -> List[OwnerClass]: - owners = [ - OwnerClass( - owner=owner, - type=builder.validate_ownership_type(ownership_type), - ) - for owner in owner_pattern.value(key) - ] - return owners - def __init__(self, config: PatternDatasetOwnershipConfig, ctx: PipelineContext): - ownership_type = builder.validate_ownership_type(config.ownership_type) owner_pattern = config.owner_pattern + ownership_type, ownership_type_urn = builder.validate_ownership_type( + config.ownership_type + ) generic_config = AddDatasetOwnershipConfig( get_owners_to_add=lambda urn: [ OwnerClass( owner=owner, type=ownership_type, + typeUrn=ownership_type_urn, ) for owner in owner_pattern.value(urn) ], diff --git a/metadata-ingestion/tests/unit/test_pipeline.py b/metadata-ingestion/tests/unit/test_pipeline.py index 7ce78f0ab3e13a..0f3c984196a784 100644 --- a/metadata-ingestion/tests/unit/test_pipeline.py +++ b/metadata-ingestion/tests/unit/test_pipeline.py @@ -214,7 +214,10 @@ def test_run_including_registered_transformation(self): "transformers": [ { "type": "simple_add_dataset_ownership", - "config": {"owner_urns": ["urn:li:corpuser:foo"]}, + "config": { + "owner_urns": ["urn:li:corpuser:foo"], + "ownership_type": "urn:li:ownershipType:__system__technical_owner", + }, } ], "sink": {"type": "tests.test_helpers.sink_helpers.RecordingSink"}, diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index bc95451620d22f..8014df2f5c519d 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -234,7 +234,7 @@ def test_simple_dataset_ownership_transformation(mock_time): assert last_event.entityUrn == outputs[0].record.proposedSnapshot.urn assert all( [ - owner.type == models.OwnershipTypeClass.DATAOWNER + owner.type == models.OwnershipTypeClass.DATAOWNER and owner.typeUrn is None for owner in last_event.aspect.owners ] ) @@ -247,7 +247,7 @@ def test_simple_dataset_ownership_transformation(mock_time): assert len(second_ownership_aspect.owners) == 3 assert all( [ - owner.type == models.OwnershipTypeClass.DATAOWNER + owner.type == models.OwnershipTypeClass.DATAOWNER and owner.typeUrn is None for owner in second_ownership_aspect.owners ] ) @@ -293,6 +293,44 @@ def test_simple_dataset_ownership_with_type_transformation(mock_time): assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER +def test_simple_dataset_ownership_with_type_urn_transformation(mock_time): + input = make_generic_dataset() + + transformer = SimpleAddDatasetOwnership.create( + { + "owner_urns": [ + 
builder.make_user_urn("person1"), + ], + "ownership_type": "urn:li:ownershipType:__system__technical_owner", + }, + PipelineContext(run_id="test"), + ) + + output = list( + transformer.transform( + [ + RecordEnvelope(input, metadata={}), + RecordEnvelope(EndOfStream(), metadata={}), + ] + ) + ) + + assert len(output) == 3 + + # original MCE is unchanged + assert input == output[0].record + + ownership_aspect = output[1].record.aspect + + assert isinstance(ownership_aspect, OwnershipClass) + assert len(ownership_aspect.owners) == 1 + assert ownership_aspect.owners[0].type == OwnershipTypeClass.CUSTOM + assert ( + ownership_aspect.owners[0].typeUrn + == "urn:li:ownershipType:__system__technical_owner" + ) + + def _test_extract_tags(in_urn: str, regex_str: str, out_tag: str) -> None: input = make_generic_dataset(entity_urn=in_urn) transformer = ExtractDatasetTags.create( @@ -883,6 +921,7 @@ def test_pattern_dataset_ownership_transformation(mock_time): ".*example2.*": [builder.make_user_urn("person2")], } }, + "ownership_type": "DATAOWNER", }, PipelineContext(run_id="test"), ) @@ -2233,6 +2272,7 @@ def fake_ownership_class(entity_urn: str) -> models.OwnershipClass: "replace_existing": False, "semantics": TransformerSemantics.PATCH, "owner_urns": [owner2], + "ownership_type": "DATAOWNER", }, pipeline_context=pipeline_context, ) From 32d237b56f54c83bd7b8d343b04d36f53ae72d0a Mon Sep 17 00:00:00 2001 From: Arun Vasudevan <12974850+arunvasudevan@users.noreply.github.com> Date: Wed, 13 Dec 2023 16:02:21 -0600 Subject: [PATCH 04/17] fix(ingest): reduce GraphQL Logs to warning for circuit breaker (#9436) --- .../src/datahub/api/circuit_breaker/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py index 4dcf40454736b9..27317826264b85 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py @@ -1,3 +1,7 @@ +import logging + +from gql.transport.requests import log as requests_logger + from datahub.api.circuit_breaker.assertion_circuit_breaker import ( AssertionCircuitBreaker, AssertionCircuitBreakerConfig, @@ -6,3 +10,5 @@ OperationCircuitBreaker, OperationCircuitBreakerConfig, ) + +requests_logger.setLevel(logging.WARNING) From 288e458739ec15e0d294ed5c0eb54963fee01071 Mon Sep 17 00:00:00 2001 From: Salman-Apptware <101426513+Salman-Apptware@users.noreply.github.com> Date: Thu, 14 Dec 2023 06:19:05 +0530 Subject: [PATCH 05/17] refactor(ui): support Apollo caching for settings / Policies (#9442) --- .../app/permissions/policy/ManagePolicies.tsx | 194 ++------------- .../policy/_tests_/policyUtils.test.tsx | 110 +++++++++ .../src/app/permissions/policy/policyUtils.ts | 98 ++++++++ .../src/app/permissions/policy/usePolicy.ts | 227 ++++++++++++++++++ 4 files changed, 460 insertions(+), 169 deletions(-) create mode 100644 datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx create mode 100644 datahub-web-react/src/app/permissions/policy/usePolicy.ts diff --git a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx index 2f0c284fc4e8f3..72c22f3bddc2cd 100644 --- a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx +++ b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx @@ -1,5 +1,5 @@ import React, { useEffect, useMemo, useState } from 'react'; -import { Button, 
Empty, message, Modal, Pagination, Tag } from 'antd';
+import { Button, Empty, message, Pagination, Tag } from 'antd';
 import styled from 'styled-components/macro';
 import * as QueryString from 'query-string';
 import { DeleteOutlined, PlusOutlined } from '@ant-design/icons';
@@ -7,26 +7,15 @@ import { useLocation } from 'react-router';
 import PolicyBuilderModal from './PolicyBuilderModal';
 import {
     Policy,
-    PolicyUpdateInput,
     PolicyState,
-    PolicyType,
-    Maybe,
-    ResourceFilterInput,
-    PolicyMatchFilter,
-    PolicyMatchFilterInput,
-    PolicyMatchCriterionInput,
-    EntityType,
 } from '../../../types.generated';
 import { useAppConfig } from '../../useAppConfig';
 import PolicyDetailsModal from './PolicyDetailsModal';
 import {
-    useCreatePolicyMutation,
-    useDeletePolicyMutation,
     useListPoliciesQuery,
-    useUpdatePolicyMutation,
 } from '../../../graphql/policy.generated';
 import { Message } from '../../shared/Message';
-import { EMPTY_POLICY } from './policyUtils';
+import { DEFAULT_PAGE_SIZE, EMPTY_POLICY } from './policyUtils';
 import TabToolbar from '../../entity/shared/components/styled/TabToolbar';
 import { StyledTable } from '../../entity/shared/components/styled/StyledTable';
 import AvatarsGroup from '../AvatarsGroup';
@@ -37,6 +26,7 @@ import { scrollToTop } from '../../shared/searchUtils';
 import analytics, { EventType } from '../../analytics';
 import { POLICIES_CREATE_POLICY_ID, POLICIES_INTRO_ID } from '../../onboarding/config/PoliciesOnboardingConfig';
 import { OnboardingTour } from '../../onboarding/OnboardingTour';
+import { usePolicy } from './usePolicy';
 
 const SourceContainer = styled.div`
     overflow: auto;
@@ -84,58 +74,6 @@ const PageContainer = styled.span`
     overflow: auto;
 `;
 
-const DEFAULT_PAGE_SIZE = 10;
-
-type PrivilegeOptionType = {
-    type?: string;
-    name?: Maybe<string>;
-};
-
-const toFilterInput = (filter: PolicyMatchFilter): PolicyMatchFilterInput => {
-    return {
-        criteria: filter.criteria?.map((criterion): PolicyMatchCriterionInput => {
-            return {
-                field: criterion.field,
-                values: criterion.values.map((criterionValue) => criterionValue.value),
-                condition: criterion.condition,
-            };
-        }),
-    };
-};
-
-const toPolicyInput = (policy: Omit<Policy, 'urn'>): PolicyUpdateInput => {
-    let policyInput: PolicyUpdateInput = {
-        type: policy.type,
-        name: policy.name,
-        state: policy.state,
-        description: policy.description,
-        privileges: policy.privileges,
-        actors: {
-            users: policy.actors.users,
-            groups: policy.actors.groups,
-            allUsers: policy.actors.allUsers,
-            allGroups: policy.actors.allGroups,
-            resourceOwners: policy.actors.resourceOwners,
-            resourceOwnersTypes: policy.actors.resourceOwnersTypes,
-        },
-    };
-    if (policy.resources !== null && policy.resources !== undefined) {
-        let resourceFilter: ResourceFilterInput = {
-            type: policy.resources.type,
-            resources: policy.resources.resources,
-            allResources: policy.resources.allResources,
-        };
-        if (policy.resources.filter) {
-            resourceFilter = { ...resourceFilter, filter: toFilterInput(policy.resources.filter) };
-        }
-        // Add the resource filters.
-        policyInput = {
-            ...policyInput,
-            resources: resourceFilter,
-        };
-    }
-    return policyInput;
-};
 
 // TODO: Cleanup the styling.
 export const ManagePolicies = () => {
@@ -163,9 +101,7 @@ export const ManagePolicies = () => {
     const [focusPolicyUrn, setFocusPolicyUrn] = useState<string | undefined>(undefined);
     const [focusPolicy, setFocusPolicy] = useState<Omit<Policy, 'urn'>>(EMPTY_POLICY);
 
-    // Construct privileges
-    const platformPrivileges = policiesConfig?.platformPrivileges || [];
-    const resourcePrivileges = policiesConfig?.resourcePrivileges || [];
+
 
     const {
         loading: policiesLoading,
@@ -183,15 +119,6 @@ export const ManagePolicies = () => {
         fetchPolicy: (query?.length || 0) > 0 ? 'no-cache' : 'cache-first',
     });
 
-    // Any time a policy is removed, edited, or created, refetch the list.
-    const [createPolicy, { error: createPolicyError }] = useCreatePolicyMutation();
-
-    const [updatePolicy, { error: updatePolicyError }] = useUpdatePolicyMutation();
-
-    const [deletePolicy, { error: deletePolicyError }] = useDeletePolicyMutation();
-
-    const updateError = createPolicyError || updatePolicyError || deletePolicyError;
-
     const totalPolicies = policiesData?.listPolicies?.total || 0;
     const policies = useMemo(() => policiesData?.listPolicies?.policies || [], [policiesData]);
 
@@ -212,28 +139,6 @@ export const ManagePolicies = () => {
         setShowPolicyBuilderModal(false);
     };
 
-    const getPrivilegeNames = (policy: Omit<Policy, 'urn'>) => {
-        let privileges: PrivilegeOptionType[] = [];
-        if (policy?.type === PolicyType.Platform) {
-            privileges = platformPrivileges
-                .filter((platformPrivilege) => policy.privileges.includes(platformPrivilege.type))
-                .map((platformPrivilege) => {
-                    return { type: platformPrivilege.type, name: platformPrivilege.displayName };
-                });
-        } else {
-            const allResourcePriviliges = resourcePrivileges.find(
-                (resourcePrivilege) => resourcePrivilege.resourceType === 'all',
-            );
-            privileges =
-                allResourcePriviliges?.privileges
-                    .filter((resourcePrivilege) => policy.privileges.includes(resourcePrivilege.type))
-                    .map((b) => {
-                        return { type: b.type, name: b.displayName };
-                    }) || [];
-        }
-        return privileges;
-    };
-
     const onViewPolicy = (policy: Policy) => {
         setShowViewPolicyModal(true);
         setFocusPolicyUrn(policy?.urn);
@@ -247,79 +152,30 @@ export const ManagePolicies = () => {
     };
 
     const onEditPolicy = (policy: Policy) => {
-        setShowPolicyBuilderModal(true);
-        setFocusPolicyUrn(policy?.urn);
-        setFocusPolicy({ ...policy });
-    };
-
-    // On Delete Policy handler
-    const onRemovePolicy = (policy: Policy) => {
-        Modal.confirm({
-            title: `Delete ${policy?.name}`,
-            content: `Are you sure you want to remove policy?`,
-            onOk() {
-                deletePolicy({ variables: { urn: policy?.urn as string } }); // There must be a focus policy urn.
-                analytics.event({
-                    type: EventType.DeleteEntityEvent,
-                    entityUrn: policy?.urn,
-                    entityType: EntityType.DatahubPolicy,
-                });
-                message.success('Successfully removed policy.');
-                setTimeout(() => {
-                    policiesRefetch();
-                }, 3000);
-                onCancelViewPolicy();
-            },
-            onCancel() {},
-            okText: 'Yes',
-            maskClosable: true,
-            closable: true,
-        });
+        setShowPolicyBuilderModal(true);
+        setFocusPolicyUrn(policy?.urn);
+        setFocusPolicy({ ...policy });
     };
 
-    // On Activate and deactivate Policy handler
-    const onToggleActiveDuplicate = (policy: Policy) => {
-        const newState = policy?.state === PolicyState.Active ? PolicyState.Inactive : PolicyState.Active;
-        const newPolicy = {
-            ...policy,
-            state: newState,
-        };
-        updatePolicy({
-            variables: {
-                urn: policy?.urn as string, // There must be a focus policy urn.
-                input: toPolicyInput(newPolicy),
-            },
-        });
-        message.success(`Successfully ${newState === PolicyState.Active ? 
'activated' : 'deactivated'} policy.`); - setTimeout(() => { - policiesRefetch(); - }, 3000); - setShowViewPolicyModal(false); - }; - - // On Add/Update Policy handler - const onSavePolicy = (savePolicy: Omit) => { - if (focusPolicyUrn) { - // If there's an URN associated with the focused policy, then we are editing an existing policy. - updatePolicy({ variables: { urn: focusPolicyUrn, input: toPolicyInput(savePolicy) } }); - analytics.event({ - type: EventType.UpdatePolicyEvent, - policyUrn: focusPolicyUrn, - }); - } else { - // If there's no URN associated with the focused policy, then we are creating. - createPolicy({ variables: { input: toPolicyInput(savePolicy) } }); - analytics.event({ - type: EventType.CreatePolicyEvent, - }); - } - message.success('Successfully saved policy.'); - setTimeout(() => { - policiesRefetch(); - }, 3000); - onClosePolicyBuilder(); - }; + const { + createPolicyError, + updatePolicyError, + deletePolicyError, + onSavePolicy, + onToggleActiveDuplicate, + onRemovePolicy, + getPrivilegeNames + } = usePolicy( + policiesConfig, + focusPolicyUrn, + policiesRefetch, + setShowViewPolicyModal, + onCancelViewPolicy, + onClosePolicyBuilder + ); + const updateError = createPolicyError || updatePolicyError || deletePolicyError; + const tableColumns = [ { title: 'Name', diff --git a/datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx b/datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx new file mode 100644 index 00000000000000..06d2e97255139e --- /dev/null +++ b/datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx @@ -0,0 +1,110 @@ +import { + addOrUpdatePoliciesInList, + updateListPoliciesCache, + removeFromListPoliciesCache, + } from '../policyUtils'; + + // Mock the Apollo Client readQuery and writeQuery methods + const mockReadQuery = jest.fn(); + const mockWriteQuery = jest.fn(); + + jest.mock('@apollo/client', () => ({ + ...jest.requireActual('@apollo/client'), + useApolloClient: () => ({ + readQuery: mockReadQuery, + writeQuery: mockWriteQuery, + }), + })); + + describe('addOrUpdatePoliciesInList', () => { + it('should add a new policy to the list', () => { + const existingPolicies = [{ urn: 'existing-urn' }]; + const newPolicies = { urn: 'new-urn' }; + + const result = addOrUpdatePoliciesInList(existingPolicies, newPolicies); + + expect(result.length).toBe(existingPolicies.length + 1); + expect(result).toContain(newPolicies); + }); + + it('should update an existing policy in the list', () => { + const existingPolicies = [{ urn: 'existing-urn' }]; + const newPolicies = { urn: 'existing-urn', updatedField: 'new-value' }; + + const result = addOrUpdatePoliciesInList(existingPolicies, newPolicies); + + expect(result.length).toBe(existingPolicies.length); + expect(result).toContainEqual(newPolicies); + }); + }); + + describe('updateListPoliciesCache', () => { + // Mock client.readQuery response + const mockReadQueryResponse = { + listPolicies: { + start: 0, + count: 1, + total: 1, + policies: [{ urn: 'existing-urn' }], + }, + }; + + beforeEach(() => { + mockReadQuery.mockReturnValueOnce(mockReadQueryResponse); + }); + + it('should update the list policies cache with a new policy', () => { + const mockClient = { + readQuery: mockReadQuery, + writeQuery: mockWriteQuery, + }; + + const policiesToAdd = [{ urn: 'new-urn' }]; + const pageSize = 10; + + updateListPoliciesCache(mockClient, policiesToAdd, pageSize); + + // Ensure writeQuery is called with the expected data + 
expect(mockWriteQuery).toHaveBeenCalledWith({ + query: expect.any(Object), + variables: { input: { start: 0, count: pageSize, query: undefined } }, + data: expect.any(Object), + }); + }); + }); + + describe('removeFromListPoliciesCache', () => { + // Mock client.readQuery response + const mockReadQueryResponse = { + listPolicies: { + start: 0, + count: 1, + total: 1, + policies: [{ urn: 'existing-urn' }], + }, + }; + + beforeEach(() => { + mockReadQuery.mockReturnValueOnce(mockReadQueryResponse); + }); + + it('should remove a policy from the list policies cache', () => { + const mockClient = { + readQuery: mockReadQuery, + writeQuery: mockWriteQuery, + }; + + const urnToRemove = 'existing-urn'; + const pageSize = 10; + + removeFromListPoliciesCache(mockClient, urnToRemove, pageSize); + + // Ensure writeQuery is called with the expected data + expect(mockWriteQuery).toHaveBeenCalledWith({ + query: expect.any(Object), + variables: { input: { start: 0, count: pageSize } }, + data: expect.any(Object), + }); + }); + }); + \ No newline at end of file diff --git a/datahub-web-react/src/app/permissions/policy/policyUtils.ts b/datahub-web-react/src/app/permissions/policy/policyUtils.ts index 2f178fcdeb5c34..27aa8fcd351e9b 100644 --- a/datahub-web-react/src/app/permissions/policy/policyUtils.ts +++ b/datahub-web-react/src/app/permissions/policy/policyUtils.ts @@ -10,6 +10,9 @@ import { ResourceFilter, ResourcePrivileges, } from '../../../types.generated'; +import { ListPoliciesDocument, ListPoliciesQuery } from '../../../graphql/policy.generated'; + +export const DEFAULT_PAGE_SIZE = 10; export const EMPTY_POLICY = { type: PolicyType.Metadata, @@ -126,3 +129,98 @@ export const setFieldValues = ( } return { ...filter, criteria: [...restCriteria, createCriterion(resourceFieldType, fieldValues)] }; }; + +export const addOrUpdatePoliciesInList = (existingPolicies, newPolicies) => { + const policies = [...existingPolicies]; + let didUpdate = false; + const updatedPolicies = policies.map((policy) => { + if (policy.urn === newPolicies.urn) { + didUpdate = true; + return newPolicies; + } + return policy; + }); + return didUpdate ? updatedPolicies : [newPolicies, ...existingPolicies]; +}; + +/** + * Add an entry to the ListPolicies cache. + */ +export const updateListPoliciesCache = (client, policies, pageSize) => { + // Read the data from our cache for this query. + const currData: ListPoliciesQuery | null = client.readQuery({ + query: ListPoliciesDocument, + variables: { + input: { + start: 0, + count: pageSize, + query: undefined, + }, + }, + }); + + // Add our new policy into the existing list. + const existingPolicies = [...(currData?.listPolicies?.policies || [])]; + const newPolicies = addOrUpdatePoliciesInList(existingPolicies, policies); + const didAddTest = newPolicies.length > existingPolicies.length; + + // Write our data back to the cache. + client.writeQuery({ + query: ListPoliciesDocument, + variables: { + input: { + start: 0, + count: pageSize, + query: undefined, + }, + }, + data: { + + listPolicies: { + __typename: 'ListPoliciesResult', + start: 0, + count: didAddTest ? (currData?.listPolicies?.count || 0) + 1 : currData?.listPolicies?.count, + total: didAddTest ? (currData?.listPolicies?.total || 0) + 1 : currData?.listPolicies?.total, + policies: newPolicies, + }, + }, + }); +}; + +/** + * Remove an entry from the ListTests cache. + */ +export const removeFromListPoliciesCache = (client, urn, pageSize) => { + // Read the data from our cache for this query. 
+    const currData: ListPoliciesQuery | null = client.readQuery({
+        query: ListPoliciesDocument,
+        variables: {
+            input: {
+                start: 0,
+                count: pageSize,
+            },
+        },
+    });
+
+    // Remove the policy from the existing tests set.
+    const newPolicies = [...(currData?.listPolicies?.policies || []).filter((policy) => policy.urn !== urn)];
+
+    // Write our data back to the cache.
+    client.writeQuery({
+        query: ListPoliciesDocument,
+        variables: {
+            input: {
+                start: 0,
+                count: pageSize,
+            },
+        },
+        data: {
+            listPolicies: {
+                start: currData?.listPolicies?.start || 0,
+                count: (currData?.listPolicies?.count || 1) - 1,
+                total: (currData?.listPolicies?.total || 1) - 1,
+                policies: newPolicies,
+            },
+        },
+    });
+};
diff --git a/datahub-web-react/src/app/permissions/policy/usePolicy.ts b/datahub-web-react/src/app/permissions/policy/usePolicy.ts
new file mode 100644
index 00000000000000..6f359805e42db1
--- /dev/null
+++ b/datahub-web-react/src/app/permissions/policy/usePolicy.ts
@@ -0,0 +1,227 @@
+import { Modal, message } from 'antd';
+import { useApolloClient } from '@apollo/client';
+import {
+    EntityType,
+    Policy,
+    PolicyMatchCriterionInput,
+    PolicyMatchFilter,
+    PolicyMatchFilterInput,
+    PolicyState,
+    PolicyType,
+    Maybe,
+    PolicyUpdateInput,
+    ResourceFilterInput,
+} from '../../../types.generated';
+import { useCreatePolicyMutation, useDeletePolicyMutation, useUpdatePolicyMutation } from '../../../graphql/policy.generated';
+import analytics, { EventType } from '../../analytics';
+import { DEFAULT_PAGE_SIZE, removeFromListPoliciesCache, updateListPoliciesCache } from './policyUtils';
+
+
+type PrivilegeOptionType = {
+    type?: string;
+    name?: Maybe<string>;
+};
+
+export function usePolicy(
+    policiesConfig,
+    focusPolicyUrn,
+    policiesRefetch,
+    setShowViewPolicyModal,
+    onCancelViewPolicy,
+    onClosePolicyBuilder
+){
+
+    const client = useApolloClient();
+
+    // Construct privileges
+    const platformPrivileges = policiesConfig?.platformPrivileges || [];
+    const resourcePrivileges = policiesConfig?.resourcePrivileges || [];
+
+    // Any time a policy is removed, edited, or created, refetch the list.
+    const [createPolicy, { error: createPolicyError }] = useCreatePolicyMutation();
+
+    const [updatePolicy, { error: updatePolicyError }] = useUpdatePolicyMutation();
+
+    const [deletePolicy, { error: deletePolicyError }] = useDeletePolicyMutation();
+
+    const toFilterInput = (filter: PolicyMatchFilter): PolicyMatchFilterInput => {
+        return {
+            criteria: filter.criteria?.map((criterion): PolicyMatchCriterionInput => {
+                return {
+                    field: criterion.field,
+                    values: criterion.values.map((criterionValue) => criterionValue.value),
+                    condition: criterion.condition,
+                };
+            }),
+        };
+    };
+
+    const toPolicyInput = (policy: Omit<Policy, 'urn'>): PolicyUpdateInput => {
+        let policyInput: PolicyUpdateInput = {
+            type: policy.type,
+            name: policy.name,
+            state: policy.state,
+            description: policy.description,
+            privileges: policy.privileges,
+            actors: {
+                users: policy.actors.users,
+                groups: policy.actors.groups,
+                allUsers: policy.actors.allUsers,
+                allGroups: policy.actors.allGroups,
+                resourceOwners: policy.actors.resourceOwners,
+                resourceOwnersTypes: policy.actors.resourceOwnersTypes,
+            },
+        };
+        if (policy.resources !== null && policy.resources !== undefined) {
+            let resourceFilter: ResourceFilterInput = {
+                type: policy.resources.type,
+                resources: policy.resources.resources,
+                allResources: policy.resources.allResources,
+            };
+            if (policy.resources.filter) {
+                resourceFilter = { ...resourceFilter, filter: toFilterInput(policy.resources.filter) };
+            }
+            // Add the resource filters.
+            policyInput = {
+                ...policyInput,
+                resources: resourceFilter,
+            };
+        }
+        return policyInput;
+    };
+
+    const getPrivilegeNames = (policy: Omit<Policy, 'urn'>) => {
+        let privileges: PrivilegeOptionType[] = [];
+        if (policy?.type === PolicyType.Platform) {
+            privileges = platformPrivileges
+                .filter((platformPrivilege) => policy.privileges.includes(platformPrivilege.type))
+                .map((platformPrivilege) => {
+                    return { type: platformPrivilege.type, name: platformPrivilege.displayName };
+                });
+        } else {
+            const allResourcePriviliges = resourcePrivileges.find(
+                (resourcePrivilege) => resourcePrivilege.resourceType === 'all',
+            );
+            privileges =
+                allResourcePriviliges?.privileges
+                    .filter((resourcePrivilege) => policy.privileges.includes(resourcePrivilege.type))
+                    .map((b) => {
+                        return { type: b.type, name: b.displayName };
+                    }) || [];
+        }
+        return privileges;
+    };
+
+    // On Delete Policy handler
+    const onRemovePolicy = (policy: Policy) => {
+        Modal.confirm({
+            title: `Delete ${policy?.name}`,
+            content: `Are you sure you want to remove policy?`,
+            onOk() {
+                deletePolicy({ variables: { urn: policy?.urn as string } })
+                .then(()=>{
+                    // There must be a focus policy urn.
+                    analytics.event({
+                        type: EventType.DeleteEntityEvent,
+                        entityUrn: policy?.urn,
+                        entityType: EntityType.DatahubPolicy,
+                    });
+                    message.success('Successfully removed policy.');
+                    removeFromListPoliciesCache(client,policy?.urn, DEFAULT_PAGE_SIZE);
+                    setTimeout(() => {
+                        policiesRefetch();
+                    }, 3000);
+                    onCancelViewPolicy();
+                })
+            },
+            onCancel() {},
+            okText: 'Yes',
+            maskClosable: true,
+            closable: true,
+        });
+    };
+
+    // On Activate and deactivate Policy handler
+    const onToggleActiveDuplicate = (policy: Policy) => {
+        const newState = policy?.state === PolicyState.Active ? PolicyState.Inactive : PolicyState.Active;
+        const newPolicy = {
+            ...policy,
+            state: newState,
+        };
+        updatePolicy({
+            variables: {
+                urn: policy?.urn as string, // There must be a focus policy urn.
+                input: toPolicyInput(newPolicy),
+            },
+        }).then(()=>{
+            const updatePolicies= {
+                ...newPolicy,
+                __typename: 'ListPoliciesResult',
+            }
+            updateListPoliciesCache(client,updatePolicies,DEFAULT_PAGE_SIZE);
+            message.success(`Successfully ${newState === PolicyState.Active ? 'activated' : 'deactivated'} policy.`);
+            setTimeout(() => {
+                policiesRefetch();
+            }, 3000);
+        })
+
+        setShowViewPolicyModal(false);
+    };
+
+    // On Add/Update Policy handler
+    const onSavePolicy = (savePolicy: Omit<Policy, 'urn'>) => {
+        if (focusPolicyUrn) {
+            // If there's an URN associated with the focused policy, then we are editing an existing policy.
+            updatePolicy({ variables: { urn: focusPolicyUrn, input: toPolicyInput(savePolicy) } })
+            .then(()=>{
+                const newPolicy = {
+                    __typename: 'ListPoliciesResult',
+                    urn: focusPolicyUrn,
+                    ...savePolicy,
+                };
+                analytics.event({
+                    type: EventType.UpdatePolicyEvent,
+                    policyUrn: focusPolicyUrn,
+                });
+                message.success('Successfully saved policy.');
+                updateListPoliciesCache(client,newPolicy,DEFAULT_PAGE_SIZE);
+                setTimeout(() => {
+                    policiesRefetch();
+                }, 1000);
+                onClosePolicyBuilder();
+            })
+        } else {
+            // If there's no URN associated with the focused policy, then we are creating.
+            createPolicy({ variables: { input: toPolicyInput(savePolicy) } })
+            .then((result)=>{
+                const newPolicy = {
+                    __typename: 'ListPoliciesResult',
+                    urn: result?.data?.createPolicy,
+                    ...savePolicy,
+                    type: null,
+                    actors: null,
+                    resources: null,
+                };
+                analytics.event({
+                    type: EventType.CreatePolicyEvent,
+                });
+                message.success('Successfully saved policy.');
+                setTimeout(() => {
+                    policiesRefetch();
+                }, 1000);
+                updateListPoliciesCache(client,newPolicy,DEFAULT_PAGE_SIZE);
+                onClosePolicyBuilder();
+            })
+        }
+    };
+
+    return{
+        createPolicyError,
+        updatePolicyError,
+        deletePolicyError,
+        onSavePolicy,
+        onToggleActiveDuplicate,
+        onRemovePolicy,
+        getPrivilegeNames,
+    }
+}
\ No newline at end of file

From b87f9774ae646180675023196871f5965a5d97c3 Mon Sep 17 00:00:00 2001
From: Sumit Patil <91715217+sumitappt@users.noreply.github.com>
Date: Thu, 14 Dec 2023 06:41:30 +0530
Subject: [PATCH 06/17] =?UTF-8?q?refactor=20|=20PRD-785=20|=20datahub=20os?=
 =?UTF-8?q?s:=20migrate=20use=20of=20useGetAuthenticatedU=E2=80=A6=20(#945?=
 =?UTF-8?q?6)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: John Joyce
---
 datahub-web-react/src/app/AdminConsole.tsx       | 8 ++++----
 datahub-web-react/src/app/embed/EmbeddedPage.tsx | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/datahub-web-react/src/app/AdminConsole.tsx b/datahub-web-react/src/app/AdminConsole.tsx
index 8b14ca35763d10..f6395a3bd3cb8a 100644
--- a/datahub-web-react/src/app/AdminConsole.tsx
+++ b/datahub-web-react/src/app/AdminConsole.tsx
@@ -4,9 +4,9 @@ import { Menu } from 'antd';
 import styled from 'styled-components';
 import { BankOutlined, BarChartOutlined, MenuOutlined } from '@ant-design/icons';
 import Sider from 'antd/lib/layout/Sider';
-import { useGetAuthenticatedUser } from './useGetAuthenticatedUser';
 import { useAppConfig } from './useAppConfig';
 import { ANTD_GRAY } from './entity/shared/constants';
+import { useUserContext } from './context/useUserContext';
 
 const ToggleContainer = styled.div`
     background-color: ${ANTD_GRAY[4]};
@@ -32,7 +32,7 @@ const ControlSlideOut = styled(Sider)`
  * Container for all views behind an authentication wall.
*/ export const AdminConsole = (): JSX.Element => { - const me = useGetAuthenticatedUser(); + const me = useUserContext(); const [adminConsoleOpen, setAdminConsoleOpen] = useState(false); const { config } = useAppConfig(); @@ -40,8 +40,8 @@ export const AdminConsole = (): JSX.Element => { const isAnalyticsEnabled = config?.analyticsConfig.enabled; const isPoliciesEnabled = config?.policiesConfig.enabled; - const showAnalytics = (isAnalyticsEnabled && me && me.platformPrivileges.viewAnalytics) || false; - const showPolicyBuilder = (isPoliciesEnabled && me && me.platformPrivileges.managePolicies) || false; + const showAnalytics = (isAnalyticsEnabled && me && me?.platformPrivileges?.viewAnalytics) || false; + const showPolicyBuilder = (isPoliciesEnabled && me && me?.platformPrivileges?.managePolicies) || false; const showAdminConsole = showAnalytics || showPolicyBuilder; const onMenuItemClick = () => { diff --git a/datahub-web-react/src/app/embed/EmbeddedPage.tsx b/datahub-web-react/src/app/embed/EmbeddedPage.tsx index 429f83f34af6e8..603a72675c4337 100644 --- a/datahub-web-react/src/app/embed/EmbeddedPage.tsx +++ b/datahub-web-react/src/app/embed/EmbeddedPage.tsx @@ -8,9 +8,9 @@ import { VIEW_ENTITY_PAGE } from '../entity/shared/constants'; import { decodeUrn } from '../entity/shared/utils'; import CompactContext from '../shared/CompactContext'; import { useEntityRegistry } from '../useEntityRegistry'; -import { useGetAuthenticatedUserUrn } from '../useGetAuthenticatedUser'; import analytics from '../analytics/analytics'; import { EventType } from '../analytics'; +import { useUserContext } from '../context/useUserContext'; const EmbeddedPageWrapper = styled.div` max-height: 100%; @@ -39,11 +39,11 @@ export default function EmbeddedPage({ entityType }: Props) { }); }, [entityType, urn]); - const authenticatedUserUrn = useGetAuthenticatedUserUrn(); + const { urn : authenticatedUserUrn } = useUserContext(); const { data } = useGetGrantedPrivilegesQuery({ variables: { input: { - actorUrn: authenticatedUserUrn, + actorUrn: authenticatedUserUrn as string, resourceSpec: { resourceType: entityType, resourceUrn: urn }, }, }, From ff0570edacdd967d8fef23ac3333ccc93e50e406 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Wed, 13 Dec 2023 17:12:48 -0800 Subject: [PATCH 07/17] refactor(ui): Minor improvements & refactoring (#9420) --- .../search/EmbeddedListSearchResults.tsx | 6 +- .../src/app/lineage/LineageLoadingSection.tsx | 5 +- datahub-web-react/src/graphql/domain.graphql | 4 +- datahub-web-react/src/graphql/lineage.graphql | 167 ++++++++++++------ datahub-web-react/src/graphql/query.graphql | 10 ++ .../com/linkedin/query/QueryProperties.pdl | 7 +- 6 files changed, 139 insertions(+), 60 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx index 1daf2a4c59b70f..80fc2aa223fdf5 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { Pagination, Typography } from 'antd'; +import { Pagination, Spin, Typography } from 'antd'; import { LoadingOutlined } from '@ant-design/icons'; import styled from 'styled-components'; import { FacetFilterInput, FacetMetadata, SearchResults as SearchResultType } from '../../../../../../types.generated'; @@ 
-61,7 +61,7 @@ const LoadingContainer = styled.div`
 `;
 
 const StyledLoading = styled(LoadingOutlined)`
-    font-size: 36px;
+    font-size: 32px;
     color: ${ANTD_GRAY[7]};
     padding-bottom: 18px;
 ]`;
@@ -128,7 +128,7 @@ export const EmbeddedListSearchResults = ({
             {loading && (
                 <LoadingContainer>
-                    <StyledLoading />
+                    <Spin indicator={<StyledLoading />} />
                 </LoadingContainer>
             )}
             {!loading && (
diff --git a/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx b/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx
index 9d84de0c211729..3b7f0e48ecdf4c 100644
--- a/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx
+++ b/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx
@@ -1,5 +1,6 @@
 import * as React from 'react';
 import styled from 'styled-components';
+import { Spin } from 'antd';
 import { LoadingOutlined } from '@ant-design/icons';
 import { ANTD_GRAY } from '../entity/shared/constants';
 
@@ -13,7 +14,7 @@ const Container = styled.div`
 `;
 
 const StyledLoading = styled(LoadingOutlined)`
-    font-size: 36px;
+    font-size: 32px;
     color: ${ANTD_GRAY[7]};
     padding-bottom: 18px;
 ]`;
@@ -21,7 +22,7 @@ const StyledLoading = styled(LoadingOutlined)`
 export default function LineageLoadingSection() {
     return (
         <Container>
-            <StyledLoading />
+            <Spin indicator={<StyledLoading />} />
         </Container>
     );
 }
diff --git a/datahub-web-react/src/graphql/domain.graphql b/datahub-web-react/src/graphql/domain.graphql
index 951b93fcba9af1..170a5b5df476ba 100644
--- a/datahub-web-react/src/graphql/domain.graphql
+++ b/datahub-web-react/src/graphql/domain.graphql
@@ -27,9 +27,7 @@ query getDomain($urn: String!) {
                 }
             }
         }
-        children: relationships(input: { types: ["IsPartOf"], direction: INCOMING, start: 0, count: 0 }) {
-            total
-        }
+        ...domainEntitiesFields
     }
 }
diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql
index dc511ca411e8db..4e9b8aacfcfa15 100644
--- a/datahub-web-react/src/graphql/lineage.graphql
+++ b/datahub-web-react/src/graphql/lineage.graphql
@@ -164,6 +164,9 @@ fragment lineageNodeProperties on EntityWithRelationships {
             domain {
                 ...entityDomain
             }
+            parentContainers {
+                ...parentContainersFields
+            }
             ...entityDataProduct
             status {
                 removed
@@ -188,6 +191,9 @@ fragment lineageNodeProperties on EntityWithRelationships {
             ownership {
                 ...ownershipFields
             }
+            parentContainers {
+                ...parentContainersFields
+            }
             subTypes {
                 typeNames
             }
@@ -361,6 +367,60 @@ fragment partialLineageResults on EntityLineageResult {
     filtered
 }
 
+fragment entityLineage on Entity {
+    urn
+    type
+    ...lineageNodeProperties
+    ...canEditLineageFragment
+    ... on Dataset {
+        schemaMetadata(version: 0) @include(if: $showColumns) {
+            ...schemaMetadataFields
+        }
+        siblings {
+            isPrimary
+            siblings {
+                urn
+                type
+                ... on Dataset {
+                    exists
+                }
+                ...lineageNodeProperties
+            }
+        }
+    }
+    ... on Chart {
+        inputFields @include(if: $showColumns) {
+            ...inputFieldsFields
+        }
+    }
+    ... on EntityWithRelationships {
+        upstream: lineage(
+            input: {
+                direction: UPSTREAM
+                start: 0
+                count: 100
+                separateSiblings: $separateSiblings
+                startTimeMillis: $startTimeMillis
+                endTimeMillis: $endTimeMillis
+            }
+        ) @skip(if: $excludeUpstream) {
+            ...fullLineageResults
+        }
+        downstream: lineage(
+            input: {
+                direction: DOWNSTREAM
+                start: 0
+                count: 100
+                separateSiblings: $separateSiblings
+                startTimeMillis: $startTimeMillis
+                endTimeMillis: $endTimeMillis
+            }
+        ) @skip(if: $excludeDownstream) {
+            ...fullLineageResults
+        }
+    }
+}
+
 query getEntityLineage(
     $urn: String!
$separateSiblings: Boolean @@ -371,57 +431,21 @@ query getEntityLineage( $excludeDownstream: Boolean = false ) { entity(urn: $urn) { - urn - type - ...lineageNodeProperties - ...canEditLineageFragment - ... on Dataset { - schemaMetadata(version: 0) @include(if: $showColumns) { - ...schemaMetadataFields - } - siblings { - isPrimary - siblings { - urn - type - ... on Dataset { - exists - } - ...lineageNodeProperties - } - } - } - ... on Chart { - inputFields @include(if: $showColumns) { - ...inputFieldsFields - } - } - ... on EntityWithRelationships { - upstream: lineage( - input: { - direction: UPSTREAM - start: 0 - count: 100 - separateSiblings: $separateSiblings - startTimeMillis: $startTimeMillis - endTimeMillis: $endTimeMillis - } - ) @skip(if: $excludeUpstream) { - ...fullLineageResults - } - downstream: lineage( - input: { - direction: DOWNSTREAM - start: 0 - count: 100 - separateSiblings: $separateSiblings - startTimeMillis: $startTimeMillis - endTimeMillis: $endTimeMillis - } - ) @skip(if: $excludeDownstream) { - ...fullLineageResults - } - } + ...entityLineage + } +} + +query getBulkEntityLineage( + $urns: [String!]!, + $separateSiblings: Boolean + $showColumns: Boolean! + $startTimeMillis: Long + $endTimeMillis: Long + $excludeUpstream: Boolean = false + $excludeDownstream: Boolean = false +) { + entities(urns: $urns) { + ...entityLineage } } @@ -489,3 +513,44 @@ query getLineageCounts( } } } + +query getSearchAcrossLineageCounts( + $urn: String! + $excludeUpstream: Boolean = false + $excludeDownstream: Boolean = false +) { + upstreams: searchAcrossLineage( + input: { + urn: $urn + query: "*" + start: 0 + count: 10000 + filters: [{ field: "degree", value: "1", values: ["1"] }] + direction: UPSTREAM + } + ) @skip(if: $excludeUpstream) { + start + count + total + facets { + ...facetFields + } + } + downstreams: searchAcrossLineage( + input: { + urn: $urn + query: "*" + start: 0 + count: 10000 + filters: [{ field: "degree", value: "1", values: ["1"] }] + direction: DOWNSTREAM + } + ) @skip(if: $excludeDownstream) { + start + count + total + facets { + ...facetFields + } + } +} \ No newline at end of file diff --git a/datahub-web-react/src/graphql/query.graphql b/datahub-web-react/src/graphql/query.graphql index 84908b24f9ae7f..e24c12a4448b11 100644 --- a/datahub-web-react/src/graphql/query.graphql +++ b/datahub-web-react/src/graphql/query.graphql @@ -1,3 +1,13 @@ +query getQuery($urn: String!) { + entity(urn: $urn) { + urn + type + ... on QueryEntity { + ...query + } + } +} + fragment query on QueryEntity { urn properties { diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl index 3ba19d348913bf..9587775dbed3a8 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl @@ -1,6 +1,7 @@ namespace com.linkedin.query import com.linkedin.common.AuditStamp +import com.linkedin.common.Urn /** * Information about a Query against one or more data assets (e.g. Tables or Views). @@ -22,7 +23,11 @@ record QueryProperties { /** * The query was entered manually by a user (via the UI). */ - MANUAL + MANUAL, + /** + * The query was discovered by a crawler. 
+ */ + SYSTEM } /** From 70e64e80786a2112b3c77d790d9634ee17dd1d34 Mon Sep 17 00:00:00 2001 From: Seokyun Ha Date: Thu, 14 Dec 2023 18:02:37 +0900 Subject: [PATCH 08/17] feat(ingest): add ingest `--no-progress` option (#9300) --- docs/cli.md | 1 + metadata-ingestion/src/datahub/cli/ingest_cli.py | 10 ++++++++++ .../src/datahub/ingestion/run/pipeline.py | 6 +++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/cli.md b/docs/cli.md index 8845ed5a6dac78..cb5077db429061 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -98,6 +98,7 @@ Command Options: --preview-workunits The number of workunits to produce for preview --strict-warnings If enabled, ingestion runs with warnings will yield a non-zero error code --test-source-connection When set, ingestion will only test the source connection details from the recipe + --no-progress If enabled, mute intermediate progress ingestion reports ``` #### ingest --dry-run diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index b7827ec9f050b4..569a836f3ef5c2 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -97,6 +97,13 @@ def ingest() -> None: @click.option( "--no-spinner", type=bool, is_flag=True, default=False, help="Turn off spinner" ) +@click.option( + "--no-progress", + type=bool, + is_flag=True, + default=False, + help="If enabled, mute intermediate progress ingestion reports", +) @telemetry.with_telemetry( capture_kwargs=[ "dry_run", @@ -105,6 +112,7 @@ def ingest() -> None: "test_source_connection", "no_default_report", "no_spinner", + "no_progress", ] ) def run( @@ -117,6 +125,7 @@ def run( report_to: str, no_default_report: bool, no_spinner: bool, + no_progress: bool, ) -> None: """Ingest metadata into DataHub.""" @@ -170,6 +179,7 @@ async def run_ingestion_and_check_upgrade() -> int: preview_workunits, report_to, no_default_report, + no_progress, raw_pipeline_config, ) diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index f2735c24ca19dc..25e17d692109a5 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -173,6 +173,7 @@ def __init__( preview_workunits: int = 10, report_to: Optional[str] = None, no_default_report: bool = False, + no_progress: bool = False, ): self.config = config self.dry_run = dry_run @@ -180,6 +181,7 @@ def __init__( self.preview_workunits = preview_workunits self.report_to = report_to self.reporters: List[PipelineRunListener] = [] + self.no_progress = no_progress self.num_intermediate_workunits = 0 self.last_time_printed = int(time.time()) self.cli_report = CliReport() @@ -330,6 +332,7 @@ def create( preview_workunits: int = 10, report_to: Optional[str] = "datahub", no_default_report: bool = False, + no_progress: bool = False, raw_config: Optional[dict] = None, ) -> "Pipeline": config = PipelineConfig.from_dict(config_dict, raw_config) @@ -340,6 +343,7 @@ def create( preview_workunits=preview_workunits, report_to=report_to, no_default_report=no_default_report, + no_progress=no_progress, ) def _time_to_print(self) -> bool: @@ -379,7 +383,7 @@ def run(self) -> None: self.preview_workunits if self.preview_mode else None, ): try: - if self._time_to_print(): + if self._time_to_print() and not self.no_progress: self.pretty_print_summary(currently_running=True) except Exception as e: logger.warning(f"Failed to print summary 
{e}") From b0de1dc0ce7a2de221a27f12dfecea9924380ab2 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Thu, 14 Dec 2023 18:41:50 +0530 Subject: [PATCH 09/17] fix(powerbi): add access token refresh (#9405) Co-authored-by: elish7lapid Co-authored-by: treff7es --- .../ingestion/source/powerbi/config.py | 1 + .../powerbi/rest_api_wrapper/data_resolver.py | 15 +- .../tests/integration/powerbi/test_powerbi.py | 235 +++++++++++++++--- 3 files changed, 212 insertions(+), 39 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index f71afac737ca61..70786efff79a4d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -95,6 +95,7 @@ class Constant: TITLE = "title" EMBED_URL = "embedUrl" ACCESS_TOKEN = "access_token" + ACCESS_TOKEN_EXPIRY = "expires_in" IS_READ_ONLY = "isReadOnly" WEB_URL = "webUrl" ODATA_COUNT = "@odata.count" diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index c6314c212d104d..3aeffa60bc28e0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -1,6 +1,7 @@ import logging import math from abc import ABC, abstractmethod +from datetime import datetime, timedelta from time import sleep from typing import Any, Dict, List, Optional @@ -59,6 +60,7 @@ def __init__( tenant_id: str, ): self.__access_token: Optional[str] = None + self.__access_token_expiry_time: Optional[datetime] = None self.__tenant_id = tenant_id # Test connection by generating access token logger.info("Trying to connect to {}".format(self._get_authority_url())) @@ -128,7 +130,7 @@ def get_authorization_header(self): return {Constant.Authorization: self.get_access_token()} def get_access_token(self): - if self.__access_token is not None: + if self.__access_token is not None and not self._is_access_token_expired(): return self.__access_token logger.info("Generating PowerBi access token") @@ -150,11 +152,22 @@ def get_access_token(self): self.__access_token = "Bearer {}".format( auth_response.get(Constant.ACCESS_TOKEN) ) + safety_gap = 300 + self.__access_token_expiry_time = datetime.now() + timedelta( + seconds=( + max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0) + ) + ) logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}") return self.__access_token + def _is_access_token_expired(self) -> bool: + if not self.__access_token_expiry_time: + return True + return self.__access_token_expiry_time < datetime.now() + def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: """ Get the list of dashboard from PowerBi for the given workspace identifier diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index c9b0ded4337491..b2cbccf983eb0c 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,8 +1,10 @@ +import datetime import logging import re import sys from typing import Any, Dict, List, cast from unittest import mock +from unittest.mock import MagicMock import pytest from freezegun import freeze_time @@ -31,13 +33,23 @@ 
def enable_logging(): logging.getLogger().setLevel(logging.DEBUG) -def mock_msal_cca(*args, **kwargs): - class MsalClient: - def acquire_token_for_client(self, *args, **kwargs): - return { - "access_token": "dummy", - } +class MsalClient: + call_num = 0 + token: Dict[str, Any] = { + "access_token": "dummy", + } + + @staticmethod + def acquire_token_for_client(*args, **kwargs): + MsalClient.call_num += 1 + return MsalClient.token + + @staticmethod + def reset(): + MsalClient.call_num = 0 + +def mock_msal_cca(*args, **kwargs): return MsalClient() @@ -627,7 +639,13 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration -def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): +def test_powerbi_ingest( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -658,7 +676,7 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_m mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_mces.json", + output_path=f"{tmp_path}/powerbi_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) @@ -667,8 +685,12 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_m @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration def test_powerbi_platform_instance_ingest( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -711,8 +733,12 @@ def test_powerbi_platform_instance_ingest( @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration def test_powerbi_ingest_urn_lower_case( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(request_mock=requests_mock) @@ -752,8 +778,12 @@ def test_powerbi_ingest_urn_lower_case( @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration def test_override_ownership( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(request_mock=requests_mock) @@ -783,7 +813,7 @@ def test_override_ownership( mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_mces_disabled_ownership.json", + output_path=f"{tmp_path}/powerbi_mces_disabled_ownership.json", golden_path=f"{test_resources_dir}/{mce_out_file}", ) @@ -792,8 +822,13 @@ def test_override_ownership( @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration def test_scan_all_workspaces( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + 
mock_time: datetime.datetime, + requests_mock: Any, +) -> None: + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(request_mock=requests_mock) @@ -828,7 +863,7 @@ def test_scan_all_workspaces( mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_mces_scan_all_workspaces.json", + output_path=f"{tmp_path}/powerbi_mces_scan_all_workspaces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) @@ -836,7 +871,14 @@ def test_scan_all_workspaces( @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration -def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): +def test_extract_reports( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: + enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -868,7 +910,7 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_ mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_report_mces.json", + output_path=f"{tmp_path}/powerbi_report_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) @@ -876,7 +918,13 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_ @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration -def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): +def test_extract_lineage( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -925,8 +973,12 @@ def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_ @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration def test_extract_endorsements( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(request_mock=requests_mock) @@ -957,7 +1009,7 @@ def test_extract_endorsements( mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_endorsement_mces.json", + output_path=f"{tmp_path}/powerbi_endorsement_mces.json", golden_path=f"{test_resources_dir}/{mce_out_file}", ) @@ -966,8 +1018,12 @@ def test_extract_endorsements( @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration def test_admin_access_is_not_allowed( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1024,8 +1080,12 @@ def test_admin_access_is_not_allowed( @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_workspace_container( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + 
requests_mock: Any, +) -> None: enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1062,11 +1122,92 @@ def test_workspace_container( mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_container_mces.json", + output_path=f"{tmp_path}/powerbi_container_mces.json", golden_path=f"{test_resources_dir}/{mce_out_file}", ) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_access_token_expiry_with_long_expiry( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: + enable_logging() + + register_mock_api(request_mock=requests_mock) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, + }, + } + ) + + # for long expiry, the token should only be requested once. + MsalClient.token = { + "access_token": "dummy2", + "expires_in": 3600, + } + + MsalClient.reset() + pipeline.run() + # We expect the token to be requested twice (once for AdminApiResolver and one for RegularApiResolver) + assert MsalClient.call_num == 2 + + +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_access_token_expiry_with_short_expiry( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: + enable_logging() + + register_mock_api(request_mock=requests_mock) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, + }, + } + ) + + # for short expiry, the token should be requested when expires. 
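+    # (with expires_in=0, the 300s safety gap in the data resolver marks the token
+    # as already expired, so every request triggers a fresh token acquisition and
+    # call_num climbs past the initial 2.)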
+ MsalClient.token = { + "access_token": "dummy", + "expires_in": 0, + } + pipeline.run() + assert MsalClient.call_num > 2 + + def dataset_type_mapping_set_to_all_platform(pipeline: Pipeline) -> None: source_config: PowerBiDashboardSourceConfig = cast( PowerBiDashboardSource, pipeline.source @@ -1306,8 +1447,12 @@ def validate_pipeline(pipeline: Pipeline) -> None: @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration def test_reports_with_failed_page_request( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: """ Test that all reports are fetched even if a single page request fails """ @@ -1419,8 +1564,12 @@ def test_reports_with_failed_page_request( @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_independent_datasets_extraction( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1503,14 +1652,20 @@ def test_independent_datasets_extraction( mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_independent_mces.json", + output_path=f"{tmp_path}/powerbi_independent_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) -def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): +def test_cll_extraction( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1553,7 +1708,7 @@ def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_m mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_cll_mces.json", + output_path=f"{tmp_path}/powerbi_cll_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) @@ -1561,8 +1716,12 @@ def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_m @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_cll_extraction_flags( - mock_msal, pytestconfig, tmp_path, mock_time, requests_mock -): + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: register_mock_api( request_mock=requests_mock, From 9ecda6485202ce89291bd1485c861cf7be1b8741 Mon Sep 17 00:00:00 2001 From: Sumit Patil <91715217+sumitappt@users.noreply.github.com> Date: Thu, 14 Dec 2023 19:07:48 +0530 Subject: [PATCH 10/17] fix(analytics): do not ping the track endpoint before login (#9462) --- datahub-web-react/src/app/analytics/analytics.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datahub-web-react/src/app/analytics/analytics.ts b/datahub-web-react/src/app/analytics/analytics.ts index a66d76a09cf4de..468164069cfd03 100644 --- a/datahub-web-react/src/app/analytics/analytics.ts +++ b/datahub-web-react/src/app/analytics/analytics.ts @@ -30,16 +30,17 @@ export function getMergedTrackingOptions(options?: any) { export default { page: (data?: PageData, options?: 
any, callback?: (...params: any[]) => any) => {
+        const actorUrn = Cookies.get(CLIENT_AUTH_COOKIE) || undefined;
         const modifiedData = {
             ...data,
             type: EventType[EventType.PageViewEvent],
-            actorUrn: Cookies.get(CLIENT_AUTH_COOKIE) || undefined,
+            actorUrn,
             timestamp: Date.now(),
             date: new Date().toString(),
             userAgent: navigator.userAgent,
             browserId: getBrowserId(),
         };
-        if (NODE_ENV === 'test') {
+        if (NODE_ENV === 'test' || !actorUrn) {
             return null;
         }
         const trackingOptions = getMergedTrackingOptions(options);

From aac1c55a14fdf65cb51f1fd0f441d93eb7757098 Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Thu, 14 Dec 2023 21:05:06 +0530
Subject: [PATCH 11/17] feat(ingest/unity): enable hive metastore ingestion
 (#9416)

---
 metadata-ingestion/setup.py                   |    5 +-
 .../ingestion/source/bigquery_v2/bigquery.py  |    4 +
 .../ingestion/source/source_registry.py       |    9 +
 .../datahub/ingestion/source/unity/config.py  |   51 +-
 .../source/unity/hive_metastore_proxy.py      |  242 ++
 .../datahub/ingestion/source/unity/proxy.py   |   22 +
 .../ingestion/source/unity/proxy_types.py     |   38 +-
 .../datahub/ingestion/source/unity/report.py  |    4 +-
 .../datahub/ingestion/source/unity/source.py  |   64 +-
 .../unity/test_unity_catalog_ingest.py        |   77 +-
 .../unity/unity_catalog_mces_golden.json      | 2509 +++++++++---------
 .../tests/unit/test_unity_catalog_config.py   |   65 +-
 12 files changed, 1958 insertions(+), 1132 deletions(-)
 create mode 100644 metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py

diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index e894cbf043338d..5d15d7167b63e8 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -263,7 +263,8 @@
     "pyspark~=3.3.0",
     "requests",
     # Version 2.4.0 includes sqlalchemy dialect, 2.8.0 includes some bug fixes
-    "databricks-sql-connector>=2.8.0",
+    # Version 3.0.0 requires SQLAlchemy > 2.0.21
+    "databricks-sql-connector>=2.8.0,<3.0.0",
 }

 mysql = sql_common | {"pymysql>=1.0.2"}
@@ -395,6 +396,8 @@
     "powerbi-report-server": powerbi_report_server,
     "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.1"},
     "unity-catalog": databricks | sql_common | sqllineage_lib,
+    # databricks is an alias for unity-catalog and needs to be kept in sync
+    "databricks": databricks | sql_common | sqllineage_lib,
     "fivetran": snowflake_common,
 }

diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index 6959a483130106..9813945683289c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -1031,6 +1031,10 @@ def gen_dataset_urn_from_ref(self, ref: BigQueryTableRef) -> str:

     def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
+        # Below line affects HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR in global scope
+        # TODO: Refactor this such that
+        # converter = HiveColumnToAvroConverter(struct_type_separator=" ");
+        # converter.get_schema_fields_for_hive_column(...)
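+        # (BigQuery's nested-field DDL separates a field name from its type with a
+        # space, e.g. STRUCT<a INT64>, whereas Hive uses a colon.)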
HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR = " " _COMPLEX_TYPE = re.compile("^(struct|array)") last_id = -1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py index c3fbab3f9a0122..e003c658f45e8d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py @@ -14,3 +14,12 @@ "mssql-odbc", "mssql", ) + +# Use databricks as alias for unity-catalog ingestion source. +# As mentioned here - https://docs.databricks.com/en/data-governance/unity-catalog/enable-workspaces.html, +# Databricks is rolling out Unity Catalog gradually across accounts. +# TODO: Rename unity-catalog source to databricks source, once it is rolled out for all accounts +source_registry.register_alias( + "databricks", + "unity-catalog", +) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 2c567120b4850e..96971faeea69f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -129,6 +129,14 @@ class UnityCatalogSourceConfig( workspace_url: str = pydantic.Field( description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com" ) + warehouse_id: Optional[str] = pydantic.Field( + default=None, + description="SQL Warehouse id, for running queries. If not set, will use the default warehouse.", + ) + include_hive_metastore: bool = pydantic.Field( + default=False, + description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.", + ) workspace_name: Optional[str] = pydantic.Field( default=None, description="Name of the workspace. Default to deployment name present in workspace_url", @@ -254,16 +262,17 @@ class UnityCatalogSourceConfig( scheme: str = DATABRICKS - def get_sql_alchemy_url(self): + def get_sql_alchemy_url(self, database: Optional[str] = None) -> str: + uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"} + if database: + uri_opts["catalog"] = database return make_sqlalchemy_uri( scheme=self.scheme, username="token", password=self.token, at=urlparse(self.workspace_url).netloc, - db=None, - uri_opts={ - "http_path": f"/sql/1.0/warehouses/{self.profiling.warehouse_id}" - }, + db=database, + uri_opts=uri_opts, ) def is_profiling_enabled(self) -> bool: @@ -304,3 +313,35 @@ def include_metastore_warning(cls, v: bool) -> bool: logger.warning(msg) add_global_warning(msg) return v + + @pydantic.root_validator(skip_on_failure=True) + def set_warehouse_id_from_profiling(cls, values: Dict[str, Any]) -> Dict[str, Any]: + profiling: Optional[UnityCatalogProfilerConfig] = values.get("profiling") + if not values.get("warehouse_id") and profiling and profiling.warehouse_id: + values["warehouse_id"] = profiling.warehouse_id + if ( + values.get("warehouse_id") + and profiling + and profiling.warehouse_id + and values["warehouse_id"] != profiling.warehouse_id + ): + raise ValueError( + "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`." + ) + + if values.get("include_hive_metastore") and not values.get("warehouse_id"): + raise ValueError( + "When `include_hive_metastore` is set, `warehouse_id` must be set." 
+            )
+
+        if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
+            profiling.warehouse_id = values["warehouse_id"]
+
+        return values
+
+    @pydantic.validator("schema_pattern", always=True)
+    def schema_pattern_should__always_deny_information_schema(
+        cls, v: AllowDenyPattern
+    ) -> AllowDenyPattern:
+        v.deny.append(".*\\.information_schema")
+        return v
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py
new file mode 100644
index 00000000000000..99b2ff998662cb
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py
@@ -0,0 +1,242 @@
+import logging
+from datetime import datetime
+from functools import lru_cache
+from typing import Iterable, List, Optional
+
+from databricks.sdk.service.catalog import ColumnTypeName, DataSourceFormat
+from databricks.sql.types import Row
+from sqlalchemy import create_engine, inspect
+from sqlalchemy.engine.reflection import Inspector
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.ingestion.source.unity.proxy_types import (
+    Catalog,
+    Column,
+    CustomCatalogType,
+    HiveTableType,
+    Metastore,
+    Schema,
+    Table,
+)
+
+logger = logging.getLogger(__name__)
+HIVE_METASTORE = "hive_metastore"
+
+type_map = {
+    "boolean": ColumnTypeName.BOOLEAN,
+    "tinyint": ColumnTypeName.INT,
+    "smallint": ColumnTypeName.INT,
+    "int": ColumnTypeName.INT,
+    "bigint": ColumnTypeName.LONG,
+    "float": ColumnTypeName.FLOAT,
+    "double": ColumnTypeName.DOUBLE,
+    "decimal": ColumnTypeName.DECIMAL,
+    "string": ColumnTypeName.STRING,
+    "varchar": ColumnTypeName.STRING,
+    "timestamp": ColumnTypeName.TIMESTAMP,
+    "date": ColumnTypeName.DATE,
+    "binary": ColumnTypeName.BINARY,
+}
+
+
+class HiveMetastoreProxy(Closeable):
+    # TODO: Support for view lineage using SQL parsing
+    # Why not use the hive ingestion source directly here?
+    # 1. The hive ingestion source assumes a 2-level namespace hierarchy, and currently
+    #    there is no other intermediate interface except the sqlalchemy inspector
+    #    that can be used to fetch hive metadata.
+    # 2. The hive recipe for databricks (databricks+pyhive dialect) does not
+    #    readily support SQL warehouses. Also, this dialect is not actively maintained.
+    """
+    Proxy to read metadata from the hive_metastore Databricks catalog. This is required
+    as Unity Catalog APIs do not return details about this legacy metastore.
+    """
+
+    def __init__(self, sqlalchemy_url: str, options: dict) -> None:
+        try:
+            self.inspector = HiveMetastoreProxy.get_inspector(sqlalchemy_url, options)
+        except Exception:
+            # This means that there is no `hive_metastore` catalog in the Databricks workspace.
+            # Not tested, but this seems like the logical conclusion.
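+            # Re-raise so UnityCatalogSource.init_hive_metastore_proxy can warn
+            # and record hive_metastore_catalog_found=False in the report.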
+ raise + + @staticmethod + def get_inspector(sqlalchemy_url: str, options: dict) -> Inspector: + engine = create_engine(sqlalchemy_url, **options) + return inspect(engine.connect()) + + def hive_metastore_catalog(self, metastore: Optional[Metastore]) -> Catalog: + return Catalog( + id=HIVE_METASTORE, + name=HIVE_METASTORE, + comment=None, + metastore=metastore, + owner=None, + type=CustomCatalogType.HIVE_METASTORE_CATALOG, + ) + + def hive_metastore_schemas(self, catalog: Catalog) -> Iterable[Schema]: + for schema_name in self.inspector.get_schema_names(): + yield Schema( + name=schema_name, + id=f"{catalog.id}.{schema_name}", + catalog=catalog, + comment=None, + owner=None, + ) + + def hive_metastore_tables(self, schema: Schema) -> Iterable[Table]: + views = self.inspector.get_view_names(schema.name) + for table_name in views: + yield self._get_table(schema, table_name, True) + + for table_name in self.inspector.get_table_names(schema.name): + if table_name in views: + continue + yield self._get_table(schema, table_name, False) + + def _get_table(self, schema: Schema, table_name: str, is_view: bool) -> Table: + columns = self._get_columns(schema, table_name) + detailed_info = self._get_table_info(schema, table_name) + + comment = detailed_info.pop("Comment", None) + storage_location = detailed_info.pop("Location", None) + datasource_format = self._get_datasource_format( + detailed_info.pop("Provider", None) + ) + + created_at = self._get_created_at(detailed_info.pop("Created Time", None)) + + return Table( + name=table_name, + id=f"{schema.id}.{table_name}", + table_type=self._get_table_type(detailed_info.pop("Type", None)), + schema=schema, + columns=columns, + storage_location=storage_location, + data_source_format=datasource_format, + view_definition=self._get_view_definition(schema.name, table_name) + if is_view + else None, + properties=detailed_info, + owner=None, + generation=None, + created_at=created_at, + created_by=None, + updated_at=None, + updated_by=None, + table_id=f"{schema.id}.{table_name}", + comment=comment, + ) + + def _get_created_at(self, created_at: Optional[str]) -> Optional[datetime]: + return ( + datetime.strptime(created_at, "%a %b %d %H:%M:%S %Z %Y") + if created_at + else None + ) + + def _get_datasource_format( + self, provider: Optional[str] + ) -> Optional[DataSourceFormat]: + raw_format = provider + if raw_format: + try: + return DataSourceFormat(raw_format.upper()) + except Exception: + logger.debug(f"Unknown datasource format : {raw_format}") + pass + return None + + def _get_view_definition(self, schema_name: str, table_name: str) -> Optional[str]: + try: + rows = self._execute_sql( + f"SHOW CREATE TABLE `{schema_name}`.`{table_name}`" + ) + for row in rows: + return row[0] + except Exception: + logger.debug( + f"Failed to get view definition for {schema_name}.{table_name}" + ) + return None + + def _get_table_type(self, type: Optional[str]) -> HiveTableType: + if type == "EXTERNAL": + return HiveTableType.HIVE_EXTERNAL_TABLE + elif type == "MANAGED": + return HiveTableType.HIVE_MANAGED_TABLE + elif type == "VIEW": + return HiveTableType.HIVE_VIEW + else: + return HiveTableType.UNKNOWN + + def _get_table_info(self, schema: Schema, table_name: str) -> dict: + rows = self._describe_extended(schema.name, table_name) + + index = rows.index(("# Detailed Table Information", "", "")) + rows = rows[index + 1 :] + # Copied from https://github.com/acryldata/PyHive/blob/master/pyhive/sqlalchemy_hive.py#L375 + # Generate properties dictionary. 
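+        # Each remaining row is either a ("Key", "value", ...) property, a
+        # ("Heading", None, ...) section heading, or a ("", "key", "value") entry
+        # that is namespaced under the active heading.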
+ properties = {} + active_heading = None + for col_name, data_type, value in rows: + col_name = col_name.rstrip() + if col_name.startswith("# "): + continue + elif col_name == "" and data_type is None: + active_heading = None + continue + elif col_name != "" and data_type is None: + active_heading = col_name + elif col_name != "" and data_type is not None: + properties[col_name] = data_type.strip() + else: + # col_name == "", data_type is not None + prop_name = "{} {}".format(active_heading, data_type.rstrip()) + properties[prop_name] = value.rstrip() + + return properties + + def _get_columns(self, schema: Schema, table_name: str) -> List[Column]: + rows = self._describe_extended(schema.name, table_name) + + columns: List[Column] = [] + for i, row in enumerate(rows): + if i == 0 and row[0].strip() == "col_name": + continue # first row + if row[0].strip() in ( + "", + "# Partition Information", + "# Detailed Table Information", + ): + break + columns.append( + Column( + name=row[0].strip(), + id=f"{schema.id}.{table_name}.{row[0].strip()}", + type_text=row[1].strip(), + type_name=type_map.get(row[1].strip().lower()), + type_scale=None, + type_precision=None, + position=None, + nullable=None, + comment=row[2], + ) + ) + + return columns + + @lru_cache(maxsize=1) + def _describe_extended(self, schema_name: str, table_name: str) -> List[Row]: + """ + Rows are structured as shown in examples here + https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-describe-table.html#examples + """ + return self._execute_sql(f"DESCRIBE EXTENDED `{schema_name}`.`{table_name}`") + + def _execute_sql(self, sql: str) -> List[Row]: + return self.inspector.bind.execute(sql).fetchall() + + def close(self): + self.inspector.bind.close() # type:ignore diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 375c76db8e9719..13baa8b57a639d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -26,6 +26,7 @@ from databricks.sdk.service.workspace import ObjectType import datahub +from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy from datahub.ingestion.source.unity.proxy_profiling import ( UnityCatalogProxyProfilingMixin, ) @@ -33,6 +34,7 @@ ALLOWED_STATEMENT_TYPES, Catalog, Column, + CustomCatalogType, ExternalTableReference, Metastore, Notebook, @@ -87,6 +89,7 @@ def __init__( personal_access_token: str, warehouse_id: Optional[str], report: UnityCatalogReport, + hive_metastore_proxy: Optional[HiveMetastoreProxy] = None, ): self._workspace_client = WorkspaceClient( host=workspace_url, @@ -96,6 +99,7 @@ def __init__( ) self.warehouse_id = warehouse_id or "" self.report = report + self.hive_metastore_proxy = hive_metastore_proxy def check_basic_connectivity(self) -> bool: return bool(self._workspace_client.catalogs.list()) @@ -105,6 +109,9 @@ def assigned_metastore(self) -> Metastore: return self._create_metastore(response) def catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]: + if self.hive_metastore_proxy: + yield self.hive_metastore_proxy.hive_metastore_catalog(metastore) + response = self._workspace_client.catalogs.list() if not response: logger.info("Catalogs not found") @@ -122,6 +129,12 @@ def catalog( return self._create_catalog(metastore, response) def schemas(self, catalog: Catalog) -> Iterable[Schema]: + if ( + self.hive_metastore_proxy + and catalog.type == 
CustomCatalogType.HIVE_METASTORE_CATALOG + ): + yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog) + return response = self._workspace_client.schemas.list(catalog_name=catalog.name) if not response: logger.info(f"Schemas not found for catalog {catalog.id}") @@ -130,6 +143,12 @@ def schemas(self, catalog: Catalog) -> Iterable[Schema]: yield self._create_schema(catalog, schema) def tables(self, schema: Schema) -> Iterable[Table]: + if ( + self.hive_metastore_proxy + and schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG + ): + yield from self.hive_metastore_proxy.hive_metastore_tables(schema) + return with patch("databricks.sdk.service.catalog.TableInfo", TableInfoWithGeneration): response = self._workspace_client.tables.list( catalog_name=schema.catalog.name, schema_name=schema.name @@ -244,6 +263,9 @@ def list_lineages_by_column(self, table_name: str, column_name: str) -> dict: ) def table_lineage(self, table: Table, include_entity_lineage: bool) -> None: + if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG: + # Lineage is not available for Hive Metastore Tables. + return None # Lineage endpoint doesn't exists on 2.1 version try: response: dict = self.list_lineages_by_table( diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py index 315c1c0d20186f..e5951cb0fa4ffc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py @@ -4,7 +4,8 @@ import logging from dataclasses import dataclass, field from datetime import datetime -from typing import Dict, FrozenSet, List, Optional, Set +from enum import Enum +from typing import Dict, FrozenSet, List, Optional, Set, Union from databricks.sdk.service.catalog import ( CatalogType, @@ -75,6 +76,17 @@ NotebookId = int +class CustomCatalogType(Enum): + HIVE_METASTORE_CATALOG = "HIVE_METASTORE_CATALOG" + + +class HiveTableType(Enum): + HIVE_MANAGED_TABLE = "HIVE_MANAGED_TABLE" + HIVE_EXTERNAL_TABLE = "HIVE_EXTERNAL_TABLE" + HIVE_VIEW = "HIVE_VIEW" + UNKNOWN = "UNKNOWN" + + @dataclass class CommonProperty: id: str @@ -95,7 +107,7 @@ class Metastore(CommonProperty): class Catalog(CommonProperty): metastore: Optional[Metastore] owner: Optional[str] - type: CatalogType + type: Union[CatalogType, CustomCatalogType] @dataclass @@ -107,11 +119,11 @@ class Schema(CommonProperty): @dataclass class Column(CommonProperty): type_text: str - type_name: ColumnTypeName - type_precision: int - type_scale: int - position: int - nullable: bool + type_name: Optional[ColumnTypeName] + type_precision: Optional[int] + type_scale: Optional[int] + position: Optional[int] + nullable: Optional[bool] comment: Optional[str] @@ -212,11 +224,11 @@ class Table(CommonProperty): columns: List[Column] storage_location: Optional[str] data_source_format: Optional[DataSourceFormat] - table_type: TableType + table_type: Union[TableType, HiveTableType] owner: Optional[str] generation: Optional[int] - created_at: datetime - created_by: str + created_at: Optional[datetime] + created_by: Optional[str] updated_at: Optional[datetime] updated_by: Optional[str] table_id: str @@ -231,7 +243,11 @@ class Table(CommonProperty): def __post_init__(self): self.ref = TableReference.create(self) - self.is_view = self.table_type in [TableType.VIEW, TableType.MATERIALIZED_VIEW] + self.is_view = self.table_type in [ + TableType.VIEW, + TableType.MATERIALIZED_VIEW, + 
HiveTableType.HIVE_VIEW, + ] @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index 7f19b6e2103ea9..0770d9d27055c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Tuple +from typing import Optional, Tuple from datahub.ingestion.api.report import EntityFilterReport from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport @@ -16,6 +16,8 @@ class UnityCatalogReport(IngestionStageReport, ProfilingSqlReport): table_profiles: EntityFilterReport = EntityFilterReport.field(type="table profile") notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook") + hive_metastore_catalog_found: Optional[bool] = None + num_column_lineage_skipped_column_count: int = 0 num_external_upstreams_lacking_permissions: int = 0 num_external_upstreams_unsupported: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index d1940c1d576073..43c5e244393772 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -58,6 +58,10 @@ ) from datahub.ingestion.source.unity.connection_test import UnityCatalogConnectionTest from datahub.ingestion.source.unity.ge_profiler import UnityCatalogGEProfiler +from datahub.ingestion.source.unity.hive_metastore_proxy import ( + HIVE_METASTORE, + HiveMetastoreProxy, +) from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy from datahub.ingestion.source.unity.proxy_types import ( DATA_TYPE_REGISTRY, @@ -142,12 +146,17 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): self.config = config self.report: UnityCatalogReport = UnityCatalogReport() + + self.init_hive_metastore_proxy() + self.unity_catalog_api_proxy = UnityCatalogApiProxy( config.workspace_url, config.token, - config.profiling.warehouse_id, + config.warehouse_id, report=self.report, + hive_metastore_proxy=self.hive_metastore_proxy, ) + self.external_url_base = urljoin(self.config.workspace_url, "/explore/data") # Determine the platform_instance_name @@ -174,6 +183,23 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): # Global map of tables, for profiling self.tables: FileBackedDict[Table] = FileBackedDict() + def init_hive_metastore_proxy(self): + self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None + if self.config.include_hive_metastore: + try: + self.hive_metastore_proxy = HiveMetastoreProxy( + self.config.get_sql_alchemy_url(HIVE_METASTORE), self.config.options + ) + self.report.hive_metastore_catalog_found = True + except Exception as e: + logger.debug("Exception", exc_info=True) + self.warn( + logger, + HIVE_METASTORE, + f"Failed to connect to hive_metastore due to {e}", + ) + self.report.hive_metastore_catalog_found = False + @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: return UnityCatalogConnectionTest(config_dict).get_connection_test() @@ -194,7 +220,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.report_ingestion_stage_start("Ingestion Setup") wait_on_warehouse = None - if 
self.config.is_profiling_enabled(): + if self.config.is_profiling_enabled() or self.config.include_hive_metastore: self.report.report_ingestion_stage_start("Start warehouse") # Can take several minutes, so start now and wait later wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() @@ -204,6 +230,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: f"SQL warehouse {self.config.profiling.warehouse_id} not found", ) return + else: + # wait until warehouse is started + wait_on_warehouse.result() if self.config.include_ownership: self.report.report_ingestion_stage_start("Ingest service principals") @@ -678,18 +707,25 @@ def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass: custom_properties["table_type"] = table.table_type.value - custom_properties["created_by"] = table.created_by - custom_properties["created_at"] = str(table.created_at) + if table.created_by: + custom_properties["created_by"] = table.created_by if table.properties: custom_properties.update({k: str(v) for k, v in table.properties.items()}) custom_properties["table_id"] = table.table_id - custom_properties["owner"] = table.owner - custom_properties["updated_by"] = table.updated_by - custom_properties["updated_at"] = str(table.updated_at) - - created = TimeStampClass( - int(table.created_at.timestamp() * 1000), make_user_urn(table.created_by) - ) + if table.owner: + custom_properties["owner"] = table.owner + if table.updated_by: + custom_properties["updated_by"] = table.updated_by + if table.updated_at: + custom_properties["updated_at"] = str(table.updated_at) + + created: Optional[TimeStampClass] = None + if table.created_at: + custom_properties["created_at"] = str(table.created_at) + created = TimeStampClass( + int(table.created_at.timestamp() * 1000), + make_user_urn(table.created_by) if table.created_by else None, + ) last_modified = created if table.updated_at: last_modified = TimeStampClass( @@ -780,3 +816,9 @@ def _create_schema_field(column: Column) -> List[SchemaFieldClass]: description=column.comment, ) ] + + def close(self): + if self.hive_metastore_proxy: + self.hive_metastore_proxy.close() + + super().close() diff --git a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py index c43ba7eee58478..aab7630d57f460 100644 --- a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py +++ b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py @@ -3,6 +3,7 @@ from unittest.mock import patch import databricks +import pytest from databricks.sdk.service.catalog import ( CatalogInfo, GetMetastoreSummaryResponse, @@ -12,12 +13,15 @@ from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy from tests.test_helpers import mce_helpers FROZEN_TIME = "2021-12-07 07:00:00" SERVICE_PRINCIPAL_ID_1 = str(uuid.uuid4()) SERVICE_PRINCIPAL_ID_2 = str(uuid.uuid4()) +pytestmark = pytest.mark.integration_batch_1 + def register_mock_api(request_mock): api_vs_response = { @@ -215,6 +219,65 @@ def register_mock_data(workspace_client): ] +def mock_hive_sql(query): + if query == "DESCRIBE EXTENDED `bronze_kambi`.`bet`": + return [ + ("betStatusId", "bigint", None), + ("channelId", "bigint", None), + ( + "combination", + 
"struct>,eventId:bigint,eventName:string,eventStartDate:string,live:boolean,odds:double,outcomeIds:array,outcomeLabel:string,sportId:string,status:string,voidReason:string>>,payout:double,rewardExtraPayout:double,stake:double>", + None, + ), + ("", "", ""), + ("# Detailed Table Information", "", ""), + ("Catalog", "hive_metastore", ""), + ("Database", "bronze_kambi", ""), + ("Table", "bet", ""), + ("Created Time", "Wed Jun 22 05:14:56 UTC 2022", ""), + ("Last Access", "UNKNOWN", ""), + ("Created By", "Spark 3.2.1", ""), + ("Type", "MANAGED", ""), + ("Location", "dbfs:/user/hive/warehouse/bronze_kambi.db/bet", ""), + ("Provider", "delta", ""), + ("Owner", "root", ""), + ("Is_managed_location", "true", ""), + ( + "Table Properties", + "[delta.autoOptimize.autoCompact=true,delta.autoOptimize.optimizeWrite=true,delta.minReaderVersion=1,delta.minWriterVersion=2]", + "", + ), + ] + elif query == "DESCRIBE EXTENDED `bronze_kambi`.`view1`": + return [ + ("betStatusId", "bigint", None), + ("channelId", "bigint", None), + ( + "combination", + "struct>,eventId:bigint,eventName:string,eventStartDate:string,live:boolean,odds:double,outcomeIds:array,outcomeLabel:string,sportId:string,status:string,voidReason:string>>,payout:double,rewardExtraPayout:double,stake:double>", + None, + ), + ("", "", ""), + ("# Detailed Table Information", "", ""), + ("Catalog", "hive_metastore", ""), + ("Database", "bronze_kambi", ""), + ("Table", "view1", ""), + ("Created Time", "Wed Jun 22 05:14:56 UTC 2022", ""), + ("Last Access", "UNKNOWN", ""), + ("Created By", "Spark 3.2.1", ""), + ("Type", "VIEW", ""), + ("Owner", "root", ""), + ] + elif query == "SHOW CREATE TABLE `bronze_kambi`.`view1`": + return [ + ( + "CREATE VIEW `hive_metastore`.`bronze_kambi`.`view1` AS SELECT * FROM `hive_metastore`.`bronze_kambi`.`bet`", + ) + ] + + return [] + + @freeze_time(FROZEN_TIME) def test_ingestion(pytestconfig, tmp_path, requests_mock): test_resources_dir = pytestconfig.rootpath / "tests/integration/unity" @@ -223,11 +286,21 @@ def test_ingestion(pytestconfig, tmp_path, requests_mock): output_file_name = "unity_catalog_mcps.json" - with patch("databricks.sdk.WorkspaceClient") as WorkspaceClient: + with patch("databricks.sdk.WorkspaceClient") as WorkspaceClient, patch.object( + HiveMetastoreProxy, "get_inspector" + ) as get_inspector, patch.object(HiveMetastoreProxy, "_execute_sql") as execute_sql: workspace_client: mock.MagicMock = mock.MagicMock() WorkspaceClient.return_value = workspace_client register_mock_data(workspace_client) + inspector = mock.MagicMock() + inspector.get_schema_names.return_value = ["bronze_kambi"] + inspector.get_view_names.return_value = ["view1"] + inspector.get_table_names.return_value = ["bet", "view1"] + get_inspector.return_value = inspector + + execute_sql.side_effect = mock_hive_sql + config_dict: dict = { "run_id": "unity-catalog-test", "pipeline_name": "unity-catalog-test-pipeline", @@ -237,6 +310,8 @@ def test_ingestion(pytestconfig, tmp_path, requests_mock): "workspace_url": "https://dummy.cloud.databricks.com", "token": "fake", "include_ownership": True, + "include_hive_metastore": True, + "warehouse_id": "test", }, }, "sink": { diff --git a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json index d25c86a3a1f9a3..98a6615dd2b52c 100644 --- a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json +++ b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json @@ 
-114,7 +114,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -123,11 +123,10 @@ "platform": "databricks", "env": "PROD", "metastore": "acryl metastore", - "catalog": "main" + "catalog": "hive_metastore" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main", - "name": "main", - "description": "Main catalog (auto-created)" + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore", + "name": "hive_metastore" } }, "systemMetadata": { @@ -138,7 +137,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -156,10 +155,18 @@ "entityType": "container", "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "containerProperties", "aspect": { "json": { - "platform": "urn:li:dataPlatform:databricks" + "customProperties": { + "platform": "databricks", + "env": "PROD", + "metastore": "acryl metastore", + "catalog": "main" + }, + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main", + "name": "main", + "description": "Main catalog (auto-created)" } }, "systemMetadata": { @@ -170,7 +177,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -188,21 +195,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", "changeType": "UPSERT", - "aspectName": "ownership", + "aspectName": "container", "aspect": { "json": { - "owners": [ - { - "owner": "urn:li:corpuser:account users", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } + "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" } }, "systemMetadata": { @@ -213,12 +211,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + "platform": "urn:li:dataPlatform:databricks" } }, "systemMetadata": { @@ -229,7 +227,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -250,32 +248,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "databricks", - "env": "PROD", - "metastore": "acryl metastore", - "catalog": "main", - "unity_schema": "default" - }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default", - "name": "default", - "description": "Default schema (auto-created)" - } - }, - 
"systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", + "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -291,7 +264,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", + "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -307,13 +280,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", + "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Schema" + "Catalog" ] } }, @@ -325,14 +298,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", + "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { "json": { "owners": [ { - "owner": "urn:li:corpuser:abc@acryl.io", + "owner": "urn:li:corpuser:account users", "type": "DATAOWNER" } ], @@ -350,12 +323,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", + "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" + "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" } }, "systemMetadata": { @@ -366,21 +339,20 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", + "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "containerProperties", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", - "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" - }, - { - "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", - "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" - } - ] + "customProperties": { + "platform": "databricks", + "env": "PROD", + "metastore": "acryl metastore", + "catalog": "hive_metastore", + "unity_schema": "bronze_kambi" + }, + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi", + "name": "bronze_kambi" } }, "systemMetadata": { @@ -390,13 +362,13 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:5ada0a9773235325e506410c512feabb" + "removed": false } }, "systemMetadata": { @@ -406,40 +378,18 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - 
"storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896", - "data_source_format": "DELTA", - "generation": "2", - "table_type": "MANAGED", - "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", - "delta.lastCommitTimestamp": "1666185711000", - "delta.lastUpdateVersion": "1", - "delta.minReaderVersion": "1", - "delta.minWriterVersion": "2", - "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", - "owner": "account users", - "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" - }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default/quickstart_table", - "name": "quickstart_table", - "qualifiedName": "main.default.quickstart_table", - "created": { - "time": 1666185698688, - "actor": "urn:li:corpuser:abc@acryl.io" - }, - "lastModified": { - "time": 1666186049633, - "actor": "urn:li:corpuser:abc@acryl.io" - }, - "tags": [] + "path": [ + { + "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", + "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + } + ] } }, "systemMetadata": { @@ -449,14 +399,14 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Table" + "Schema" ] } }, @@ -467,55 +417,13 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", "changeType": "UPSERT", - "aspectName": "schemaMetadata", + "aspectName": "container", "aspect": { "json": { - "schemaName": "acryl_metastore.main.default.quickstart_table", - "platform": "urn:li:dataPlatform:databricks", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "columnA", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "columnB", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - } - ] + "container": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202" } }, "systemMetadata": { @@ -525,22 +433,13 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", "changeType": "UPSERT", - "aspectName": "ownership", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "owners": [ - { - "owner": "urn:li:corpuser:account users", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } + "platform": "urn:li:dataPlatform:databricks" } }, "systemMetadata": { @@ -551,7 +450,23 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -562,12 +477,8 @@ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", - "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" - }, - { - "id": "urn:li:container:5ada0a9773235325e506410c512feabb", - "urn": "urn:li:container:5ada0a9773235325e506410c512feabb" + "id": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", + "urn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202" } ] } @@ -579,22 +490,33 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)", "changeType": "UPSERT", - "aspectName": "containerProperties", + "aspectName": "datasetProperties", "aspect": { "json": { "customProperties": { - "platform": "databricks", - "env": "PROD", - "metastore": "acryl metastore", - "catalog": "main", - "unity_schema": "information_schema" + "table_type": "HIVE_VIEW", + "Catalog": "hive_metastore", + "Database": "bronze_kambi", + "Table": "view1", + "Last Access": "UNKNOWN", + "Created By": "Spark 3.2.1", + "Owner": "root", + "table_id": "hive_metastore.bronze_kambi.view1", + "created_at": "2022-06-22 05:14:56" + }, + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi/view1", + "name": "view1", + "qualifiedName": "hive_metastore.bronze_kambi.view1", + "created": { + "time": 1655874896000 }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/information_schema", - "name": "information_schema", - "description": "Information schema (auto-created)" + "lastModified": { + "time": 1655874896000 + }, + "tags": [] } }, "systemMetadata": { @@ -604,13 +526,15 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "viewProperties", "aspect": { "json": { - "removed": false + "materialized": false, + "viewLogic": "CREATE VIEW `hive_metastore`.`bronze_kambi`.`view1` AS SELECT * FROM `hive_metastore`.`bronze_kambi`.`bet`", + "viewLanguage": "SQL" } }, "systemMetadata": { @@ -621,13 +545,22 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", + "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "containerProperties", "aspect": { "json": { - "platform": "urn:li:dataPlatform:databricks" - } + "customProperties": { + "platform": "databricks", + "env": "PROD", + "metastore": "acryl 
metastore", + "catalog": "main", + "unity_schema": "default" + }, + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default", + "name": "default", + "description": "Default schema (auto-created)" + } }, "systemMetadata": { "lastObserved": 1638860400000, @@ -636,14 +569,14 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Schema" + "View" ] } }, @@ -654,49 +587,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", - "changeType": "UPSERT", - "aspectName": "ownership", - "aspect": { - "json": { - "owners": [ - { - "owner": "urn:li:corpuser:Service Principal 1", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -707,8 +599,12 @@ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", - "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" + "id": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", + "urn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202" + }, + { + "id": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", + "urn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e" } ] } @@ -720,74 +616,13 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProperties", - "aspect": { - "json": { - "customProperties": { - "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896", - "data_source_format": "DELTA", - "generation": "2", - "table_type": "MANAGED", - "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", - "delta.lastCommitTimestamp": "1666185711000", - "delta.lastUpdateVersion": "1", - 
"delta.minReaderVersion": "1", - "delta.minWriterVersion": "2", - "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", - "owner": "account users", - "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" - }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/information_schema/quickstart_table", - "name": "quickstart_table", - "qualifiedName": "main.information_schema.quickstart_table", - "created": { - "time": 1666185698688, - "actor": "urn:li:corpuser:abc@acryl.io" - }, - "lastModified": { - "time": 1666186049633, - "actor": "urn:li:corpuser:abc@acryl.io" - }, - "tags": [] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "status", "aspect": { "json": { - "typeNames": [ - "Table" - ] + "removed": false } }, "systemMetadata": { @@ -798,12 +633,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "acryl_metastore.main.information_schema.quickstart_table", + "schemaName": "hive_metastore.bronze_kambi.view1", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -822,144 +657,409 @@ }, "fields": [ { - "fieldPath": "columnA", - "nullable": true, + "fieldPath": "betStatusId", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "int", + "nativeDataType": "bigint", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "columnB", - "nullable": true, + "fieldPath": "channelId", + "nullable": false, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "bigint", "recursive": false, "isPartOfKey": false - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)", - "changeType": "UPSERT", - "aspectName": "ownership", - "aspect": { - "json": { - "owners": [ + }, { - "owner": "urn:li:corpuser:account users", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.RecordType": {} + } + }, + "nativeDataType": 
"struct>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>\"}" + }, { - "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", - "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=long].combinationref", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" }, { - "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", - "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].currentodds", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" }, { - "id": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6", - "urn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "databricks", - "env": "PROD", - "metastore": "acryl metastore", - "catalog": "main", - "unity_schema": "quickstart_schema" - }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema", - "name": "quickstart_schema", - "description": "A new Unity Catalog schema called quickstart_schema" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:databricks" - } - }, - "systemMetadata": { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].eachway", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].livebetting", + "nullable": true, + "type": { + "type": { + 
"com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].odds", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].betoffertypeid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].criterionid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].criterionname", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].currentodds", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventgroupid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": 
"{\"native_data_type\": \"array>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=long].id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=string].name", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventname", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventstartdate", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=boolean].live", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].odds", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=long].outcomeids", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": { + "nestedType": [ + "long" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].outcomelabel", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", 
\"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].sportid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].status", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].voidreason", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].payout", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].rewardextrapayout", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].stake", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + } + ] + } + }, + "systemMetadata": { "lastObserved": 1638860400000, "runId": "unity-catalog-test", "lastRunId": "no-run-id-provided" @@ -967,7 +1067,23 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", + "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:databricks" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -985,14 +1101,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", + "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { "json": { "owners": [ { - "owner": "urn:li:corpuser:account users", + "owner": "urn:li:corpuser:abc@acryl.io", "type": "DATAOWNER" } ], @@ -1009,13 +1125,13 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" + "container": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e" } }, "systemMetadata": { @@ -1026,37 +1142,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", - "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" - }, - { - "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", - "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:481380c5a355638fc626eca8380cdda9" + "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" } }, "systemMetadata": { @@ -1067,37 +1158,34 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { "json": { "customProperties": { - "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896", + "storage_location": "dbfs:/user/hive/warehouse/bronze_kambi.db/bet", "data_source_format": "DELTA", - "generation": "2", - "table_type": "MANAGED", - "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", - "delta.lastCommitTimestamp": "1666185711000", - "delta.lastUpdateVersion": "1", - "delta.minReaderVersion": "1", - "delta.minWriterVersion": "2", - "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", - "owner": "account users", - "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" - }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema/quickstart_table", - "name": "quickstart_table", - "qualifiedName": "main.quickstart_schema.quickstart_table", + "table_type": "HIVE_MANAGED_TABLE", + "Catalog": "hive_metastore", + "Database": "bronze_kambi", + "Table": "bet", + "Last Access": "UNKNOWN", + "Created By": "Spark 3.2.1", + "Owner": "root", + "Is_managed_location": "true", + "Table Properties": "[delta.autoOptimize.autoCompact=true,delta.autoOptimize.optimizeWrite=true,delta.minReaderVersion=1,delta.minWriterVersion=2]", + "table_id": "hive_metastore.bronze_kambi.bet", + "created_at": "2022-06-22 05:14:56" + }, + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi/bet", + "name": "bet", + "qualifiedName": "hive_metastore.bronze_kambi.bet", "created": { - "time": 1666185698688, - "actor": "urn:li:corpuser:abc@acryl.io" + "time": 1655874896000 }, "lastModified": { - "time": 
1666186049633, - "actor": "urn:li:corpuser:abc@acryl.io" + "time": 1655874896000 }, "tags": [] } @@ -1110,7 +1198,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1127,53 +1215,20 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb", "changeType": "UPSERT", - "aspectName": "schemaMetadata", + "aspectName": "browsePathsV2", "aspect": { "json": { - "schemaName": "acryl_metastore.main.quickstart_schema.quickstart_table", - "platform": "urn:li:dataPlatform:databricks", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ + "path": [ { - "fieldPath": "columnA", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false + "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", + "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "fieldPath": "columnB", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false + "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" } ] } @@ -1186,32 +1241,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", - "changeType": "UPSERT", - "aspectName": "ownership", - "aspect": { - "json": { - "owners": [ - { - "owner": "urn:li:corpuser:account users", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1222,12 +1252,12 @@ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", - "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" + "id": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202", + "urn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202" }, { - "id": "urn:li:container:481380c5a355638fc626eca8380cdda9", - "urn": "urn:li:container:481380c5a355638fc626eca8380cdda9" + "id": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e", + "urn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e" } ] } @@ -1239,272 +1269,429 @@ } }, { - "entityType": "container", - "entityUrn": 
"urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)", "changeType": "UPSERT", - "aspectName": "containerProperties", + "aspectName": "schemaMetadata", "aspect": { "json": { - "customProperties": { - "platform": "databricks", - "env": "PROD", - "metastore": "acryl metastore", - "catalog": "quickstart_catalog" + "schemaName": "hive_metastore.bronze_kambi.bet", + "platform": "urn:li:dataPlatform:databricks", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog", - "name": "quickstart_catalog", - "description": "" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:databricks" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Catalog" - ] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "changeType": "UPSERT", - "aspectName": "ownership", - "aspect": { - "json": { - "owners": [ - { - "owner": "urn:li:corpuser:account users", - "type": "DATAOWNER" - } - ], "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", - "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { 
-        "platform": "databricks",
-        "env": "PROD",
-        "metastore": "acryl metastore",
-        "catalog": "quickstart_catalog",
-        "unity_schema": "default"
        },
-      "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default",
-      "name": "default",
-      "description": "Default schema (auto-created)"
-    }
-  },
-  "systemMetadata": {
-    "lastObserved": 1638860400000,
-    "runId": "unity-catalog-test",
-    "lastRunId": "no-run-id-provided"
-  }
-},
-{
-  "entityType": "container",
-  "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
-  "changeType": "UPSERT",
-  "aspectName": "status",
-  "aspect": {
-    "json": {
-      "removed": false
-    }
-  },
-  "systemMetadata": {
-    "lastObserved": 1638860400000,
-    "runId": "unity-catalog-test",
-    "lastRunId": "no-run-id-provided"
-  }
-},
-{
-  "entityType": "container",
-  "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
-  "changeType": "UPSERT",
-  "aspectName": "dataPlatformInstance",
-  "aspect": {
-    "json": {
-      "platform": "urn:li:dataPlatform:databricks"
-    }
-  },
-  "systemMetadata": {
-    "lastObserved": 1638860400000,
-    "runId": "unity-catalog-test",
-    "lastRunId": "no-run-id-provided"
-  }
-},
-{
-  "entityType": "container",
-  "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
-  "changeType": "UPSERT",
-  "aspectName": "subTypes",
-  "aspect": {
-    "json": {
-      "typeNames": [
-        "Schema"
-      ]
-    }
-  },
-  "systemMetadata": {
-    "lastObserved": 1638860400000,
-    "runId": "unity-catalog-test",
-    "lastRunId": "no-run-id-provided"
-  }
-},
-{
-  "entityType": "container",
-  "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
-  "changeType": "UPSERT",
-  "aspectName": "ownership",
-  "aspect": {
-    "json": {
-      "owners": [
+      "hash": "",
+      "platformSchema": {
+        "com.linkedin.schema.MySqlDDL": {
+          "tableSchema": ""
+        }
+      },
+      "fields": [
+        {
+          "fieldPath": "betStatusId",
+          "nullable": false,
+          "type": {
+            "type": {
+              "com.linkedin.schema.NumberType": {}
+            }
+          },
+          "nativeDataType": "bigint",
+          "recursive": false,
+          "isPartOfKey": false
+        },
+        {
+          "fieldPath": "channelId",
+          "nullable": false,
+          "type": {
+            "type": {
+              "com.linkedin.schema.NumberType": {}
+            }
+          },
+          "nativeDataType": "bigint",
+          "recursive": false,
+          "isPartOfKey": false
+        },
+        {
+          "fieldPath": "[version=2.0].[type=struct].[type=struct].combination",
+          "nullable": false,
+          "type": {
+            "type": {
+              "com.linkedin.schema.RecordType": {}
+            }
+          },
+          "nativeDataType": "struct<combinationref:bigint,currentodds:double,eachway:boolean,livebetting:boolean,odds:double,outcomes:array<struct<betoffertypeid:bigint,criterionid:bigint,criterionname:string,currentodds:double,eventgroupid:bigint,eventgrouppath:array<struct<id:bigint,name:string>>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array<bigint>,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>",
+          "recursive": false,
+          "isPartOfKey": false,
+          "jsonProps": "{\"native_data_type\": \"struct<combinationref:bigint,currentodds:double,eachway:boolean,livebetting:boolean,odds:double,outcomes:array<struct<betoffertypeid:bigint,criterionid:bigint,criterionname:string,currentodds:double,eventgroupid:bigint,eventgrouppath:array<struct<id:bigint,name:string>>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array<bigint>,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>\"}"
+        },
+        {
+          "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=long].combinationref",
+          "nullable": true,
+          "type": {
+            "type": {
+              "com.linkedin.schema.NumberType": {}
+            }
+          },
+          "nativeDataType": "bigint",
+          "recursive": false,
+          "isPartOfKey": false,
+          "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+        },
+        {
+          "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].currentodds",
+          "nullable": true,
+          "type": {
+            "type": {
+              "com.linkedin.schema.NumberType": {}
+            }
+          },
+          "nativeDataType": 
"double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].eachway", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].livebetting", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].odds", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].betoffertypeid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].criterionid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].criterionname", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].currentodds", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventgroupid", + 
"nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=long].id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=string].name", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventname", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventstartdate", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=boolean].live", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].odds", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=long].outcomeids", + 
"nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": { + "nestedType": [ + "long" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + }, { - "owner": "urn:li:corpuser:abc@acryl.io", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].outcomelabel", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, { - "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", - "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].sportid", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { - "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].status", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].voidreason", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].payout", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].rewardextrapayout", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": 
\"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].stake", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" } ] } @@ -1517,12 +1704,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90" + "container": "urn:li:container:5ada0a9773235325e506410c512feabb" } }, "systemMetadata": { @@ -1533,7 +1720,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1554,9 +1741,9 @@ "updated_by": "abc@acryl.io", "updated_at": "2022-10-19 13:27:29.633000+00:00" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default/quickstart_table", + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default/quickstart_table", "name": "quickstart_table", - "qualifiedName": "quickstart_catalog.default.quickstart_table", + "qualifiedName": "main.default.quickstart_table", "created": { "time": 1666185698688, "actor": "urn:li:corpuser:abc@acryl.io" @@ -1576,7 +1763,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1594,12 +1781,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "acryl_metastore.quickstart_catalog.default.quickstart_table", + "schemaName": "acryl_metastore.main.default.quickstart_table", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -1652,7 +1839,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -1677,7 +1864,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": 
"browsePathsV2", "aspect": { @@ -1688,12 +1875,12 @@ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" + "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" }, { - "id": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", - "urn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90" + "id": "urn:li:container:5ada0a9773235325e506410c512feabb", + "urn": "urn:li:container:5ada0a9773235325e506410c512feabb" } ] } @@ -1706,7 +1893,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc", + "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -1715,12 +1902,12 @@ "platform": "databricks", "env": "PROD", "metastore": "acryl metastore", - "catalog": "quickstart_catalog", - "unity_schema": "information_schema" + "catalog": "main", + "unity_schema": "quickstart_schema" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/information_schema", - "name": "information_schema", - "description": "Information schema (auto-created)" + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema", + "name": "quickstart_schema", + "description": "A new Unity Catalog schema called quickstart_schema" } }, "systemMetadata": { @@ -1731,7 +1918,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc", + "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1747,7 +1934,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc", + "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -1763,7 +1950,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc", + "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1781,14 +1968,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc", + "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { "json": { "owners": [ { - "owner": "urn:li:corpuser:Service Principal 1", + "owner": "urn:li:corpuser:account users", "type": "DATAOWNER" } ], @@ -1806,12 +1993,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc", + "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" + "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" } }, "systemMetadata": { @@ -1822,7 +2009,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc", + "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1833,8 +2020,8 @@ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "urn": 
"urn:li:container:079ede9d4f0640985a8ccf8eb180e965" + "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" } ] } @@ -1847,12 +2034,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:29f99476d533719be0cebc374d5265dc" + "container": "urn:li:container:481380c5a355638fc626eca8380cdda9" } }, "systemMetadata": { @@ -1863,7 +2050,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1884,9 +2071,9 @@ "updated_by": "abc@acryl.io", "updated_at": "2022-10-19 13:27:29.633000+00:00" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/information_schema/quickstart_table", + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema/quickstart_table", "name": "quickstart_table", - "qualifiedName": "quickstart_catalog.information_schema.quickstart_table", + "qualifiedName": "main.quickstart_schema.quickstart_table", "created": { "time": 1666185698688, "actor": "urn:li:corpuser:abc@acryl.io" @@ -1906,7 +2093,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1924,12 +2111,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "acryl_metastore.quickstart_catalog.information_schema.quickstart_table", + "schemaName": "acryl_metastore.main.quickstart_schema.quickstart_table", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -1981,8 +2168,136 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:account users", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": 
"dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", + "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + }, + { + "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596", + "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596" + }, + { + "id": "urn:li:container:481380c5a355638fc626eca8380cdda9", + "urn": "urn:li:container:481380c5a355638fc626eca8380cdda9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "databricks", + "env": "PROD", + "metastore": "acryl metastore", + "catalog": "quickstart_catalog" + }, + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog", + "name": "quickstart_catalog", + "description": "" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:databricks" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Catalog" + ] + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -2006,8 +2321,24 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -2016,14 +2347,6 @@ { "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" - }, - { - "id": 
"urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" - }, - { - "id": "urn:li:container:29f99476d533719be0cebc374d5265dc", - "urn": "urn:li:container:29f99476d533719be0cebc374d5265dc" } ] } @@ -2036,7 +2359,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -2046,11 +2369,11 @@ "env": "PROD", "metastore": "acryl metastore", "catalog": "quickstart_catalog", - "unity_schema": "quickstart_schema" + "unity_schema": "default" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema", - "name": "quickstart_schema", - "description": "A new Unity Catalog schema called quickstart_schema" + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default", + "name": "default", + "description": "Default schema (auto-created)" } }, "systemMetadata": { @@ -2061,7 +2384,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2077,7 +2400,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -2093,7 +2416,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2111,14 +2434,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { "json": { "owners": [ { - "owner": "urn:li:corpuser:account users", + "owner": "urn:li:corpuser:abc@acryl.io", "type": "DATAOWNER" } ], @@ -2136,7 +2459,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -2152,7 +2475,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -2177,12 +2500,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:47a033e31b92a120f08f297c05d286f1" + "container": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90" } }, "systemMetadata": { @@ -2193,7 +2516,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -2214,9 +2537,9 @@ "updated_by": "abc@acryl.io", "updated_at": "2022-10-19 13:27:29.633000+00:00" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema/quickstart_table", + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default/quickstart_table", "name": "quickstart_table", - "qualifiedName": "quickstart_catalog.quickstart_schema.quickstart_table", + "qualifiedName": "quickstart_catalog.default.quickstart_table", "created": { "time": 1666185698688, "actor": "urn:li:corpuser:abc@acryl.io" @@ -2236,7 +2559,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2254,12 +2577,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table", + "schemaName": "acryl_metastore.quickstart_catalog.default.quickstart_table", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -2312,7 +2635,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -2324,153 +2647,9 @@ } ], "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", - "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" - }, - { - "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", - "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" - }, - { - "id": "urn:li:container:47a033e31b92a120f08f297c05d286f1", - "urn": "urn:li:container:47a033e31b92a120f08f297c05d286f1" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "databricks", - "env": "PROD", - 
"metastore": "acryl metastore", - "catalog": "system" - }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system", - "name": "system", - "description": "System catalog (auto-created)" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:databricks" - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Catalog" - ] - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "changeType": "UPSERT", - "aspectName": "ownership", - "aspect": { - "json": { - "owners": [ - { - "owner": "urn:li:corpuser:Service Principal 2", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + "time": 0, + "actor": "urn:li:corpuser:unknown" + } } }, "systemMetadata": { @@ -2480,8 +2659,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -2490,6 +2669,14 @@ { "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + }, + { + "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" + }, + { + "id": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90", + "urn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90" } ] } @@ -2502,7 +2689,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", + "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -2511,12 +2698,12 @@ "platform": "databricks", "env": "PROD", "metastore": "acryl metastore", - "catalog": "system", - "unity_schema": "default" + "catalog": "quickstart_catalog", + "unity_schema": "quickstart_schema" }, - "externalUrl": 
"https://dummy.cloud.databricks.com/explore/data/system/default", - "name": "default", - "description": "Default schema (auto-created)" + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema", + "name": "quickstart_schema", + "description": "A new Unity Catalog schema called quickstart_schema" } }, "systemMetadata": { @@ -2527,7 +2714,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", + "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2543,7 +2730,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", + "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -2559,7 +2746,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", + "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2577,14 +2764,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", + "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { "json": { "owners": [ { - "owner": "urn:li:corpuser:abc@acryl.io", + "owner": "urn:li:corpuser:account users", "type": "DATAOWNER" } ], @@ -2602,12 +2789,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", + "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9" + "container": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" } }, "systemMetadata": { @@ -2618,7 +2805,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", + "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -2629,8 +2816,8 @@ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "id": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "urn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9" + "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" } ] } @@ -2643,12 +2830,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b330768923270ff5450695bee1c94247" + "container": "urn:li:container:47a033e31b92a120f08f297c05d286f1" } }, "systemMetadata": { @@ -2659,7 +2846,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -2680,9 +2867,9 @@ "updated_by": "abc@acryl.io", 
"updated_at": "2022-10-19 13:27:29.633000+00:00" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default/quickstart_table", + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema/quickstart_table", "name": "quickstart_table", - "qualifiedName": "system.default.quickstart_table", + "qualifiedName": "quickstart_catalog.quickstart_schema.quickstart_table", "created": { "time": 1666185698688, "actor": "urn:li:corpuser:abc@acryl.io" @@ -2702,7 +2889,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2720,12 +2907,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "acryl_metastore.system.default.quickstart_table", + "schemaName": "acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -2778,7 +2965,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -2803,7 +2990,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -2814,12 +3001,148 @@ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" }, { - "id": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", - "urn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9" + "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965", + "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965" }, { - "id": "urn:li:container:b330768923270ff5450695bee1c94247", - "urn": "urn:li:container:b330768923270ff5450695bee1c94247" + "id": "urn:li:container:47a033e31b92a120f08f297c05d286f1", + "urn": "urn:li:container:47a033e31b92a120f08f297c05d286f1" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "databricks", + "env": "PROD", + "metastore": "acryl metastore", + "catalog": "system" + }, + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system", + "name": "system", + "description": "System catalog (auto-created)" + } + }, + "systemMetadata": { + "lastObserved": 
1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:databricks" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Catalog" + ] + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:Service Principal 2", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb", + "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb" } ] } @@ -2832,7 +3155,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", + "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -2842,11 +3165,11 @@ "env": "PROD", "metastore": "acryl metastore", "catalog": "system", - "unity_schema": "information_schema" + "unity_schema": "default" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/information_schema", - "name": "information_schema", - "description": "Information schema (auto-created)" + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default", + "name": "default", + "description": "Default schema (auto-created)" } }, "systemMetadata": { @@ -2857,7 +3180,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", + "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2873,7 +3196,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", + "entityUrn": 
"urn:li:container:b330768923270ff5450695bee1c94247", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -2889,7 +3212,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", + "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2907,14 +3230,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", + "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { "json": { "owners": [ { - "owner": "urn:li:corpuser:Service Principal 1", + "owner": "urn:li:corpuser:abc@acryl.io", "type": "DATAOWNER" } ], @@ -2932,7 +3255,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", + "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -2948,7 +3271,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", + "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -2973,12 +3296,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59" + "container": "urn:li:container:b330768923270ff5450695bee1c94247" } }, "systemMetadata": { @@ -2989,7 +3312,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -3010,9 +3333,9 @@ "updated_by": "abc@acryl.io", "updated_at": "2022-10-19 13:27:29.633000+00:00" }, - "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/information_schema/quickstart_table", + "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default/quickstart_table", "name": "quickstart_table", - "qualifiedName": "system.information_schema.quickstart_table", + "qualifiedName": "system.default.quickstart_table", "created": { "time": 1666185698688, "actor": "urn:li:corpuser:abc@acryl.io" @@ -3032,7 +3355,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -3050,12 +3373,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": 
"acryl_metastore.system.information_schema.quickstart_table", + "schemaName": "acryl_metastore.system.default.quickstart_table", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -3108,7 +3431,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -3133,7 +3456,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -3148,8 +3471,8 @@ "urn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9" }, { - "id": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59", - "urn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59" + "id": "urn:li:container:b330768923270ff5450695bee1c94247", + "urn": "urn:li:container:b330768923270ff5450695bee1c94247" } ] } @@ -3506,22 +3829,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1638860400000, - "runId": "unity-catalog-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", @@ -3556,7 +3863,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -3572,7 +3879,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -3588,7 +3895,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.quickstart_schema.quickstart_table,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -3604,7 +3911,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -3620,7 +3927,7 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.quickstart_schema.quickstart_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/unit/test_unity_catalog_config.py b/metadata-ingestion/tests/unit/test_unity_catalog_config.py index 4098ed4074de2f..3c0994cde7889f 100644 --- a/metadata-ingestion/tests/unit/test_unity_catalog_config.py +++ b/metadata-ingestion/tests/unit/test_unity_catalog_config.py @@ -67,7 +67,6 @@ def test_profiling_requires_warehouses_id(): @freeze_time(FROZEN_TIME) def test_workspace_url_should_start_with_https(): - with pytest.raises(ValueError, match="Workspace URL must start with http scheme"): UnityCatalogSourceConfig.parse_obj( { @@ -76,3 +75,67 @@ def test_workspace_url_should_start_with_https(): "profiling": {"enabled": True}, } ) + + +def test_global_warehouse_id_is_set_from_profiling(): + config = UnityCatalogSourceConfig.parse_obj( + { + "token": "token", + "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX", + "profiling": { + "method": "ge", + "enabled": True, + "warehouse_id": "my_warehouse_id", + }, + } + ) + assert config.profiling.warehouse_id == "my_warehouse_id" + assert config.warehouse_id == "my_warehouse_id" + + +def test_set_different_warehouse_id_from_profiling(): + with pytest.raises( + ValueError, + match="When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`.", + ): + UnityCatalogSourceConfig.parse_obj( + { + "token": "token", + "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX", + "warehouse_id": "my_global_warehouse_id", + "profiling": { + "method": "ge", + "enabled": True, + "warehouse_id": "my_warehouse_id", + }, + } + ) + + +def test_warehouse_id_must_be_set_if_include_hive_metastore_is_true(): + with pytest.raises( + ValueError, + match="When `include_hive_metastore` is set, `warehouse_id` must be set.", + ): + UnityCatalogSourceConfig.parse_obj( + { + "token": "token", + "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX", + "include_hive_metastore": True, + } + ) + + +def test_set_profiling_warehouse_id_from_global(): + config = UnityCatalogSourceConfig.parse_obj( + { + "token": "token", + "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX", + "warehouse_id": "my_global_warehouse_id", + "profiling": { + "method": "ge", + "enabled": True, + }, + } + ) + assert config.profiling.warehouse_id == "my_global_warehouse_id" From 0d6a5e5df25b58af0a434d5d2f83f6ef463ba99b Mon Sep 17 00:00:00 2001 From: siddiquebagwan-gslab Date: Thu, 14 Dec 2023 21:06:28 +0530 Subject: [PATCH 12/17] feat(ingestion/transformer): create tag if not exist (#9076) --- .../src/datahub/ingestion/graph/client.py | 24 ++++++ .../ingestion/transformer/add_dataset_tags.py | 42 ++++++++++- .../ingestion/transformer/base_transformer.py | 75 +++++++++++++++---- .../tests/unit/test_transform_dataset.py | 32 ++++++-- 4 files changed, 154 insertions(+), 19 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index d91165ac9777ca..5c24b06dde9998 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -787,9 +787,11 @@ def get_aspect_counts(self, aspect: str, urn_like: Optional[str] = None) -> int: def execute_graphql(self, query: str, variables: Optional[Dict] = None) -> Dict: url = 
f"{self.config.server}/api/graphql" + body: Dict = { "query": query, } + if variables: body["variables"] = variables @@ -1065,6 +1067,28 @@ def parse_sql_lineage( default_schema=default_schema, ) + def create_tag(self, tag_name: str) -> str: + graph_query: str = """ + mutation($tag_detail: CreateTagInput!) { + createTag(input: $tag_detail) + } + """ + + variables = { + "tag_detail": { + "name": tag_name, + "id": tag_name, + }, + } + + res = self.execute_graphql( + query=graph_query, + variables=variables, + ) + + # return urn + return res["createTag"] + def close(self) -> None: self._make_schema_resolver.cache_clear() super().close() diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py index 5a276ad899c482..72a8c226e491ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py @@ -1,14 +1,24 @@ +import logging from typing import Callable, List, Optional, cast +import datahub.emitter.mce_builder as builder from datahub.configuration.common import ( KeyValuePattern, TransformerSemanticsConfigModel, ) from datahub.configuration.import_resolver import pydantic_resolve_key from datahub.emitter.mce_builder import Aspect +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.transformer.dataset_transformer import DatasetTagsTransformer -from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass +from datahub.metadata.schema_classes import ( + GlobalTagsClass, + TagAssociationClass, + TagKeyClass, +) +from datahub.utilities.urns.tag_urn import TagUrn + +logger = logging.getLogger(__name__) class AddDatasetTagsConfig(TransformerSemanticsConfigModel): @@ -22,11 +32,13 @@ class AddDatasetTags(DatasetTagsTransformer): ctx: PipelineContext config: AddDatasetTagsConfig + processed_tags: List[TagAssociationClass] def __init__(self, config: AddDatasetTagsConfig, ctx: PipelineContext): super().__init__() self.ctx = ctx self.config = config + self.processed_tags = [] @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetTags": @@ -45,11 +57,38 @@ def transform_aspect( tags_to_add = self.config.get_tags_to_add(entity_urn) if tags_to_add is not None: out_global_tags_aspect.tags.extend(tags_to_add) + self.processed_tags.extend( + tags_to_add + ) # Keep track of tags added so that we can create them in handle_end_of_stream return self.get_result_semantics( self.config, self.ctx.graph, entity_urn, out_global_tags_aspect ) + def handle_end_of_stream(self) -> List[MetadataChangeProposalWrapper]: + + mcps: List[MetadataChangeProposalWrapper] = [] + + logger.debug("Generating tags") + + for tag_association in self.processed_tags: + ids: List[str] = TagUrn.create_from_string( + tag_association.tag + ).get_entity_id() + + assert len(ids) == 1, "Invalid Tag Urn" + + tag_name: str = ids[0] + + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=builder.make_tag_urn(tag=tag_name), + aspect=TagKeyClass(name=tag_name), + ) + ) + + return mcps + class SimpleDatasetTagConfig(TransformerSemanticsConfigModel): tag_urns: List[str] @@ -82,6 +121,7 @@ class PatternAddDatasetTags(AddDatasetTags): """Transformer that adds a specified set of tags to each dataset.""" def __init__(self, config: PatternDatasetTagsConfig, ctx: PipelineContext): + config.tag_pattern.all tag_pattern 
= config.tag_pattern generic_config = AddDatasetTagsConfig( get_tags_to_add=lambda _: [ diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py index e0d6ae720c9a18..8b6f42dcfba4b8 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py @@ -17,13 +17,30 @@ log = logging.getLogger(__name__) -class LegacyMCETransformer(Transformer, metaclass=ABCMeta): +def _update_work_unit_id( + envelope: RecordEnvelope, urn: str, aspect_name: str +) -> Dict[Any, Any]: + structured_urn = Urn.create_from_string(urn) + simple_name = "-".join(structured_urn.get_entity_id()) + record_metadata = envelope.metadata.copy() + record_metadata.update({"workunit_id": f"txform-{simple_name}-{aspect_name}"}) + return record_metadata + + +class HandleEndOfStreamTransformer: + def handle_end_of_stream(self) -> List[MetadataChangeProposalWrapper]: + return [] + + +class LegacyMCETransformer( + Transformer, HandleEndOfStreamTransformer, metaclass=ABCMeta +): @abstractmethod def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: pass -class SingleAspectTransformer(metaclass=ABCMeta): +class SingleAspectTransformer(HandleEndOfStreamTransformer, metaclass=ABCMeta): @abstractmethod def aspect_name(self) -> str: """Implement this method to specify a single aspect that the transformer is interested in subscribing to. No default provided.""" @@ -180,6 +197,32 @@ def _transform_or_record_mcpw( self._record_mcp(envelope.record) return envelope if envelope.record.aspect is not None else None + def _handle_end_of_stream( + self, envelope: RecordEnvelope + ) -> Iterable[RecordEnvelope]: + + if not isinstance(self, SingleAspectTransformer) and not isinstance( + self, LegacyMCETransformer + ): + return + + mcps: List[MetadataChangeProposalWrapper] = self.handle_end_of_stream() + + for mcp in mcps: + if mcp.aspect is None or mcp.entityUrn is None: # to silent the lint error + continue + + record_metadata = _update_work_unit_id( + envelope=envelope, + aspect_name=mcp.aspect.get_aspect_name(), # type: ignore + urn=mcp.entityUrn, + ) + + yield RecordEnvelope( + record=mcp, + metadata=record_metadata, + ) + def transform( self, record_envelopes: Iterable[RecordEnvelope] ) -> Iterable[RecordEnvelope]: @@ -216,17 +259,10 @@ def transform( else None, ) if transformed_aspect: - # for end of stream records, we modify the workunit-id structured_urn = Urn.create_from_string(urn) - simple_name = "-".join(structured_urn.get_entity_id()) - record_metadata = envelope.metadata.copy() - record_metadata.update( - { - "workunit_id": f"txform-{simple_name}-{self.aspect_name()}" - } - ) - yield RecordEnvelope( - record=MetadataChangeProposalWrapper( + + mcp: MetadataChangeProposalWrapper = ( + MetadataChangeProposalWrapper( entityUrn=urn, entityType=structured_urn.get_type(), systemMetadata=last_seen_mcp.systemMetadata @@ -234,8 +270,21 @@ def transform( else last_seen_mce_system_metadata, aspectName=self.aspect_name(), aspect=transformed_aspect, - ), + ) + ) + + record_metadata = _update_work_unit_id( + envelope=envelope, + aspect_name=mcp.aspect.get_aspect_name(), # type: ignore + urn=mcp.entityUrn, + ) + + yield RecordEnvelope( + record=mcp, metadata=record_metadata, ) + self._mark_processed(urn) + yield from self._handle_end_of_stream(envelope=envelope) + yield envelope diff --git 
a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 8014df2f5c519d..546549dcf37a4a 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -813,13 +813,25 @@ def test_simple_dataset_tags_transformation(mock_time): ] ) ) - assert len(outputs) == 3 + + assert len(outputs) == 5 # Check that tags were added. tags_aspect = outputs[1].record.aspect + assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation") assert tags_aspect assert len(tags_aspect.tags) == 2 - assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation") + + # Check new tag entity should be there + assert outputs[2].record.aspectName == "tagKey" + assert outputs[2].record.aspect.name == "NeedsDocumentation" + assert outputs[2].record.entityUrn == builder.make_tag_urn("NeedsDocumentation") + + assert outputs[3].record.aspectName == "tagKey" + assert outputs[3].record.aspect.name == "Legacy" + assert outputs[3].record.entityUrn == builder.make_tag_urn("Legacy") + + assert isinstance(outputs[4].record, EndOfStream) def dummy_tag_resolver_method(dataset_snapshot): @@ -853,7 +865,7 @@ def test_pattern_dataset_tags_transformation(mock_time): ) ) - assert len(outputs) == 3 + assert len(outputs) == 5 tags_aspect = outputs[1].record.aspect assert tags_aspect assert len(tags_aspect.tags) == 2 @@ -1363,7 +1375,7 @@ def test_mcp_add_tags_missing(mock_time): ] input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={})) outputs = list(transformer.transform(input_stream)) - assert len(outputs) == 3 + assert len(outputs) == 5 assert outputs[0].record == dataset_mcp # Check that tags were added, this will be the second result tags_aspect = outputs[1].record.aspect @@ -1395,13 +1407,23 @@ def test_mcp_add_tags_existing(mock_time): ] input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={})) outputs = list(transformer.transform(input_stream)) - assert len(outputs) == 2 + + assert len(outputs) == 4 + # Check that tags were added, this will be the second result tags_aspect = outputs[0].record.aspect assert tags_aspect assert len(tags_aspect.tags) == 3 assert tags_aspect.tags[0].tag == builder.make_tag_urn("Test") assert tags_aspect.tags[1].tag == builder.make_tag_urn("NeedsDocumentation") + assert tags_aspect.tags[2].tag == builder.make_tag_urn("Legacy") + + # Check tag entities got added + assert outputs[1].record.entityType == "tag" + assert outputs[1].record.entityUrn == builder.make_tag_urn("NeedsDocumentation") + assert outputs[2].record.entityType == "tag" + assert outputs[2].record.entityUrn == builder.make_tag_urn("Legacy") + assert isinstance(outputs[-1].record, EndOfStream) From ecef50f8fc75309562cf2729380ed18d5020ae8b Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Thu, 14 Dec 2023 08:03:36 -0800 Subject: [PATCH 13/17] =?UTF-8?q?fix(ingest):=20make=20user=5Furn=20and=20?= =?UTF-8?q?group=5Furn=20generation=20consider=20user=20and=E2=80=A6=20(#9?= =?UTF-8?q?026)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Aseem Bansal --- .../src/datahub/emitter/mce_builder.py | 8 +++---- .../tests/unit/test_mce_builder.py | 22 +++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 3b2c87ea25a314..9da1b0ab56f890 100644 --- 
a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -193,20 +193,20 @@ def assertion_urn_to_key(assertion_urn: str) -> Optional[AssertionKeyClass]: def make_user_urn(username: str) -> str: """ - Makes a user urn if the input is not a user urn already + Makes a user urn if the input is not a user or group urn already """ return ( f"urn:li:corpuser:{username}" - if not username.startswith("urn:li:corpuser:") + if not username.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")) else username ) def make_group_urn(groupname: str) -> str: """ - Makes a group urn if the input is not a group urn already + Makes a group urn if the input is not a user or group urn already """ - if groupname and groupname.startswith("urn:li:corpGroup:"): + if groupname and groupname.startswith(("urn:li:corpGroup:", "urn:li:corpuser:")): return groupname else: return f"urn:li:corpGroup:{groupname}" diff --git a/metadata-ingestion/tests/unit/test_mce_builder.py b/metadata-ingestion/tests/unit/test_mce_builder.py index b9025d76a3a1d1..d7c84f7863b407 100644 --- a/metadata-ingestion/tests/unit/test_mce_builder.py +++ b/metadata-ingestion/tests/unit/test_mce_builder.py @@ -33,3 +33,25 @@ def test_create_dataset_urn_with_reserved_chars() -> None: ) == "urn:li:dataset:(urn:li:dataPlatform:platform%29,platform%2Cinstance.table_%28name%29,PROD)" ) + + +def test_make_user_urn() -> None: + assert builder.make_user_urn("someUser") == "urn:li:corpuser:someUser" + assert ( + builder.make_user_urn("urn:li:corpuser:someUser") == "urn:li:corpuser:someUser" + ) + assert ( + builder.make_user_urn("urn:li:corpGroup:someGroup") + == "urn:li:corpGroup:someGroup" + ) + + +def test_make_group_urn() -> None: + assert builder.make_group_urn("someGroup") == "urn:li:corpGroup:someGroup" + assert ( + builder.make_group_urn("urn:li:corpGroup:someGroup") + == "urn:li:corpGroup:someGroup" + ) + assert ( + builder.make_group_urn("urn:li:corpuser:someUser") == "urn:li:corpuser:someUser" + ) From 1741c07d769f56a9cf066172725384b4e8780839 Mon Sep 17 00:00:00 2001 From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com> Date: Thu, 14 Dec 2023 23:01:51 +0530 Subject: [PATCH 14/17] feat(ingestion): Add test_connection methods for important sources (#9334) --- .../datahub/ingestion/source/dbt/dbt_cloud.py | 89 ++-- .../datahub/ingestion/source/dbt/dbt_core.py | 56 ++- .../src/datahub/ingestion/source/kafka.py | 74 ++- .../ingestion/source/powerbi/powerbi.py | 22 +- .../ingestion/source/sql/sql_common.py | 26 +- .../src/datahub/ingestion/source/tableau.py | 23 +- .../ingestion/source_config/sql/snowflake.py | 2 +- .../tests/integration/dbt/test_dbt.py | 69 ++- .../tests/integration/kafka/test_kafka.py | 85 +++- .../tests/integration/mysql/test_mysql.py | 38 +- .../tests/integration/powerbi/test_powerbi.py | 23 +- .../tableau/test_tableau_ingest.py | 21 +- .../test_helpers/test_connection_helpers.py | 47 ++ .../tests/unit/test_snowflake_source.py | 428 +++++++----------- .../tests/unit/test_sql_common.py | 62 ++- 15 files changed, 684 insertions(+), 381 deletions(-) create mode 100644 metadata-ingestion/tests/test_helpers/test_connection_helpers.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index a9685b2554553d..069c1f2781460a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py 
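[Editor's note: the dbt Cloud hunks that follow add a static test_connection entry point. As a rough sketch of the calling convention this patch series standardizes on — parse the config leniently, attempt one cheap round trip, fold any failure into the report — a caller might exercise it as below. The credential and id values are placeholders, and the field access assumes the TestConnectionReport/CapabilityReport classes used throughout these diffs.]

    from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudSource

    # Placeholder credentials/ids -- illustrative only, not a working account.
    config_dict = {
        "token": "<dbt-cloud-service-token>",
        "account_id": 123,
        "project_id": 456,
        "job_id": 789,
        "run_id": 1000,
        "target_platform": "snowflake",
    }

    report = DBTCloudSource.test_connection(config_dict)
    conn = report.basic_connectivity
    if conn is not None and conn.capable:
        print("dbt Cloud metadata API reachable")
    else:
        print(f"connection failed: {conn.failure_reason if conn else 'no report'}")

[A bad token or unreachable endpoint surfaces as capable=False with the upstream error message rather than as an exception, matching the try/except structure in the hunk below.]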
@@ -14,7 +14,12 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceCapability +from datahub.ingestion.api.source import ( + CapabilityReport, + SourceCapability, + TestableSource, + TestConnectionReport, +) from datahub.ingestion.source.dbt.dbt_common import ( DBTColumn, DBTCommonConfig, @@ -177,7 +182,7 @@ class DBTCloudConfig(DBTCommonConfig): @support_status(SupportStatus.INCUBATING) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -class DBTCloudSource(DBTSourceBase): +class DBTCloudSource(DBTSourceBase, TestableSource): """ This source pulls dbt metadata directly from the dbt Cloud APIs. @@ -199,6 +204,57 @@ def create(cls, config_dict, ctx): config = DBTCloudConfig.parse_obj(config_dict) return cls(config, ctx, "dbt") + @staticmethod + def test_connection(config_dict: dict) -> TestConnectionReport: + test_report = TestConnectionReport() + try: + source_config = DBTCloudConfig.parse_obj_allow_extras(config_dict) + DBTCloudSource._send_graphql_query( + metadata_endpoint=source_config.metadata_endpoint, + token=source_config.token, + query=_DBT_GRAPHQL_QUERY.format(type="tests", fields="jobId"), + variables={ + "jobId": source_config.job_id, + "runId": source_config.run_id, + }, + ) + test_report.basic_connectivity = CapabilityReport(capable=True) + except Exception as e: + test_report.basic_connectivity = CapabilityReport( + capable=False, failure_reason=str(e) + ) + return test_report + + @staticmethod + def _send_graphql_query( + metadata_endpoint: str, token: str, query: str, variables: Dict + ) -> Dict: + logger.debug(f"Sending GraphQL query to dbt Cloud: {query}") + response = requests.post( + metadata_endpoint, + json={ + "query": query, + "variables": variables, + }, + headers={ + "Authorization": f"Bearer {token}", + "X-dbt-partner-source": "acryldatahub", + }, + ) + + try: + res = response.json() + if "errors" in res: + raise ValueError( + f'Unable to fetch metadata from dbt Cloud: {res["errors"]}' + ) + data = res["data"] + except JSONDecodeError as e: + response.raise_for_status() + raise e + + return data + def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: # TODO: In dbt Cloud, commands are scheduled as part of jobs, where # each job can have multiple runs. 
We currently only fully support @@ -213,6 +269,8 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: for node_type, fields in _DBT_FIELDS_BY_TYPE.items(): logger.info(f"Fetching {node_type} from dbt Cloud") data = self._send_graphql_query( + metadata_endpoint=self.config.metadata_endpoint, + token=self.config.token, query=_DBT_GRAPHQL_QUERY.format(type=node_type, fields=fields), variables={ "jobId": self.config.job_id, @@ -232,33 +290,6 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: return nodes, additional_metadata - def _send_graphql_query(self, query: str, variables: Dict) -> Dict: - logger.debug(f"Sending GraphQL query to dbt Cloud: {query}") - response = requests.post( - self.config.metadata_endpoint, - json={ - "query": query, - "variables": variables, - }, - headers={ - "Authorization": f"Bearer {self.config.token}", - "X-dbt-partner-source": "acryldatahub", - }, - ) - - try: - res = response.json() - if "errors" in res: - raise ValueError( - f'Unable to fetch metadata from dbt Cloud: {res["errors"]}' - ) - data = res["data"] - except JSONDecodeError as e: - response.raise_for_status() - raise e - - return data - def _parse_into_dbt_node(self, node: Dict) -> DBTNode: key = node["uniqueId"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index ac2b2815f3caaa..563b005d7a88d2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -18,7 +18,12 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceCapability +from datahub.ingestion.api.source import ( + CapabilityReport, + SourceCapability, + TestableSource, + TestConnectionReport, +) from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig from datahub.ingestion.source.dbt.dbt_common import ( DBTColumn, @@ -60,11 +65,6 @@ class DBTCoreConfig(DBTCommonConfig): _github_info_deprecated = pydantic_renamed_field("github_info", "git_info") - @property - def s3_client(self): - assert self.aws_connection - return self.aws_connection.get_s3_client() - @validator("aws_connection") def aws_connection_needed_if_s3_uris_present( cls, aws_connection: Optional[AwsConnectionConfig], values: Dict, **kwargs: Any @@ -363,7 +363,7 @@ def load_test_results( @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -class DBTCoreSource(DBTSourceBase): +class DBTCoreSource(DBTSourceBase, TestableSource): """ The artifacts used by this source are: - [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json) @@ -387,12 +387,34 @@ def create(cls, config_dict, ctx): config = DBTCoreConfig.parse_obj(config_dict) return cls(config, ctx, "dbt") - def load_file_as_json(self, uri: str) -> Any: + @staticmethod + def test_connection(config_dict: dict) -> TestConnectionReport: + test_report = TestConnectionReport() + try: + source_config = DBTCoreConfig.parse_obj_allow_extras(config_dict) + DBTCoreSource.load_file_as_json( + source_config.manifest_path, source_config.aws_connection + ) + DBTCoreSource.load_file_as_json( + source_config.catalog_path, source_config.aws_connection + ) + test_report.basic_connectivity = CapabilityReport(capable=True) + except Exception as e: + test_report.basic_connectivity = CapabilityReport( + capable=False, 
failure_reason=str(e) + ) + return test_report + + @staticmethod + def load_file_as_json( + uri: str, aws_connection: Optional[AwsConnectionConfig] + ) -> Dict: if re.match("^https?://", uri): return json.loads(requests.get(uri).text) elif re.match("^s3://", uri): u = urlparse(uri) - response = self.config.s3_client.get_object( + assert aws_connection + response = aws_connection.get_s3_client().get_object( Bucket=u.netloc, Key=u.path.lstrip("/") ) return json.loads(response["Body"].read().decode("utf-8")) @@ -410,12 +432,18 @@ def loadManifestAndCatalog( Optional[str], Optional[str], ]: - dbt_manifest_json = self.load_file_as_json(self.config.manifest_path) + dbt_manifest_json = self.load_file_as_json( + self.config.manifest_path, self.config.aws_connection + ) - dbt_catalog_json = self.load_file_as_json(self.config.catalog_path) + dbt_catalog_json = self.load_file_as_json( + self.config.catalog_path, self.config.aws_connection + ) if self.config.sources_path is not None: - dbt_sources_json = self.load_file_as_json(self.config.sources_path) + dbt_sources_json = self.load_file_as_json( + self.config.sources_path, self.config.aws_connection + ) sources_results = dbt_sources_json["results"] else: sources_results = {} @@ -491,7 +519,9 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: # This will populate the test_results field on each test node. all_nodes = load_test_results( self.config, - self.load_file_as_json(self.config.test_results_path), + self.load_file_as_json( + self.config.test_results_path, self.config.aws_connection + ), all_nodes, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index 25520e7aa66fff..99ef737206ab0c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -15,6 +15,7 @@ ConfigResource, TopicMetadata, ) +from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient from datahub.configuration.common import AllowDenyPattern from datahub.configuration.kafka import KafkaConsumerConnectionConfig @@ -40,7 +41,13 @@ support_status, ) from datahub.ingestion.api.registry import import_path -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability +from datahub.ingestion.api.source import ( + CapabilityReport, + MetadataWorkUnitProcessor, + SourceCapability, + TestableSource, + TestConnectionReport, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase @@ -133,6 +140,18 @@ class KafkaSourceConfig( ) +def get_kafka_consumer( + connection: KafkaConsumerConnectionConfig, +) -> confluent_kafka.Consumer: + return confluent_kafka.Consumer( + { + "group.id": "test", + "bootstrap.servers": connection.bootstrap, + **connection.consumer_config, + } + ) + + @dataclass class KafkaSourceReport(StaleEntityRemovalSourceReport): topics_scanned: int = 0 @@ -145,6 +164,45 @@ def report_dropped(self, topic: str) -> None: self.filtered.append(topic) +class KafkaConnectionTest: + def __init__(self, config_dict: dict): + self.config = KafkaSourceConfig.parse_obj_allow_extras(config_dict) + self.report = KafkaSourceReport() + self.consumer: confluent_kafka.Consumer = get_kafka_consumer( + self.config.connection + ) + + def get_connection_test(self) -> TestConnectionReport: + capability_report = { + 
SourceCapability.SCHEMA_METADATA: self.schema_registry_connectivity(), + } + return TestConnectionReport( + basic_connectivity=self.basic_connectivity(), + capability_report={ + k: v for k, v in capability_report.items() if v is not None + }, + ) + + def basic_connectivity(self) -> CapabilityReport: + try: + self.consumer.list_topics(timeout=10) + return CapabilityReport(capable=True) + except Exception as e: + return CapabilityReport(capable=False, failure_reason=str(e)) + + def schema_registry_connectivity(self) -> CapabilityReport: + try: + SchemaRegistryClient( + { + "url": self.config.connection.schema_registry_url, + **self.config.connection.schema_registry_config, + } + ).get_subjects() + return CapabilityReport(capable=True) + except Exception as e: + return CapabilityReport(capable=False, failure_reason=str(e)) + + @platform_name("Kafka") @config_class(KafkaSourceConfig) @support_status(SupportStatus.CERTIFIED) @@ -160,7 +218,7 @@ def report_dropped(self, topic: str) -> None: SourceCapability.SCHEMA_METADATA, "Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.", ) -class KafkaSource(StatefulIngestionSourceBase): +class KafkaSource(StatefulIngestionSourceBase, TestableSource): """ This plugin extracts the following: - Topics from the Kafka broker @@ -183,12 +241,8 @@ def create_schema_registry( def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) self.source_config: KafkaSourceConfig = config - self.consumer: confluent_kafka.Consumer = confluent_kafka.Consumer( - { - "group.id": "test", - "bootstrap.servers": self.source_config.connection.bootstrap, - **self.source_config.connection.consumer_config, - } + self.consumer: confluent_kafka.Consumer = get_kafka_consumer( + self.source_config.connection ) self.init_kafka_admin_client() self.report: KafkaSourceReport = KafkaSourceReport() @@ -226,6 +280,10 @@ def init_kafka_admin_client(self) -> None: f"Failed to create Kafka Admin Client due to error {e}.", ) + @staticmethod + def test_connection(config_dict: dict) -> TestConnectionReport: + return KafkaConnectionTest(config_dict).get_connection_test() + @classmethod def create(cls, config_dict: Dict, ctx: PipelineContext) -> "KafkaSource": config: KafkaSourceConfig = KafkaSourceConfig.parse_obj(config_dict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 4b1d0403ac7760..cdf7c975c0614f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -19,7 +19,13 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport +from datahub.ingestion.api.source import ( + CapabilityReport, + MetadataWorkUnitProcessor, + SourceReport, + TestableSource, + TestConnectionReport, +) from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import ( @@ -1147,7 +1153,7 @@ def report_to_datahub_work_units( SourceCapability.LINEAGE_FINE, "Disabled by default, configured using `extract_column_level_lineage`. 
", ) -class PowerBiDashboardSource(StatefulIngestionSourceBase): +class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource): """ This plugin extracts the following: - Power BI dashboards, tiles and datasets @@ -1186,6 +1192,18 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): self, self.source_config, self.ctx ) + @staticmethod + def test_connection(config_dict: dict) -> TestConnectionReport: + test_report = TestConnectionReport() + try: + PowerBiAPI(PowerBiDashboardSourceConfig.parse_obj_allow_extras(config_dict)) + test_report.basic_connectivity = CapabilityReport(capable=True) + except Exception as e: + test_report.basic_connectivity = CapabilityReport( + capable=False, failure_reason=str(e) + ) + return test_report + @classmethod def create(cls, config_dict, ctx): config = PowerBiDashboardSourceConfig.parse_obj(config_dict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 590bc7f696784e..a831dfa50342d7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -15,6 +15,7 @@ Tuple, Type, Union, + cast, ) import sqlalchemy.dialects.postgresql.base @@ -35,7 +36,12 @@ from datahub.emitter.sql_parsing_builder import SqlParsingBuilder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage -from datahub.ingestion.api.source import MetadataWorkUnitProcessor +from datahub.ingestion.api.source import ( + CapabilityReport, + MetadataWorkUnitProcessor, + TestableSource, + TestConnectionReport, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, @@ -298,7 +304,7 @@ class ProfileMetadata: dataset_name_to_storage_bytes: Dict[str, int] = field(default_factory=dict) -class SQLAlchemySource(StatefulIngestionSourceBase): +class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource): """A Base class for all SQL Sources that use SQLAlchemy to extend""" def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str): @@ -348,6 +354,22 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) else: self._view_definition_cache = {} + @classmethod + def test_connection(cls, config_dict: dict) -> TestConnectionReport: + test_report = TestConnectionReport() + try: + source = cast( + SQLAlchemySource, + cls.create(config_dict, PipelineContext(run_id="test_connection")), + ) + list(source.get_inspectors()) + test_report.basic_connectivity = CapabilityReport(capable=True) + except Exception as e: + test_report.basic_connectivity = CapabilityReport( + capable=False, failure_reason=str(e) + ) + return test_report + def warn(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason[:100]) log.warning(f"{key} => {reason}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index f870e99df27c5f..ed5fe543310b8f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -58,7 +58,13 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source +from datahub.ingestion.api.source import ( + 
CapabilityReport, + MetadataWorkUnitProcessor, + Source, + TestableSource, + TestConnectionReport, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source import tableau_constant as c from datahub.ingestion.source.common.subtypes import ( @@ -469,7 +475,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport): SourceCapability.LINEAGE_FINE, "Enabled by default, configure using `extract_column_level_lineage`", ) -class TableauSource(StatefulIngestionSourceBase): +class TableauSource(StatefulIngestionSourceBase, TestableSource): platform = "tableau" def __hash__(self): @@ -509,6 +515,19 @@ def __init__( self._authenticate() + @staticmethod + def test_connection(config_dict: dict) -> TestConnectionReport: + test_report = TestConnectionReport() + try: + source_config = TableauConfig.parse_obj_allow_extras(config_dict) + source_config.make_tableau_client() + test_report.basic_connectivity = CapabilityReport(capable=True) + except Exception as e: + test_report.basic_connectivity = CapabilityReport( + capable=False, failure_reason=str(e) + ) + return test_report + def close(self) -> None: try: if self.server is not None: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index ccc4e115729a2c..46bd24c7e1f4c3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -143,7 +143,7 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: "'oauth_config' is none but should be set when using OAUTH_AUTHENTICATOR authentication" ) if oauth_config.use_certificate is True: - if oauth_config.provider == OAuthIdentityProvider.OKTA.value: + if oauth_config.provider == OAuthIdentityProvider.OKTA: raise ValueError( "Certificate authentication is not supported for Okta." 
) diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index 95b5374bbb41df..587831495c1ea7 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -10,20 +10,25 @@ from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig from datahub.ingestion.source.dbt.dbt_common import DBTEntitiesEnabled, EmitDirective -from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig +from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig, DBTCoreSource from datahub.ingestion.source.sql.sql_types import ( ATHENA_SQL_TYPES_MAP, TRINO_SQL_TYPES_MAP, resolve_athena_modified_type, resolve_trino_modified_type, ) -from tests.test_helpers import mce_helpers +from tests.test_helpers import mce_helpers, test_connection_helpers FROZEN_TIME = "2022-02-03 07:00:00" GMS_PORT = 8080 GMS_SERVER = f"http://localhost:{GMS_PORT}" +@pytest.fixture(scope="module") +def test_resources_dir(pytestconfig): + return pytestconfig.rootpath / "tests/integration/dbt" + + @dataclass class DbtTestConfig: run_id: str @@ -195,7 +200,14 @@ def set_paths( ) @pytest.mark.integration @freeze_time(FROZEN_TIME) -def test_dbt_ingest(dbt_test_config, pytestconfig, tmp_path, mock_time, requests_mock): +def test_dbt_ingest( + dbt_test_config, + test_resources_dir, + pytestconfig, + tmp_path, + mock_time, + requests_mock, +): config: DbtTestConfig = dbt_test_config test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt" @@ -233,11 +245,48 @@ def test_dbt_ingest(dbt_test_config, pytestconfig, tmp_path, mock_time, requests ) +@pytest.mark.parametrize( + "config_dict, is_success", + [ + ( + { + "manifest_path": "dbt_manifest.json", + "catalog_path": "dbt_catalog.json", + "target_platform": "postgres", + }, + True, + ), + ( + { + "manifest_path": "dbt_manifest.json", + "catalog_path": "dbt_catalog-this-file-does-not-exist.json", + "target_platform": "postgres", + }, + False, + ), + ], +) @pytest.mark.integration @freeze_time(FROZEN_TIME) -def test_dbt_tests(pytestconfig, tmp_path, mock_time, **kwargs): - test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt" +def test_dbt_test_connection(test_resources_dir, config_dict, is_success): + config_dict["manifest_path"] = str( + (test_resources_dir / config_dict["manifest_path"]).resolve() + ) + config_dict["catalog_path"] = str( + (test_resources_dir / config_dict["catalog_path"]).resolve() + ) + report = test_connection_helpers.run_test_connection(DBTCoreSource, config_dict) + if is_success: + test_connection_helpers.assert_basic_connectivity_success(report) + else: + test_connection_helpers.assert_basic_connectivity_failure( + report, "No such file or directory" + ) + +@pytest.mark.integration +@freeze_time(FROZEN_TIME) +def test_dbt_tests(test_resources_dir, pytestconfig, tmp_path, mock_time, **kwargs): # Run the metadata ingestion pipeline. 
output_file = tmp_path / "dbt_test_events.json" golden_path = test_resources_dir / "dbt_test_events_golden.json" @@ -340,9 +389,9 @@ def test_resolve_athena_modified_type(data_type, expected_data_type): @pytest.mark.integration @freeze_time(FROZEN_TIME) -def test_dbt_tests_only_assertions(pytestconfig, tmp_path, mock_time, **kwargs): - test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt" - +def test_dbt_tests_only_assertions( + test_resources_dir, pytestconfig, tmp_path, mock_time, **kwargs +): # Run the metadata ingestion pipeline. output_file = tmp_path / "test_only_assertions.json" @@ -418,10 +467,8 @@ def test_dbt_tests_only_assertions(pytestconfig, tmp_path, mock_time, **kwargs): @pytest.mark.integration @freeze_time(FROZEN_TIME) def test_dbt_only_test_definitions_and_results( - pytestconfig, tmp_path, mock_time, **kwargs + test_resources_dir, pytestconfig, tmp_path, mock_time, **kwargs ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt" - # Run the metadata ingestion pipeline. output_file = tmp_path / "test_only_definitions_and_assertions.json" diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py index 63d284801c94cd..dfdbea5de5cbfd 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py @@ -3,18 +3,22 @@ import pytest from freezegun import freeze_time -from tests.test_helpers import mce_helpers +from datahub.ingestion.api.source import SourceCapability +from datahub.ingestion.source.kafka import KafkaSource +from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import wait_for_port FROZEN_TIME = "2020-04-14 07:00:00" -@freeze_time(FROZEN_TIME) -@pytest.mark.integration -def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): - test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka" +@pytest.fixture(scope="module") +def test_resources_dir(pytestconfig): + return pytestconfig.rootpath / "tests/integration/kafka" + +@pytest.fixture(scope="module") +def mock_kafka_service(docker_compose_runner, test_resources_dir): with docker_compose_runner( test_resources_dir / "docker-compose.yml", "kafka", cleanup=False ) as docker_services: @@ -31,14 +35,67 @@ def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): command = f"{test_resources_dir}/send_records.sh {test_resources_dir}" subprocess.run(command, shell=True, check=True) - # Run the metadata ingestion pipeline. - config_file = (test_resources_dir / "kafka_to_file.yml").resolve() - run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path) + yield docker_compose_runner + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_kafka_ingest( + mock_kafka_service, test_resources_dir, pytestconfig, tmp_path, mock_time +): + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / "kafka_to_file.yml").resolve() + run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path) - # Verify the output. - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / "kafka_mces.json", - golden_path=test_resources_dir / "kafka_mces_golden.json", - ignore_paths=[], + # Verify the output. 
+ mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "kafka_mces.json", + golden_path=test_resources_dir / "kafka_mces_golden.json", + ignore_paths=[], + ) + + +@pytest.mark.parametrize( + "config_dict, is_success", + [ + ( + { + "connection": { + "bootstrap": "localhost:29092", + "schema_registry_url": "http://localhost:28081", + }, + }, + True, + ), + ( + { + "connection": { + "bootstrap": "localhost:2909", + "schema_registry_url": "http://localhost:2808", + }, + }, + False, + ), + ], +) +@pytest.mark.integration +@freeze_time(FROZEN_TIME) +def test_kafka_test_connection(mock_kafka_service, config_dict, is_success): + report = test_connection_helpers.run_test_connection(KafkaSource, config_dict) + if is_success: + test_connection_helpers.assert_basic_connectivity_success(report) + test_connection_helpers.assert_capability_report( + capability_report=report.capability_report, + success_capabilities=[SourceCapability.SCHEMA_METADATA], + ) + else: + test_connection_helpers.assert_basic_connectivity_failure( + report, "Failed to get metadata" + ) + test_connection_helpers.assert_capability_report( + capability_report=report.capability_report, + failure_capabilities={ + SourceCapability.SCHEMA_METADATA: "Failed to establish a new connection" + }, ) diff --git a/metadata-ingestion/tests/integration/mysql/test_mysql.py b/metadata-ingestion/tests/integration/mysql/test_mysql.py index 23fd97ff2671ed..c19198c7d2bbd0 100644 --- a/metadata-ingestion/tests/integration/mysql/test_mysql.py +++ b/metadata-ingestion/tests/integration/mysql/test_mysql.py @@ -3,7 +3,8 @@ import pytest from freezegun import freeze_time -from tests.test_helpers import mce_helpers +from datahub.ingestion.source.sql.mysql import MySQLSource +from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import wait_for_port @@ -75,3 +76,38 @@ def test_mysql_ingest_no_db( output_path=tmp_path / "mysql_mces.json", golden_path=test_resources_dir / golden_file, ) + + +@pytest.mark.parametrize( + "config_dict, is_success", + [ + ( + { + "host_port": "localhost:53307", + "database": "northwind", + "username": "root", + "password": "example", + }, + True, + ), + ( + { + "host_port": "localhost:5330", + "database": "wrong_db", + "username": "wrong_user", + "password": "wrong_pass", + }, + False, + ), + ], +) +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_mysql_test_connection(mysql_runner, config_dict, is_success): + report = test_connection_helpers.run_test_connection(MySQLSource, config_dict) + if is_success: + test_connection_helpers.assert_basic_connectivity_success(report) + else: + test_connection_helpers.assert_basic_connectivity_failure( + report, "Connection refused" + ) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index b2cbccf983eb0c..4e8469f919db9c 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -21,7 +21,7 @@ Report, Workspace, ) -from tests.test_helpers import mce_helpers +from tests.test_helpers import mce_helpers, test_connection_helpers pytestmark = pytest.mark.integration_batch_2 FROZEN_TIME = "2022-02-03 07:00:00" @@ -681,6 +681,27 @@ def test_powerbi_ingest( ) +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +@pytest.mark.integration 
+def test_powerbi_test_connection_success(mock_msal): + report = test_connection_helpers.run_test_connection( + PowerBiDashboardSource, default_source_config() + ) + test_connection_helpers.assert_basic_connectivity_success(report) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_powerbi_test_connection_failure(): + report = test_connection_helpers.run_test_connection( + PowerBiDashboardSource, default_source_config() + ) + test_connection_helpers.assert_basic_connectivity_failure( + report, "Unable to get authority configuration" + ) + + @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 0510f4a40f6597..90fa71013338da 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -28,7 +28,7 @@ ) from datahub.metadata.schema_classes import MetadataChangeProposalClass, UpstreamClass from datahub.utilities.sqlglot_lineage import SqlParsingResult -from tests.test_helpers import mce_helpers +from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, validate_all_providers_have_committed_successfully, @@ -290,6 +290,25 @@ def test_tableau_ingest(pytestconfig, tmp_path, mock_datahub_graph): ) +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_tableau_test_connection_success(): + with mock.patch("datahub.ingestion.source.tableau.Server"): + report = test_connection_helpers.run_test_connection( + TableauSource, config_source_default + ) + test_connection_helpers.assert_basic_connectivity_success(report) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_tableau_test_connection_failure(): + report = test_connection_helpers.run_test_connection( + TableauSource, config_source_default + ) + test_connection_helpers.assert_basic_connectivity_failure(report, "Unable to login") + + @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph): diff --git a/metadata-ingestion/tests/test_helpers/test_connection_helpers.py b/metadata-ingestion/tests/test_helpers/test_connection_helpers.py new file mode 100644 index 00000000000000..45543033ae010c --- /dev/null +++ b/metadata-ingestion/tests/test_helpers/test_connection_helpers.py @@ -0,0 +1,47 @@ +from typing import Dict, List, Optional, Type, Union + +from datahub.ingestion.api.source import ( + CapabilityReport, + SourceCapability, + TestableSource, + TestConnectionReport, +) + + +def run_test_connection( + source_cls: Type[TestableSource], config_dict: Dict +) -> TestConnectionReport: + return source_cls.test_connection(config_dict) + + +def assert_basic_connectivity_success(report: TestConnectionReport) -> None: + assert report is not None + assert report.basic_connectivity + assert report.basic_connectivity.capable + assert report.basic_connectivity.failure_reason is None + + +def assert_basic_connectivity_failure( + report: TestConnectionReport, expected_reason: str +) -> None: + assert report is not None + assert report.basic_connectivity + assert not report.basic_connectivity.capable + assert report.basic_connectivity.failure_reason + assert expected_reason in report.basic_connectivity.failure_reason + + +def 
assert_capability_report( + capability_report: Optional[Dict[Union[SourceCapability, str], CapabilityReport]], + success_capabilities: List[SourceCapability] = [], + failure_capabilities: Dict[SourceCapability, str] = {}, +) -> None: + assert capability_report + for capability in success_capabilities: + assert capability_report[capability] + assert capability_report[capability].failure_reason is None + for capability, expected_reason in failure_capabilities.items(): + assert not capability_report[capability].capable + failure_reason = capability_report[capability].failure_reason + assert failure_reason + assert expected_reason in failure_reason diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 343f4466fd6fdf..536c91ace4f5ed 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -1,3 +1,4 @@ +from typing import Any, Dict from unittest.mock import MagicMock, patch import pytest @@ -24,10 +25,20 @@ SnowflakeObjectAccessEntry, ) from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source +from tests.test_helpers import test_connection_helpers + +default_oauth_dict: Dict[str, Any] = { + "client_id": "client_id", + "client_secret": "secret", + "use_certificate": False, + "provider": "microsoft", + "scopes": ["datahub_role"], + "authority_url": "https://dev-abc.okta.com/oauth2/def/v1/token", +} def test_snowflake_source_throws_error_on_account_id_missing(): - with pytest.raises(ValidationError): + with pytest.raises(ValidationError, match="account_id\n field required"): SnowflakeV2Config.parse_obj( { "username": "user", @@ -37,27 +48,21 @@ def test_snowflake_source_throws_error_on_account_id_missing(): def test_no_client_id_invalid_oauth_config(): - oauth_dict = { - "provider": "microsoft", - "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], - "client_secret": "6Hb9apkbc6HD7", - "authority_url": "https://login.microsoftonline.com/yourorganisation.com", - } - with pytest.raises(ValueError): + oauth_dict = default_oauth_dict.copy() + del oauth_dict["client_id"] + with pytest.raises(ValueError, match="client_id\n field required"): OAuthConfiguration.parse_obj(oauth_dict) def test_snowflake_throws_error_on_client_secret_missing_if_use_certificate_is_false(): - oauth_dict = { - "client_id": "882e9831-7ea51cb2b954", - "provider": "microsoft", - "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], - "use_certificate": False, - "authority_url": "https://login.microsoftonline.com/yourorganisation.com", - } + oauth_dict = default_oauth_dict.copy() + del oauth_dict["client_secret"] OAuthConfiguration.parse_obj(oauth_dict) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="'oauth_config.client_secret' was none but should be set when using use_certificate false for oauth_config", + ): SnowflakeV2Config.parse_obj( { "account_id": "test", @@ -68,16 +73,13 @@ def test_snowflake_throws_error_on_client_secret_missing_if_use_certificate_is_f def test_snowflake_throws_error_on_encoded_oauth_private_key_missing_if_use_certificate_is_true(): - oauth_dict = { - "client_id": "882e9831-7ea51cb2b954", - "provider": "microsoft", - "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], - "use_certificate": True, - "authority_url": "https://login.microsoftonline.com/yourorganisation.com", - "encoded_oauth_public_key": "fkdsfhkshfkjsdfiuwrwfkjhsfskfhksjf==", - } + oauth_dict = default_oauth_dict.copy() + 
oauth_dict["use_certificate"] = True OAuthConfiguration.parse_obj(oauth_dict) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="'base64_encoded_oauth_private_key' was none but should be set when using certificate for oauth_config", + ): SnowflakeV2Config.parse_obj( { "account_id": "test", @@ -88,16 +90,13 @@ def test_snowflake_throws_error_on_encoded_oauth_private_key_missing_if_use_cert def test_snowflake_oauth_okta_does_not_support_certificate(): - oauth_dict = { - "client_id": "882e9831-7ea51cb2b954", - "provider": "okta", - "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], - "use_certificate": True, - "authority_url": "https://login.microsoftonline.com/yourorganisation.com", - "encoded_oauth_public_key": "fkdsfhkshfkjsdfiuwrwfkjhsfskfhksjf==", - } + oauth_dict = default_oauth_dict.copy() + oauth_dict["use_certificate"] = True + oauth_dict["provider"] = "okta" OAuthConfiguration.parse_obj(oauth_dict) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="Certificate authentication is not supported for Okta." + ): SnowflakeV2Config.parse_obj( { "account_id": "test", @@ -108,79 +107,52 @@ def test_snowflake_oauth_okta_does_not_support_certificate(): def test_snowflake_oauth_happy_paths(): - okta_dict = { - "client_id": "client_id", - "client_secret": "secret", - "provider": "okta", - "scopes": ["datahub_role"], - "authority_url": "https://dev-abc.okta.com/oauth2/def/v1/token", - } + oauth_dict = default_oauth_dict.copy() + oauth_dict["provider"] = "okta" assert SnowflakeV2Config.parse_obj( { "account_id": "test", "authentication_type": "OAUTH_AUTHENTICATOR", - "oauth_config": okta_dict, + "oauth_config": oauth_dict, } ) - - microsoft_dict = { - "client_id": "client_id", - "provider": "microsoft", - "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], - "use_certificate": True, - "authority_url": "https://login.microsoftonline.com/yourorganisation.com", - "encoded_oauth_public_key": "publickey", - "encoded_oauth_private_key": "privatekey", - } + oauth_dict["use_certificate"] = True + oauth_dict["provider"] = "microsoft" + oauth_dict["encoded_oauth_public_key"] = "publickey" + oauth_dict["encoded_oauth_private_key"] = "privatekey" assert SnowflakeV2Config.parse_obj( { "account_id": "test", "authentication_type": "OAUTH_AUTHENTICATOR", - "oauth_config": microsoft_dict, + "oauth_config": oauth_dict, } ) +default_config_dict: Dict[str, Any] = { + "username": "user", + "password": "password", + "account_id": "https://acctname.snowflakecomputing.com", + "warehouse": "COMPUTE_WH", + "role": "sysadmin", +} + + def test_account_id_is_added_when_host_port_is_present(): - config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "host_port": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - ) + config_dict = default_config_dict.copy() + del config_dict["account_id"] + config_dict["host_port"] = "acctname" + config = SnowflakeV2Config.parse_obj(config_dict) assert config.account_id == "acctname" def test_account_id_with_snowflake_host_suffix(): - config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "account_id": "https://acctname.snowflakecomputing.com", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - ) + config = SnowflakeV2Config.parse_obj(default_config_dict) assert config.account_id == "acctname" def 
test_snowflake_uri_default_authentication(): - config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - ) - + config = SnowflakeV2Config.parse_obj(default_config_dict) assert config.get_sql_alchemy_url() == ( "snowflake://user:password@acctname" "?application=acryl_datahub" @@ -191,17 +163,10 @@ def test_snowflake_uri_default_authentication(): def test_snowflake_uri_external_browser_authentication(): - config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - "authentication_type": "EXTERNAL_BROWSER_AUTHENTICATOR", - } - ) - + config_dict = default_config_dict.copy() + del config_dict["password"] + config_dict["authentication_type"] = "EXTERNAL_BROWSER_AUTHENTICATOR" + config = SnowflakeV2Config.parse_obj(config_dict) assert config.get_sql_alchemy_url() == ( "snowflake://user@acctname" "?application=acryl_datahub" @@ -212,18 +177,12 @@ def test_snowflake_uri_external_browser_authentication(): def test_snowflake_uri_key_pair_authentication(): - config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - "authentication_type": "KEY_PAIR_AUTHENTICATOR", - "private_key_path": "/a/random/path", - "private_key_password": "a_random_password", - } - ) + config_dict = default_config_dict.copy() + del config_dict["password"] + config_dict["authentication_type"] = "KEY_PAIR_AUTHENTICATOR" + config_dict["private_key_path"] = "/a/random/path" + config_dict["private_key_password"] = "a_random_password" + config = SnowflakeV2Config.parse_obj(config_dict) assert config.get_sql_alchemy_url() == ( "snowflake://user@acctname" @@ -235,63 +194,35 @@ def test_snowflake_uri_key_pair_authentication(): def test_options_contain_connect_args(): - config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - ) + config = SnowflakeV2Config.parse_obj(default_config_dict) connect_args = config.get_options().get("connect_args") assert connect_args is not None def test_snowflake_config_with_view_lineage_no_table_lineage_throws_error(): - with pytest.raises(ValidationError): - SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - "include_view_lineage": True, - "include_table_lineage": False, - } - ) + config_dict = default_config_dict.copy() + config_dict["include_view_lineage"] = True + config_dict["include_table_lineage"] = False + with pytest.raises( + ValidationError, + match="include_table_lineage must be True for include_view_lineage to be set", + ): + SnowflakeV2Config.parse_obj(config_dict) def test_snowflake_config_with_column_lineage_no_table_lineage_throws_error(): - with pytest.raises(ValidationError): - SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - "include_column_lineage": True, - "include_table_lineage": False, - } - ) + 
config_dict = default_config_dict.copy() + config_dict["include_column_lineage"] = True + config_dict["include_table_lineage"] = False + with pytest.raises( + ValidationError, + match="include_table_lineage must be True for include_column_lineage to be set", + ): + SnowflakeV2Config.parse_obj(config_dict) def test_snowflake_config_with_no_connect_args_returns_base_connect_args(): - config: SnowflakeV2Config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - ) + config: SnowflakeV2Config = SnowflakeV2Config.parse_obj(default_config_dict) assert config.get_options()["connect_args"] is not None assert config.get_options()["connect_args"] == { CLIENT_PREFETCH_THREADS: 10, @@ -300,7 +231,10 @@ def test_snowflake_config_with_no_connect_args_returns_base_connect_args(): def test_private_key_set_but_auth_not_changed(): - with pytest.raises(ValidationError): + with pytest.raises( + ValidationError, + match="Either `private_key` and `private_key_path` is set but `authentication_type` is DEFAULT_AUTHENTICATOR. Should be set to 'KEY_PAIR_AUTHENTICATOR' when using key pair authentication", + ): SnowflakeV2Config.parse_obj( { "account_id": "acctname", @@ -310,19 +244,11 @@ def test_private_key_set_but_auth_not_changed(): def test_snowflake_config_with_connect_args_overrides_base_connect_args(): - config: SnowflakeV2Config = SnowflakeV2Config.parse_obj( - { - "username": "user", - "password": "password", - "account_id": "acctname", - "database_pattern": {"allow": {"^demo$"}}, - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - "connect_args": { - CLIENT_PREFETCH_THREADS: 5, - }, - } - ) + config_dict = default_config_dict.copy() + config_dict["connect_args"] = { + CLIENT_PREFETCH_THREADS: 5, + } + config: SnowflakeV2Config = SnowflakeV2Config.parse_obj(config_dict) assert config.get_options()["connect_args"] is not None assert config.get_options()["connect_args"][CLIENT_PREFETCH_THREADS] == 5 assert config.get_options()["connect_args"][CLIENT_SESSION_KEEP_ALIVE] is True @@ -331,35 +257,20 @@ def test_snowflake_config_with_connect_args_overrides_base_connect_args(): @patch("snowflake.connector.connect") def test_test_connection_failure(mock_connect): mock_connect.side_effect = Exception("Failed to connect to snowflake") - config = { - "username": "user", - "password": "password", - "account_id": "missing", - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - report = SnowflakeV2Source.test_connection(config) - assert report is not None - assert report.basic_connectivity - assert not report.basic_connectivity.capable - assert report.basic_connectivity.failure_reason - assert "Failed to connect to snowflake" in report.basic_connectivity.failure_reason + report = test_connection_helpers.run_test_connection( + SnowflakeV2Source, default_config_dict + ) + test_connection_helpers.assert_basic_connectivity_failure( + report, "Failed to connect to snowflake" + ) @patch("snowflake.connector.connect") def test_test_connection_basic_success(mock_connect): - config = { - "username": "user", - "password": "password", - "account_id": "missing", - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - report = SnowflakeV2Source.test_connection(config) - assert report is not None - assert report.basic_connectivity - assert report.basic_connectivity.capable - assert report.basic_connectivity.failure_reason is None + report = 
test_connection_helpers.run_test_connection( + SnowflakeV2Source, default_config_dict + ) + test_connection_helpers.assert_basic_connectivity_success(report) def setup_mock_connect(mock_connect, query_results=None): @@ -400,31 +311,18 @@ def query_results(query): return [] raise ValueError(f"Unexpected query: {query}") - config = { - "username": "user", - "password": "password", - "account_id": "missing", - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } setup_mock_connect(mock_connect, query_results) - report = SnowflakeV2Source.test_connection(config) - assert report is not None - assert report.basic_connectivity - assert report.basic_connectivity.capable - assert report.basic_connectivity.failure_reason is None - - assert report.capability_report - assert report.capability_report[SourceCapability.CONTAINERS].capable - assert not report.capability_report[SourceCapability.SCHEMA_METADATA].capable - failure_reason = report.capability_report[ - SourceCapability.SCHEMA_METADATA - ].failure_reason - assert failure_reason - - assert ( - "Current role TEST_ROLE does not have permissions to use warehouse" - in failure_reason + report = test_connection_helpers.run_test_connection( + SnowflakeV2Source, default_config_dict + ) + test_connection_helpers.assert_basic_connectivity_success(report) + + test_connection_helpers.assert_capability_report( + capability_report=report.capability_report, + success_capabilities=[SourceCapability.CONTAINERS], + failure_capabilities={ + SourceCapability.SCHEMA_METADATA: "Current role TEST_ROLE does not have permissions to use warehouse" + }, ) @@ -445,25 +343,17 @@ def query_results(query): setup_mock_connect(mock_connect, query_results) - config = { - "username": "user", - "password": "password", - "account_id": "missing", - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - report = SnowflakeV2Source.test_connection(config) - assert report is not None - assert report.basic_connectivity - assert report.basic_connectivity.capable - assert report.basic_connectivity.failure_reason is None - assert report.capability_report - - assert report.capability_report[SourceCapability.CONTAINERS].capable - assert not report.capability_report[SourceCapability.SCHEMA_METADATA].capable - assert ( - report.capability_report[SourceCapability.SCHEMA_METADATA].failure_reason - is not None + report = test_connection_helpers.run_test_connection( + SnowflakeV2Source, default_config_dict + ) + test_connection_helpers.assert_basic_connectivity_success(report) + + test_connection_helpers.assert_capability_report( + capability_report=report.capability_report, + success_capabilities=[SourceCapability.CONTAINERS], + failure_capabilities={ + SourceCapability.SCHEMA_METADATA: "Either no tables exist or current role does not have permissions to access them" + }, ) @@ -488,24 +378,19 @@ def query_results(query): setup_mock_connect(mock_connect, query_results) - config = { - "username": "user", - "password": "password", - "account_id": "missing", - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - report = SnowflakeV2Source.test_connection(config) - - assert report is not None - assert report.basic_connectivity - assert report.basic_connectivity.capable - assert report.basic_connectivity.failure_reason is None - assert report.capability_report - - assert report.capability_report[SourceCapability.CONTAINERS].capable - assert report.capability_report[SourceCapability.SCHEMA_METADATA].capable - assert report.capability_report[SourceCapability.DESCRIPTIONS].capable + report = 
test_connection_helpers.run_test_connection( + SnowflakeV2Source, default_config_dict + ) + test_connection_helpers.assert_basic_connectivity_success(report) + + test_connection_helpers.assert_capability_report( + capability_report=report.capability_report, + success_capabilities=[ + SourceCapability.CONTAINERS, + SourceCapability.SCHEMA_METADATA, + SourceCapability.DESCRIPTIONS, + ], + ) @patch("snowflake.connector.connect") @@ -538,25 +423,21 @@ def query_results(query): setup_mock_connect(mock_connect, query_results) - config = { - "username": "user", - "password": "password", - "account_id": "missing", - "warehouse": "COMPUTE_WH", - "role": "sysadmin", - } - report = SnowflakeV2Source.test_connection(config) - assert report is not None - assert report.basic_connectivity - assert report.basic_connectivity.capable - assert report.basic_connectivity.failure_reason is None - assert report.capability_report - - assert report.capability_report[SourceCapability.CONTAINERS].capable - assert report.capability_report[SourceCapability.SCHEMA_METADATA].capable - assert report.capability_report[SourceCapability.DATA_PROFILING].capable - assert report.capability_report[SourceCapability.DESCRIPTIONS].capable - assert report.capability_report[SourceCapability.LINEAGE_COARSE].capable + report = test_connection_helpers.run_test_connection( + SnowflakeV2Source, default_config_dict + ) + test_connection_helpers.assert_basic_connectivity_success(report) + + test_connection_helpers.assert_capability_report( + capability_report=report.capability_report, + success_capabilities=[ + SourceCapability.CONTAINERS, + SourceCapability.SCHEMA_METADATA, + SourceCapability.DATA_PROFILING, + SourceCapability.DESCRIPTIONS, + SourceCapability.LINEAGE_COARSE, + ], + ) def test_aws_cloud_region_from_snowflake_region_id(): @@ -610,11 +491,10 @@ def test_azure_cloud_region_from_snowflake_region_id(): def test_unknown_cloud_region_from_snowflake_region_id(): - with pytest.raises(Exception) as e: + with pytest.raises(Exception, match="Unknown snowflake region"): SnowflakeV2Source.get_cloud_region_from_snowflake_region_id( "somecloud_someregion" ) - assert "Unknown snowflake region" in str(e) def test_snowflake_object_access_entry_missing_object_id(): diff --git a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index e23d290b611f4c..a98bf641711220 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -1,8 +1,7 @@ from typing import Dict -from unittest.mock import Mock +from unittest import mock import pytest -from sqlalchemy.engine.reflection import Inspector from datahub.ingestion.source.sql.sql_common import PipelineContext, SQLAlchemySource from datahub.ingestion.source.sql.sql_config import SQLCommonConfig @@ -13,19 +12,24 @@ class _TestSQLAlchemyConfig(SQLCommonConfig): def get_sql_alchemy_url(self): - pass + return "mysql+pymysql://user:pass@localhost:5330" class _TestSQLAlchemySource(SQLAlchemySource): - pass + @classmethod + def create(cls, config_dict, ctx): + config = _TestSQLAlchemyConfig.parse_obj(config_dict) + return cls(config, ctx, "TEST") + + +def get_test_sql_alchemy_source(): + return _TestSQLAlchemySource.create( + config_dict={}, ctx=PipelineContext(run_id="test_ctx") + ) def test_generate_foreign_key(): - config: SQLCommonConfig = _TestSQLAlchemyConfig() - ctx: PipelineContext = PipelineContext(run_id="test_ctx") - platform: str = "TEST" - inspector: Inspector = Mock() - source = 
_TestSQLAlchemySource(config=config, ctx=ctx, platform=platform) + source = get_test_sql_alchemy_source() fk_dict: Dict[str, str] = { "name": "test_constraint", "referred_table": "test_table", @@ -37,7 +41,7 @@ def test_generate_foreign_key(): dataset_urn="test_urn", schema="test_schema", fk_dict=fk_dict, - inspector=inspector, + inspector=mock.Mock(), ) assert fk_dict.get("name") == foreign_key.name @@ -48,11 +52,7 @@ def test_generate_foreign_key(): def test_use_source_schema_for_foreign_key_if_not_specified(): - config: SQLCommonConfig = _TestSQLAlchemyConfig() - ctx: PipelineContext = PipelineContext(run_id="test_ctx") - platform: str = "TEST" - inspector: Inspector = Mock() - source = _TestSQLAlchemySource(config=config, ctx=ctx, platform=platform) + source = get_test_sql_alchemy_source() fk_dict: Dict[str, str] = { "name": "test_constraint", "referred_table": "test_table", @@ -63,7 +63,7 @@ def test_use_source_schema_for_foreign_key_if_not_specified(): dataset_urn="test_urn", schema="test_schema", fk_dict=fk_dict, - inspector=inspector, + inspector=mock.Mock(), ) assert fk_dict.get("name") == foreign_key.name @@ -105,14 +105,32 @@ def test_get_platform_from_sqlalchemy_uri(uri: str, expected_platform: str) -> N def test_get_db_schema_with_dots_in_view_name(): - config: SQLCommonConfig = _TestSQLAlchemyConfig() - ctx: PipelineContext = PipelineContext(run_id="test_ctx") - platform: str = "TEST" - source = _TestSQLAlchemySource(config=config, ctx=ctx, platform=platform) - + source = get_test_sql_alchemy_source() database, schema = source.get_db_schema( dataset_identifier="database.schema.long.view.name1" ) - assert database == "database" assert schema == "schema" + + +def test_test_connection_success(): + source = get_test_sql_alchemy_source() + with mock.patch( + "datahub.ingestion.source.sql.sql_common.SQLAlchemySource.get_inspectors", + side_effect=lambda: [], + ): + report = source.test_connection({}) + assert report is not None + assert report.basic_connectivity + assert report.basic_connectivity.capable + assert report.basic_connectivity.failure_reason is None + + +def test_test_connection_failure(): + source = get_test_sql_alchemy_source() + report = source.test_connection({}) + assert report is not None + assert report.basic_connectivity + assert not report.basic_connectivity.capable + assert report.basic_connectivity.failure_reason + assert "Connection refused" in report.basic_connectivity.failure_reason From 26114dfeb2d255f1b2a562396908f48c8dd0ad64 Mon Sep 17 00:00:00 2001 From: naoki kuroda <68233204+nnnkkk7@users.noreply.github.com> Date: Fri, 15 Dec 2023 05:42:45 +0900 Subject: [PATCH 15/17] docs: fix sample command for container logs (#9427) --- docs/how/extract-container-logs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/how/extract-container-logs.md b/docs/how/extract-container-logs.md index 9251d0665c02cf..b5fbb4c83cc645 100644 --- a/docs/how/extract-container-logs.md +++ b/docs/how/extract-container-logs.md @@ -86,7 +86,7 @@ Depending on your issue, you may be interested to view both debug and normal inf Since log files are named based on the current date, you'll need to use "ls" to see which files currently exist. To do so, you can use the `kubectl exec` command, using the pod name recorded in step one: ``` -kubectl exec datahub-frontend-1231ead-6767 -n default -- ls -la /tmp/datahub/logs/gms +kubectl exec datahub-gms-c578b47cd-7676 -n default -- ls -la /tmp/datahub/logs/gms total 36388 drwxr-xr-x 2 datahub datahub 4096 Jul 29 07:45 . 
@@ -131,5 +131,5 @@ Now you should be able to view the logs locally. There are a few ways to get files out of the pod and into a local file. You can either use `kubectl cp` or simply `cat` and pipe the file of interest. We'll show an example using the latter approach: ``` -kubectl exec datahub-frontend-1231ead-6767 -n default -- cat /tmp/datahub/logs/gms/gms.log > my-local-gms.log +kubectl exec datahub-gms-c578b47cd-7676 -n default -- cat /tmp/datahub/logs/gms/gms.log > my-local-gms.log ``` \ No newline at end of file From 4354af20126d1befb2c7391c23310a4eca5bb688 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 14 Dec 2023 16:54:40 -0500 Subject: [PATCH 16/17] fix(ingest): bump source configs json schema version (#9424) --- docs-website/genJsonSchema/gen_json_schema.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs-website/genJsonSchema/gen_json_schema.py b/docs-website/genJsonSchema/gen_json_schema.py index 81c1d5a2c1a30f..4af72487644bd6 100644 --- a/docs-website/genJsonSchema/gen_json_schema.py +++ b/docs-website/genJsonSchema/gen_json_schema.py @@ -7,7 +7,7 @@ def get_base() -> Any: return { - "$schema": "http://json-schema.org/draft-04/schema#", + "$schema": "https://json-schema.org/draft/2020-12/schema", "id": "https://json.schemastore.org/datahub-ingestion", "title": "Datahub Ingestion", "description": "Root schema of Datahub Ingestion", @@ -116,7 +116,7 @@ def get_base() -> Any: "bootstrap": { "type": "string", "description": "Kafka bootstrap URL.", - "default": "localhost:9092" + "default": "localhost:9092", }, "producer_config": { "type": "object", @@ -125,7 +125,7 @@ def get_base() -> Any: "schema_registry_url": { "type": "string", "description": "URL of schema registry being used.", - "default": "http://localhost:8081" + "default": "http://localhost:8081", }, "schema_registry_config": { "type": "object", From 0ea6145a9d491a1b882ba5a7a4667fb323d31dc4 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Fri, 15 Dec 2023 00:12:45 +0100 Subject: [PATCH 17/17] fix(ingest/profiling): Add option to enable external table profiling (#9463) --- .../datahub/ingestion/source/ge_profiling_config.py | 5 +++++ .../src/datahub/ingestion/source/redshift/profile.py | 9 +++++++++ .../ingestion/source/snowflake/snowflake_profiler.py | 10 ++++++++++ .../ingestion/source/snowflake/snowflake_schema.py | 3 +++ .../ingestion/source/sql/sql_generic_profiler.py | 3 +++ .../tests/integration/snowflake/common.py | 1 + 6 files changed, 31 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 24a3e520d8caff..f340a7b41b7af8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -167,6 +167,11 @@ class GEProfilingConfig(ConfigModel): "Applicable only if `use_sampling` is set to True.", ) + profile_external_tables: bool = Field( + default=False, + description="Whether to profile external tables. Only Snowflake and Redshift supports this.", + ) + @pydantic.root_validator(pre=True) def deprecate_bigquery_temp_table_schema(cls, values): # TODO: Update docs to remove mention of this field. 
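Editor's note: before the per-source hunks below, which consume the new flag, here is a minimal sketch of a recipe that opts in to external-table profiling. All connection fields are placeholders; only the `profiling` block reflects the `profile_external_tables` option added in the `ge_profiling_config.py` hunk above, and the skip logic it controls appears in the Redshift and Snowflake profiler hunks that follow.

```python
# Hypothetical recipe: account_id, username, and password are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "account_id": "acctname",  # placeholder
                "username": "user",  # placeholder
                "password": "password",  # placeholder
                "profiling": {
                    "enabled": True,
                    # Added in this patch; defaults to False, so external
                    # tables keep being skipped unless explicitly opted in.
                    "profile_external_tables": True,
                },
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()
```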
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py index b05850cef6e948..eed82ec4d83e76 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py @@ -48,6 +48,15 @@ def get_workunits( if not self.config.schema_pattern.allowed(schema): continue for table in tables[db].get(schema, {}): + if ( + not self.config.profiling.profile_external_tables + and table.type == "EXTERNAL_TABLE" + ): + self.report.profiling_skipped_other[schema] += 1 + logger.info( + f"Skipping profiling of external table {db}.{schema}.{table.name}" + ) + continue # Emit the profile work unit profile_request = self.get_profile_request(table, schema, db) if profile_request is not None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 89857c45642678..4bda7da422e9d6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -50,6 +50,16 @@ def get_workunits( profile_requests = [] for schema in database.schemas: for table in db_tables[schema.name]: + if ( + not self.config.profiling.profile_external_tables + and table.type == "EXTERNAL TABLE" + ): + logger.info( + f"Skipping profiling of external table {database.name}.{schema.name}.{table.name}" + ) + self.report.profiling_skipped_other[schema.name] += 1 + continue + profile_request = self.get_profile_request( table, schema.name, database.name ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index e5b214ba35e4b6..9526bdec4b05dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -77,6 +77,7 @@ def get_precise_native_type(self): @dataclass class SnowflakeTable(BaseTable): + type: Optional[str] = None clustering_key: Optional[str] = None pk: Optional[SnowflakePK] = None columns: List[SnowflakeColumn] = field(default_factory=list) @@ -265,6 +266,7 @@ def get_tables_for_database( tables[table["TABLE_SCHEMA"]].append( SnowflakeTable( name=table["TABLE_NAME"], + type=table["TABLE_TYPE"], created=table["CREATED"], last_altered=table["LAST_ALTERED"], size_in_bytes=table["BYTES"], @@ -288,6 +290,7 @@ def get_tables_for_schema( tables.append( SnowflakeTable( name=table["TABLE_NAME"], + type=table["TABLE_TYPE"], created=table["CREATED"], last_altered=table["LAST_ALTERED"], size_in_bytes=table["BYTES"], diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index a2f91e5fae1a98..30fad9ad584c12 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -35,6 +35,9 @@ class DetailedProfilerReportMixin: profiling_skipped_row_limit: TopKDict[str, int] = field( default_factory=int_top_k_dict ) + + profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict) + num_tables_not_eligible_profiling: Dict[str, int] = field( default_factory=int_top_k_dict ) diff --git 
a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index b21cea5f0988d0..53b87636068bfe 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -79,6 +79,7 @@ def default_query_results( # noqa: C901 { "TABLE_SCHEMA": "TEST_SCHEMA", "TABLE_NAME": "TABLE_{}".format(tbl_idx), + "TABLE_TYPE": "BASE TABLE", "CREATED": datetime(2021, 6, 8, 0, 0, 0, 0), "LAST_ALTERED": datetime(2021, 6, 8, 0, 0, 0, 0), "BYTES": 1024,
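Editor's note: taken together, these patches converge on one pattern for connection testing: a `TestableSource.test_connection` hook that returns a `TestConnectionReport`, exercised through the new `tests/test_helpers/test_connection_helpers.py` module. As a closing illustration, here is a hedged sketch of how a hypothetical source would adopt the pattern. `MyCustomSource` and its `endpoint` check are invented for illustration; the report types and helper functions are exactly the ones defined in the diffs above.

```python
from datahub.ingestion.api.source import (
    CapabilityReport,
    TestableSource,
    TestConnectionReport,
)
from tests.test_helpers import test_connection_helpers


class MyCustomSource(TestableSource):  # hypothetical source, for illustration only
    @staticmethod
    def test_connection(config_dict: dict) -> TestConnectionReport:
        test_report = TestConnectionReport()
        try:
            # Placeholder probe: a real source would parse config_dict
            # (e.g. via parse_obj_allow_extras, as the sources above do)
            # and open a client connection here.
            if not config_dict.get("endpoint"):
                raise ValueError("endpoint is required")
            test_report.basic_connectivity = CapabilityReport(capable=True)
        except Exception as e:
            test_report.basic_connectivity = CapabilityReport(
                capable=False, failure_reason=str(e)
            )
        return test_report


def test_my_custom_source_connection():
    # Success path: the probe sees an endpoint and reports capable=True.
    report = test_connection_helpers.run_test_connection(
        MyCustomSource, {"endpoint": "http://localhost:1234"}
    )
    test_connection_helpers.assert_basic_connectivity_success(report)

    # Failure path: the raised error surfaces as the failure_reason.
    report = test_connection_helpers.run_test_connection(MyCustomSource, {})
    test_connection_helpers.assert_basic_connectivity_failure(
        report, "endpoint is required"
    )
```

Because `run_test_connection` only invokes the staticmethod on the class, the source is never instantiated, which is what lets the snowflake, kafka, and sql_common tests above validate connectivity without standing up a full pipeline.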