From fdbcb684acca9d579ac9c72bb2e4f454d6b03b2b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 26 Jul 2024 14:02:25 -0700 Subject: [PATCH 1/3] feat(ece): support custom ownership type urns in ECE generation (#10999) --- .../data/entity/OwnerChangeEvent.java | 20 +++++++++++--- .../OwnershipChangeEventGenerator.java | 5 ++++ .../EntityChangeEventGeneratorHookTest.java | 26 +++++++++++++++++-- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/entity/OwnerChangeEvent.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/entity/OwnerChangeEvent.java index fc4f0327b77042..a14b73d8f94b0b 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/entity/OwnerChangeEvent.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/entity/OwnerChangeEvent.java @@ -27,17 +27,29 @@ public OwnerChangeEvent( SemanticChangeType semVerChange, String description, Urn ownerUrn, - OwnershipType ownerType) { + OwnershipType ownerType, + Urn ownerTypeUrn) { super( entityUrn, category, operation, modifier, - ImmutableMap.of( - "ownerUrn", ownerUrn.toString(), - "ownerType", ownerType.toString()), + buildParameters(ownerUrn, ownerType, ownerTypeUrn), auditStamp, semVerChange, description); } + + private static ImmutableMap buildParameters( + Urn ownerUrn, OwnershipType ownerType, Urn ownerTypeUrn) { + ImmutableMap.Builder builder = + new ImmutableMap.Builder() + .put("ownerUrn", ownerUrn.toString()) + .put("ownerType", ownerType.toString()); + if (ownerTypeUrn != null) { + builder.put("ownerTypeUrn", ownerTypeUrn.toString()); + } + + return builder.build(); + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/OwnershipChangeEventGenerator.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/OwnershipChangeEventGenerator.java index b32958508cf240..1ef5d0f20da5a9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/OwnershipChangeEventGenerator.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/OwnershipChangeEventGenerator.java @@ -62,6 +62,7 @@ private static List computeDiffs( entityUrn)) .ownerUrn(targetOwner.getOwner()) .ownerType(targetOwner.getType()) + .ownerTypeUrn(targetOwner.getTypeUrn()) .auditStamp(auditStamp) .build()); } @@ -84,6 +85,7 @@ private static List computeDiffs( entityUrn)) .ownerUrn(baseOwner.getOwner()) .ownerType(baseOwner.getType()) + .ownerTypeUrn(baseOwner.getTypeUrn()) .auditStamp(auditStamp) .build()); ++baseOwnerIdx; @@ -104,6 +106,7 @@ private static List computeDiffs( entityUrn)) .ownerUrn(targetOwner.getOwner()) .ownerType(targetOwner.getType()) + .ownerTypeUrn(targetOwner.getTypeUrn()) .auditStamp(auditStamp) .build()); ++targetOwnerIdx; @@ -128,6 +131,7 @@ private static List computeDiffs( entityUrn)) .ownerUrn(baseOwner.getOwner()) .ownerType(baseOwner.getType()) + .ownerTypeUrn(baseOwner.getTypeUrn()) .auditStamp(auditStamp) .build()); ++baseOwnerIdx; @@ -150,6 +154,7 @@ private static List computeDiffs( entityUrn)) .ownerUrn(targetOwner.getOwner()) .ownerType(targetOwner.getType()) + .ownerTypeUrn(targetOwner.getTypeUrn()) .auditStamp(auditStamp) .build()); ++targetOwnerIdx; diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java index b06b7df1846bde..3897d663c96818 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java @@ -309,11 +309,16 @@ public void testInvokeEntityOwnerChange() throws Exception { final Ownership newOwners = new Ownership(); final Urn ownerUrn1 = Urn.createFromString("urn:li:corpuser:test1"); final Urn ownerUrn2 = Urn.createFromString("urn:li:corpuser:test2"); + final Urn ownerUrn3 = Urn.createFromString("urn:li:corpuser:test3"); newOwners.setOwners( new OwnerArray( ImmutableList.of( new Owner().setOwner(ownerUrn1).setType(OwnershipType.TECHNICAL_OWNER), - new Owner().setOwner(ownerUrn2).setType(OwnershipType.BUSINESS_OWNER)))); + new Owner().setOwner(ownerUrn2).setType(OwnershipType.BUSINESS_OWNER), + new Owner() + .setOwner(ownerUrn3) + .setType(OwnershipType.CUSTOM) + .setTypeUrn(Urn.createFromString("urn:li:ownershipType:my_custom_type"))))); final Ownership prevOwners = new Ownership(); prevOwners.setOwners(new OwnerArray()); event.setAspect(GenericRecordUtils.serializeAspect(newOwners)); @@ -354,7 +359,24 @@ public void testInvokeEntityOwnerChange() throws Exception { "ownerType", OwnershipType.BUSINESS_OWNER.toString()), actorUrn); - verifyProducePlatformEvent(_mockClient, platformEvent2, true); + verifyProducePlatformEvent(_mockClient, platformEvent2, false); + + PlatformEvent platformEvent3 = + createChangeEvent( + DATASET_ENTITY_NAME, + Urn.createFromString(TEST_DATASET_URN), + ChangeCategory.OWNER, + ChangeOperation.ADD, + ownerUrn3.toString(), + ImmutableMap.of( + "ownerUrn", + ownerUrn3.toString(), + "ownerType", + OwnershipType.CUSTOM.toString(), + "ownerTypeUrn", + "urn:li:ownershipType:my_custom_type"), + actorUrn); + verifyProducePlatformEvent(_mockClient, platformEvent3, true); } @Test From d85da39a868bf819dd81f78d051e1ab0e19ef7b0 Mon Sep 17 00:00:00 2001 From: amit-apptware <132869468+amit-apptware@users.noreply.github.com> Date: Sat, 27 Jul 2024 02:54:23 +0530 Subject: [PATCH 2/3] feat(assertion-v2): changed Validation tab to Quality and created new Governance tab (#10935) --- datahub-web-react/src/app/ProtectedRoutes.tsx | 17 +++- .../src/app/entity/dataset/DatasetEntity.tsx | 17 +++- .../profile/header/EntityHealth.tsx | 2 +- .../tabs/Dataset/Governance/GovernanceTab.tsx | 85 +++++++++++++++++++ .../TestResults.tsx | 0 .../TestResultsList.tsx | 0 .../TestResultsSummary.tsx | 0 .../{Validations => Governance}/testUtils.tsx | 0 .../Validations/DatasetAssertionsList.tsx | 4 +- .../Dataset/Validations/ValidationsTab.tsx | 20 +---- .../__tests__/useGetValidationsTab.test.ts | 24 +++--- .../src/app/shared/health/healthUtils.tsx | 2 +- datahub-web-react/src/conf/Global.ts | 8 ++ datahub-web-react/src/conf/utils.ts | 26 ++++++ .../cypress/e2e/mutations/dataset_health.js | 4 +- 15 files changed, 166 insertions(+), 43 deletions(-) create mode 100644 datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/GovernanceTab.tsx rename datahub-web-react/src/app/entity/shared/tabs/Dataset/{Validations => Governance}/TestResults.tsx (100%) rename datahub-web-react/src/app/entity/shared/tabs/Dataset/{Validations => Governance}/TestResultsList.tsx (100%) rename datahub-web-react/src/app/entity/shared/tabs/Dataset/{Validations => Governance}/TestResultsSummary.tsx (100%) rename datahub-web-react/src/app/entity/shared/tabs/Dataset/{Validations => Governance}/testUtils.tsx (100%) create mode 100644 datahub-web-react/src/conf/utils.ts diff --git a/datahub-web-react/src/app/ProtectedRoutes.tsx b/datahub-web-react/src/app/ProtectedRoutes.tsx index 0e4a1a260f5532..d975e6d4d99c2d 100644 --- a/datahub-web-react/src/app/ProtectedRoutes.tsx +++ b/datahub-web-react/src/app/ProtectedRoutes.tsx @@ -1,15 +1,26 @@ -import React from 'react'; -import { Switch, Route } from 'react-router-dom'; +import React, { useEffect } from 'react'; +import { Switch, Route, useLocation, useHistory } from 'react-router-dom'; import { Layout } from 'antd'; import { HomePage } from './home/HomePage'; import { SearchRoutes } from './SearchRoutes'; import EmbedRoutes from './EmbedRoutes'; -import { PageRoutes } from '../conf/Global'; +import { NEW_ROUTE_MAP, PageRoutes } from '../conf/Global'; +import { getRedirectUrl } from '../conf/utils'; /** * Container for all views behind an authentication wall. */ export const ProtectedRoutes = (): JSX.Element => { + const location = useLocation(); + const history = useHistory(); + + useEffect(() => { + if (location.pathname.indexOf('/Validation') !== -1) { + history.replace(getRedirectUrl(NEW_ROUTE_MAP)); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [location]); + return ( diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index 6074bcc2f2f406..c30fee7abc0b6d 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -36,6 +36,7 @@ import AccessManagement from '../shared/tabs/Dataset/AccessManagement/AccessMana import { matchedFieldPathsRenderer } from '../../search/matches/matchedFieldPathsRenderer'; import { getLastUpdatedMs } from './shared/utils'; import { IncidentTab } from '../shared/tabs/Incident/IncidentTab'; +import { GovernanceTab } from '../shared/tabs/Dataset/Governance/GovernanceTab'; const SUBTYPES = { VIEW: 'view', @@ -166,14 +167,22 @@ export class DatasetEntity implements Entity { }, }, { - name: 'Validation', + name: 'Quality', component: ValidationsTab, display: { visible: (_, _1) => true, enabled: (_, dataset: GetDatasetQuery) => { - return ( - (dataset?.dataset?.assertions?.total || 0) > 0 || dataset?.dataset?.testResults !== null - ); + return (dataset?.dataset?.assertions?.total || 0) > 0; + }, + }, + }, + { + name: 'Governance', + component: GovernanceTab, + display: { + visible: (_, _1) => true, + enabled: (_, dataset: GetDatasetQuery) => { + return dataset?.dataset?.testResults !== null; }, }, }, diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx index 30713afa888b84..ad14d92a3915ae 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx @@ -24,7 +24,7 @@ export const EntityHealth = ({ health, baseUrl, fontSize, tooltipPlacement }: Pr return ( <> {(unhealthy && ( - + {icon} diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/GovernanceTab.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/GovernanceTab.tsx new file mode 100644 index 00000000000000..213716ed501c5b --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/GovernanceTab.tsx @@ -0,0 +1,85 @@ +import React, { useEffect } from 'react'; +import { Button } from 'antd'; +import { useHistory, useLocation } from 'react-router'; +import styled from 'styled-components'; +import { FileDoneOutlined } from '@ant-design/icons'; +import { useEntityData } from '../../../EntityContext'; +import { TestResults } from './TestResults'; +import TabToolbar from '../../../components/styled/TabToolbar'; +import { ANTD_GRAY } from '../../../constants'; +import { useGetValidationsTab } from '../Validations/useGetValidationsTab'; + +const TabTitle = styled.span` + margin-left: 4px; +`; + +const TabButton = styled(Button)<{ selected: boolean }>` + background-color: ${(props) => (props.selected && ANTD_GRAY[3]) || 'none'}; + margin-left: 4px; +`; + +enum TabPaths { + TESTS = 'Tests', +} + +const DEFAULT_TAB = TabPaths.TESTS; + +/** + * Component used for rendering the Entity Governance Tab. + */ +export const GovernanceTab = () => { + const { entityData } = useEntityData(); + const history = useHistory(); + const { pathname } = useLocation(); + + const passingTests = (entityData as any)?.testResults?.passing || []; + const maybeFailingTests = (entityData as any)?.testResults?.failing || []; + const totalTests = maybeFailingTests.length + passingTests.length; + + const { selectedTab, basePath } = useGetValidationsTab(pathname, Object.values(TabPaths)); + + // If no tab was selected, select a default tab. + useEffect(() => { + if (!selectedTab) { + // Route to the default tab. + history.replace(`${basePath}/${DEFAULT_TAB}`); + } + }, [selectedTab, basePath, history]); + + /** + * The top-level Toolbar tabs to display. + */ + const tabs = [ + { + title: ( + <> + + Tests ({totalTests}) + + ), + path: TabPaths.TESTS, + disabled: totalTests === 0, + content: , + }, + ]; + + return ( + <> + +
+ {tabs.map((tab) => ( + history.replace(`${basePath}/${tab.path}`)} + > + {tab.title} + + ))} +
+
+ {tabs.filter((tab) => tab.path === selectedTab).map((tab) => tab.content)} + + ); +}; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/TestResults.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/TestResults.tsx similarity index 100% rename from datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/TestResults.tsx rename to datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/TestResults.tsx diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/TestResultsList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/TestResultsList.tsx similarity index 100% rename from datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/TestResultsList.tsx rename to datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/TestResultsList.tsx diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/TestResultsSummary.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/TestResultsSummary.tsx similarity index 100% rename from datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/TestResultsSummary.tsx rename to datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/TestResultsSummary.tsx diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/testUtils.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/testUtils.tsx similarity index 100% rename from datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/testUtils.tsx rename to datahub-web-react/src/app/entity/shared/tabs/Dataset/Governance/testUtils.tsx diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx index 4d0e475d5dad14..bfcb30b6c5e7ac 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx @@ -188,7 +188,7 @@ export const DatasetAssertionsList = ({ to={`${entityRegistry.getEntityUrl( EntityType.Dataset, entityData.urn, - )}/Validation/Data Contract`} + )}/Quality/Data Contract`} style={{ color: REDESIGN_COLORS.BLUE }} > view @@ -200,7 +200,7 @@ export const DatasetAssertionsList = ({ to={`${entityRegistry.getEntityUrl( EntityType.Dataset, entityData.urn, - )}/Validation/Data Contract`} + )}/Quality/Data Contract`} > diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/ValidationsTab.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/ValidationsTab.tsx index 92af9bfc2b567b..006823db53fd44 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/ValidationsTab.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/ValidationsTab.tsx @@ -2,9 +2,8 @@ import React, { useEffect } from 'react'; import { Button } from 'antd'; import { useHistory, useLocation } from 'react-router'; import styled from 'styled-components'; -import { AuditOutlined, FileDoneOutlined, FileProtectOutlined } from '@ant-design/icons'; +import { AuditOutlined, FileProtectOutlined } from '@ant-design/icons'; import { useEntityData } from '../../../EntityContext'; -import { TestResults } from './TestResults'; import { Assertions } from './Assertions'; import TabToolbar from '../../../components/styled/TabToolbar'; import { useGetValidationsTab } from './useGetValidationsTab'; @@ -22,8 +21,7 @@ const TabButton = styled(Button)<{ selected: boolean }>` `; enum TabPaths { - ASSERTIONS = 'Assertions', - TESTS = 'Tests', + ASSERTIONS = 'List', DATA_CONTRACT = 'Data Contract', } @@ -39,9 +37,6 @@ export const ValidationsTab = () => { const appConfig = useAppConfig(); const totalAssertions = (entityData as any)?.assertions?.total; - const passingTests = (entityData as any)?.testResults?.passing || []; - const maybeFailingTests = (entityData as any)?.testResults?.failing || []; - const totalTests = maybeFailingTests.length + passingTests.length; const { selectedTab, basePath } = useGetValidationsTab(pathname, Object.values(TabPaths)); @@ -68,17 +63,6 @@ export const ValidationsTab = () => { disabled: totalAssertions === 0, content: , }, - { - title: ( - <> - - Tests ({totalTests}) - - ), - path: TabPaths.TESTS, - disabled: totalTests === 0, - content: , - }, ]; if (appConfig.config.featureFlags?.dataContractsEnabled) { diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/__tests__/useGetValidationsTab.test.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/__tests__/useGetValidationsTab.test.ts index 52689a225eae15..f65c337215ed90 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/__tests__/useGetValidationsTab.test.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/__tests__/useGetValidationsTab.test.ts @@ -2,37 +2,37 @@ import { useGetValidationsTab } from '../useGetValidationsTab'; describe('useGetValidationsTab', () => { it('should correctly extract valid tab', () => { - const pathname = '/dataset/urn:li:abc/Validation/Assertions'; - const tabNames = ['Assertions']; + const pathname = '/dataset/urn:li:abc/Quality/List'; + const tabNames = ['List']; const res = useGetValidationsTab(pathname, tabNames); - expect(res.selectedTab).toEqual('Assertions'); - expect(res.basePath).toEqual('/dataset/urn:li:abc/Validation'); + expect(res.selectedTab).toEqual('List'); + expect(res.basePath).toEqual('/dataset/urn:li:abc/Quality'); }); it('should extract undefined for invalid tab', () => { - const pathname = '/dataset/urn:li:abc/Validation/Assertions'; + const pathname = '/dataset/urn:li:abc/Quality/Assertions'; const tabNames = ['Tests']; const res = useGetValidationsTab(pathname, tabNames); expect(res.selectedTab).toBeUndefined(); - expect(res.basePath).toEqual('/dataset/urn:li:abc/Validation'); + expect(res.basePath).toEqual('/dataset/urn:li:abc/Quality'); }); it('should extract undefined for missing tab', () => { - const pathname = '/dataset/urn:li:abc/Validation'; + const pathname = '/dataset/urn:li:abc/Quality'; const tabNames = ['Tests']; const res = useGetValidationsTab(pathname, tabNames); expect(res.selectedTab).toBeUndefined(); - expect(res.basePath).toEqual('/dataset/urn:li:abc/Validation'); + expect(res.basePath).toEqual('/dataset/urn:li:abc/Quality'); }); it('should handle trailing slashes', () => { - let pathname = '/dataset/urn:li:abc/Validation/Assertions/'; + let pathname = '/dataset/urn:li:abc/Quality/Assertions/'; let tabNames = ['Assertions']; let res = useGetValidationsTab(pathname, tabNames); expect(res.selectedTab).toEqual('Assertions'); - expect(res.basePath).toEqual('/dataset/urn:li:abc/Validation'); + expect(res.basePath).toEqual('/dataset/urn:li:abc/Quality'); - pathname = '/dataset/urn:li:abc/Validation/'; + pathname = '/dataset/urn:li:abc/Quality/'; tabNames = ['Assertions']; res = useGetValidationsTab(pathname, tabNames); expect(res.selectedTab).toBeUndefined(); - expect(res.basePath).toEqual('/dataset/urn:li:abc/Validation'); + expect(res.basePath).toEqual('/dataset/urn:li:abc/Quality'); }); }); diff --git a/datahub-web-react/src/app/shared/health/healthUtils.tsx b/datahub-web-react/src/app/shared/health/healthUtils.tsx index a3745da26ceea6..426e5d986ca55c 100644 --- a/datahub-web-react/src/app/shared/health/healthUtils.tsx +++ b/datahub-web-react/src/app/shared/health/healthUtils.tsx @@ -145,7 +145,7 @@ export const getHealthIcon = (type: HealthStatusType, status: HealthStatus, font export const getHealthRedirectPath = (type: HealthStatusType) => { switch (type) { case HealthStatusType.Assertions: { - return 'Validation/Assertions'; + return 'Quality/List'; } case HealthStatusType.Incidents: { return 'Incidents'; diff --git a/datahub-web-react/src/conf/Global.ts b/datahub-web-react/src/conf/Global.ts index 433b0b6416e780..e0172d3f07e41e 100644 --- a/datahub-web-react/src/conf/Global.ts +++ b/datahub-web-react/src/conf/Global.ts @@ -41,3 +41,11 @@ export const CLIENT_AUTH_COOKIE = 'actor'; * Name of the unique browser id cookie generated on client side */ export const BROWSER_ID_COOKIE = 'bid'; + +/** New Routes Map for redirection */ +export const NEW_ROUTE_MAP = { + '/Validation/Assertions': '/Quality/List', + '/Validation/Tests': '/Governance/Tests', + '/Validation/Data%20Contract': '/Quality/Data%20Contract', + '/Validation': '/Quality', +}; diff --git a/datahub-web-react/src/conf/utils.ts b/datahub-web-react/src/conf/utils.ts new file mode 100644 index 00000000000000..7adb82cf71be52 --- /dev/null +++ b/datahub-web-react/src/conf/utils.ts @@ -0,0 +1,26 @@ +/** + * + * as per the new route object + * We are redirecting older routes to new one + * e.g. + * { + '/Validation/Assertions': '/Quality/List', + } + * */ + +export const getRedirectUrl = (newRoutes: { [key: string]: string }) => { + let newPathname = `${window.location.pathname}${window.location.search}`; + if (!newRoutes) { + return newPathname; + } + + // eslint-disable-next-line no-restricted-syntax + for (const path of Object.keys(newRoutes)) { + if (newPathname.indexOf(path) !== -1) { + newPathname = newPathname.replace(path, newRoutes[path]); + break; + } + } + + return `${newPathname}${window.location.search}`; +}; diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_health.js b/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_health.js index 072574e0f57aad..62ffd8a69d1c84 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_health.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_health.js @@ -7,8 +7,8 @@ describe("dataset health test", () => { cy.login(); cy.goToDataset(urn, datasetName); // Ensure that the “Health” badge is present and there is an active incident warning - cy.get(`[href="/dataset/${urn}/Validation"]`).should("be.visible"); - cy.get(`[href="/dataset/${urn}/Validation"] span`).trigger("mouseover", { + cy.get(`[href="/dataset/${urn}/Quality"]`).should("be.visible"); + cy.get(`[href="/dataset/${urn}/Quality"] span`).trigger("mouseover", { force: true, }); cy.waitTextVisible("This asset may be unhealthy"); From a09575fb6f00c9a3f372ef5938a6ac971b877f12 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:04:07 +0530 Subject: [PATCH 3/3] fix(ingestion/glue): Add support for missing config options for profiling in Glue (#10858) --- docs/how/updating-datahub.md | 13 + .../src/datahub/ingestion/source/aws/glue.py | 71 +- .../ingestion/source/glue_profiling_config.py | 8 + .../tests/unit/glue/glue_mces_golden.json | 2448 ++++++++--------- .../unit/glue/glue_mces_golden_profiling.json | 289 ++ .../tests/unit/test_glue_source.py | 83 +- .../tests/unit/test_glue_source_stubs.py | 106 + 7 files changed, 1759 insertions(+), 1259 deletions(-) create mode 100644 metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 9618aff78075cf..2821b63e7d305a 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -55,6 +55,19 @@ New (optional fields `systemMetadata` and `headers`): "headers": {} } ``` +- #10858 Profiling configuration for Glue source has been updated. + +Previously, the configuration was: +```yaml +profiling: {} +``` + +Now, it needs to be: + +```yaml +profiling: + enabled: true +``` ### Potential Downtime diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index f7e1e2610e7e28..58fb1cb1eb2c28 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -167,8 +167,8 @@ class GlueSourceConfig( default=False, description="If an S3 Objects Tags should be created for the Tables ingested by Glue.", ) - profiling: Optional[GlueProfilingConfig] = Field( - default=None, + profiling: GlueProfilingConfig = Field( + default_factory=GlueProfilingConfig, description="Configs to ingest data profiles from glue table", ) # Custom Stateful Ingestion settings @@ -186,7 +186,7 @@ class GlueSourceConfig( ) def is_profiling_enabled(self) -> bool: - return self.profiling is not None and is_profiling_enabled( + return self.profiling.enabled and is_profiling_enabled( self.profiling.operation_config ) @@ -867,34 +867,39 @@ def _create_profile_mcp( # instantiate column profile class for each column column_profile = DatasetFieldProfileClass(fieldPath=column_name) - if self.source_config.profiling.unique_count in column_params: - column_profile.uniqueCount = int( - float(column_params[self.source_config.profiling.unique_count]) - ) - if self.source_config.profiling.unique_proportion in column_params: - column_profile.uniqueProportion = float( - column_params[self.source_config.profiling.unique_proportion] - ) - if self.source_config.profiling.null_count in column_params: - column_profile.nullCount = int( - float(column_params[self.source_config.profiling.null_count]) - ) - if self.source_config.profiling.null_proportion in column_params: - column_profile.nullProportion = float( - column_params[self.source_config.profiling.null_proportion] - ) - if self.source_config.profiling.min in column_params: - column_profile.min = column_params[self.source_config.profiling.min] - if self.source_config.profiling.max in column_params: - column_profile.max = column_params[self.source_config.profiling.max] - if self.source_config.profiling.mean in column_params: - column_profile.mean = column_params[self.source_config.profiling.mean] - if self.source_config.profiling.median in column_params: - column_profile.median = column_params[ - self.source_config.profiling.median - ] - if self.source_config.profiling.stdev in column_params: - column_profile.stdev = column_params[self.source_config.profiling.stdev] + if not self.source_config.profiling.profile_table_level_only: + if self.source_config.profiling.unique_count in column_params: + column_profile.uniqueCount = int( + float(column_params[self.source_config.profiling.unique_count]) + ) + if self.source_config.profiling.unique_proportion in column_params: + column_profile.uniqueProportion = float( + column_params[self.source_config.profiling.unique_proportion] + ) + if self.source_config.profiling.null_count in column_params: + column_profile.nullCount = int( + float(column_params[self.source_config.profiling.null_count]) + ) + if self.source_config.profiling.null_proportion in column_params: + column_profile.nullProportion = float( + column_params[self.source_config.profiling.null_proportion] + ) + if self.source_config.profiling.min in column_params: + column_profile.min = column_params[self.source_config.profiling.min] + if self.source_config.profiling.max in column_params: + column_profile.max = column_params[self.source_config.profiling.max] + if self.source_config.profiling.mean in column_params: + column_profile.mean = column_params[ + self.source_config.profiling.mean + ] + if self.source_config.profiling.median in column_params: + column_profile.median = column_params[ + self.source_config.profiling.median + ] + if self.source_config.profiling.stdev in column_params: + column_profile.stdev = column_params[ + self.source_config.profiling.stdev + ] dataset_profile.fieldProfiles.append(column_profile) @@ -914,9 +919,7 @@ def _create_profile_mcp( def get_profile_if_enabled( self, mce: MetadataChangeEventClass, database_name: str, table_name: str ) -> Iterable[MetadataWorkUnit]: - # We don't need both checks only the second one - # but then lint believes that GlueProfilingConfig can be None - if self.source_config.profiling and self.source_config.is_profiling_enabled(): + if self.source_config.is_profiling_enabled(): # for cross-account ingestion kwargs = dict( DatabaseName=database_name, diff --git a/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py index c0001f0d37531d..115d194da59587 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py @@ -7,6 +7,14 @@ class GlueProfilingConfig(ConfigModel): + enabled: bool = Field( + default=False, + description="Whether profiling should be done.", + ) + profile_table_level_only: bool = Field( + default=False, + description="Whether to perform profiling at table-level only, or include column-level profiling as well.", + ) row_count: Optional[str] = Field( default=None, description="The parameter name for row count in glue table.", diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index c688eb9a2d1150..77e8354e657a66 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -1,1342 +1,1342 @@ [ -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "glue", - "env": "PROD", - "database": "flights-database", - "param1": "value1", - "param2": "value2", - "LocationUri": "s3://test-bucket/test-prefix", - "CreateTime": "June 09, 2021 at 14:14:19" - }, - "name": "flights-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database" + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "flights-database", + "param1": "value1", + "param2": "value2", + "LocationUri": "s3://test-bucket/test-prefix", + "CreateTime": "June 09, 2021 at 14:14:19" + }, + "name": "flights-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:glue" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Database" - ] + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "glue", - "env": "PROD", - "database": "test-database", - "CreateTime": "June 01, 2021 at 14:55:02" - }, - "name": "test-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/test-database" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "test-database", + "CreateTime": "June 01, 2021 at 14:55:02" + }, + "name": "test-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/test-database" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:glue" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Database" - ] + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "glue", - "env": "PROD", - "database": "empty-database", - "CreateTime": "June 01, 2021 at 14:55:13" - }, - "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "empty-database", + "CreateTime": "June 01, 2021 at 14:55:13" + }, + "name": "empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:glue" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Database" - ] + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "CrawlerSchemaDeserializerVersion": "1.0", - "CrawlerSchemaSerializerVersion": "1.0", - "UPDATED_BY_CRAWLER": "flights-crawler", - "averageRecordSize": "55", - "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", - "classification": "avro", - "compressionType": "none", - "objectCount": "30", - "recordCount": "169222196", - "sizeKey": "9503351413", - "typeOfData": "file", - "Location": "s3://crawler-public-us-west-2/flight/avro/", - "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", - "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", - "Compressed": "False", - "NumberOfBuckets": "-1", - "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", - "BucketColumns": "[]", - "SortColumns": "[]", - "StoredAsSubDirectories": "False" - }, - "name": "avro", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database/avro", - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "flights-database.avro", - "platform": "urn:li:dataPlatform:glue", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=int].yr", - "nullable": true, - "description": "test comment", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "avro", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database/avro", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "flights-database.avro", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" }, - { - "fieldPath": "[version=2.0].[type=string].flightdate", - "nullable": true, - "type": { + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, + "description": "test comment", "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].uniquecarrier", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].flightdate", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=int].airlineid", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].uniquecarrier", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].carrier", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].airlineid", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].flightnum", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].carrier", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].origin", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].flightnum", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].year", - "nullable": true, - "description": "partition test comment", - "type": { + { + "fieldPath": "[version=2.0].[type=string].origin", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "fieldPath": "[version=2.0].[type=string].year", + "nullable": true, + "description": "partition test comment", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] } - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - ] + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba" + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba" + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "CrawlerSchemaDeserializerVersion": "1.0", - "CrawlerSchemaSerializerVersion": "1.0", - "UPDATED_BY_CRAWLER": "test-jsons", - "averageRecordSize": "273", - "classification": "json", - "compressionType": "none", - "objectCount": "1", - "recordCount": "1", - "sizeKey": "273", - "typeOfData": "file", - "Location": "s3://test-glue-jsons/markers/", - "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", - "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", - "Compressed": "False", - "NumberOfBuckets": "-1", - "SerdeInfo": "{'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe', 'Parameters': {'paths': 'markers'}}", - "BucketColumns": "[]", - "SortColumns": "[]", - "StoredAsSubDirectories": "False" - }, - "name": "test_jsons_markers", - "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_jsons_markers", - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_jsons_markers", - "platform": "urn:li:dataPlatform:glue", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", - "nullable": true, - "type": { + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "test-jsons", + "averageRecordSize": "273", + "classification": "json", + "compressionType": "none", + "objectCount": "1", + "recordCount": "1", + "sizeKey": "273", + "typeOfData": "file", + "Location": "s3://test-glue-jsons/markers/", + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe', 'Parameters': {'paths': 'markers'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "test_jsons_markers", + "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_jsons_markers", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test-database.test_jsons_markers", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "record" - ] + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } } - } + }, + "nativeDataType": "array,location:array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" }, - "nativeDataType": "array,location:array>>", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "double" - ] + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "double" + ] + } } - } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" }, - "nativeDataType": "array", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"array\"}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "double" - ] + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "double" + ] + } } - } - }, - "nativeDataType": "array", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"array\"}" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] } - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - ] + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "CrawlerSchemaDeserializerVersion": "1.0", - "CrawlerSchemaSerializerVersion": "1.0", - "UPDATED_BY_CRAWLER": "test", - "averageRecordSize": "19", - "classification": "parquet", - "compressionType": "none", - "objectCount": "60", - "recordCount": "167497743", - "sizeKey": "4463574900", - "typeOfData": "file", - "Location": "s3://crawler-public-us-west-2/flight/parquet/", - "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", - "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", - "Compressed": "False", - "NumberOfBuckets": "-1", - "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe', 'Parameters': {'serialization.format': '1'}}", - "BucketColumns": "[]", - "SortColumns": "[]", - "StoredAsSubDirectories": "False" - }, - "name": "test_parquet", - "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_parquet", - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_parquet", - "platform": "urn:li:dataPlatform:glue", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=int].yr", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "test", + "averageRecordSize": "19", + "classification": "parquet", + "compressionType": "none", + "objectCount": "60", + "recordCount": "167497743", + "sizeKey": "4463574900", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/parquet/", + "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe', 'Parameters': {'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "test_parquet", + "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_parquet", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test-database.test_parquet", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" }, - { - "fieldPath": "[version=2.0].[type=int].quarter", - "nullable": true, - "type": { + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=int].month", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].quarter", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=int].dayofmonth", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].month", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].year", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].dayofmonth", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "fieldPath": "[version=2.0].[type=string].year", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] } - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - ] + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { - "urn": "urn:li:dataFlow:(glue,test-job-1,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { - "customProperties": { - "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", - "created": "2021-06-10 16:51:25.690000", - "modified": "2021-06-10 16:55:35.307000", - "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-1.py" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1", - "description": "The first test job" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(glue,test-job-1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": { + "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", + "created": "2021-06-10 16:51:25.690000", + "modified": "2021-06-10 16:55:35.307000", + "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-1.py" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1", + "description": "The first test job" + } } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { - "urn": "urn:li:dataFlow:(glue,test-job-2,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { - "customProperties": { - "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", - "created": "2021-06-10 16:58:32.469000", - "modified": "2021-06-10 16:58:32.469000", - "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-2.py" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2", - "description": "The second test job" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(glue,test-job-2,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": { + "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", + "created": "2021-06-10 16:58:32.469000", + "modified": "2021-06-10 16:58:32.469000", + "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-2.py" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2", + "description": "The second test job" + } } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "f": "lambda row : ()", - "transformation_ctx": "\"Transform0\"", - "transformType": "Filter", - "nodeId": "Transform0_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:Filter-Transform0_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "f": "lambda row : ()", + "transformation_ctx": "\"Transform0\"", + "transformType": "Filter", + "nodeId": "Transform0_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:Filter-Transform0_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", - "transformation_ctx": "\"Transform1\"", - "transformType": "ApplyMapping", - "nodeId": "Transform1_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform1_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform1\"", + "transformType": "ApplyMapping", + "nodeId": "Transform1_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform1_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", - "transformation_ctx": "\"Transform2\"", - "transformType": "ApplyMapping", - "nodeId": "Transform2_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform2_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform2\"", + "transformType": "ApplyMapping", + "nodeId": "Transform2_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform2_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)" - ], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "keys2": "[\"(right) flightdate\"]", - "transformation_ctx": "\"Transform3\"", - "keys1": "[\"yr\"]", - "transformType": "Join", - "nodeId": "Transform3_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:Join-Transform3_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "keys2": "[\"(right) flightdate\"]", + "transformation_ctx": "\"Transform3\"", + "keys1": "[\"yr\"]", + "transformType": "Join", + "nodeId": "Transform3_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:Join-Transform3_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", - "transformation_ctx": "\"Transform4\"", - "transformType": "ApplyMapping", - "nodeId": "Transform4_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform4_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform4\"", + "transformType": "ApplyMapping", + "nodeId": "Transform4_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform4_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"(right) yr\", \"int\"), (\"flightdate\", \"string\", \"(right) flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"(right) uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"(right) airlineid\", \"int\"), (\"carrier\", \"string\", \"(right) carrier\", \"string\"), (\"flightnum\", \"string\", \"(right) flightnum\", \"string\"), (\"origin\", \"string\", \"(right) origin\", \"string\"), (\"dest\", \"string\", \"(right) dest\", \"string\"), (\"depdelay\", \"int\", \"(right) depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"(right) carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"(right) weatherdelay\", \"int\"), (\"year\", \"string\", \"(right) year\", \"string\")]", - "transformation_ctx": "\"Transform5\"", - "transformType": "ApplyMapping", - "nodeId": "Transform5_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform5_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"(right) yr\", \"int\"), (\"flightdate\", \"string\", \"(right) flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"(right) uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"(right) airlineid\", \"int\"), (\"carrier\", \"string\", \"(right) carrier\", \"string\"), (\"flightnum\", \"string\", \"(right) flightnum\", \"string\"), (\"origin\", \"string\", \"(right) origin\", \"string\"), (\"dest\", \"string\", \"(right) dest\", \"string\"), (\"depdelay\", \"int\", \"(right) depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"(right) carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"(right) weatherdelay\", \"int\"), (\"year\", \"string\", \"(right) year\", \"string\")]", + "transformation_ctx": "\"Transform5\"", + "transformType": "ApplyMapping", + "nodeId": "Transform5_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform5_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "connection_type": "s3", - "format": "json", - "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", - "transformation_ctx": "DataSink1" - }, - "tags": [] + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "connection_type": "s3", + "format": "json", + "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", + "transformation_ctx": "DataSink1" + }, + "tags": [] + } } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "paths": "[\"yr\", \"quarter\", \"month\", \"dayofmonth\", \"dayofweek\", \"flightdate\", \"uniquecarrier\"]", - "name2": "\"Transform0Output1\"", - "name1": "\"Transform0Output0\"", - "transformation_ctx": "\"Transform0\"", - "transformType": "SplitFields", - "nodeId": "Transform0_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:SplitFields-Transform0_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "paths": "[\"yr\", \"quarter\", \"month\", \"dayofmonth\", \"dayofweek\", \"flightdate\", \"uniquecarrier\"]", + "name2": "\"Transform0Output1\"", + "name1": "\"Transform0Output0\"", + "transformation_ctx": "\"Transform0\"", + "transformType": "SplitFields", + "nodeId": "Transform0_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:SplitFields-Transform0_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"quarter\", \"int\", \"quarter\", \"int\"), (\"month\", \"int\", \"month\", \"int\"), (\"dayofmonth\", \"int\", \"dayofmonth\", \"int\"), (\"dayofweek\", \"int\", \"dayofweek\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\")]", - "transformation_ctx": "\"Transform1\"", - "transformType": "ApplyMapping", - "nodeId": "Transform1_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:ApplyMapping-Transform1_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"quarter\", \"int\", \"quarter\", \"int\"), (\"month\", \"int\", \"month\", \"int\"), (\"dayofmonth\", \"int\", \"dayofmonth\", \"int\"), (\"dayofweek\", \"int\", \"dayofweek\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\")]", + "transformation_ctx": "\"Transform1\"", + "transformType": "ApplyMapping", + "nodeId": "Transform1_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:ApplyMapping-Transform1_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)" - ], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "missing_values_column": "\"dayofmonth\"", - "transformation_ctx": "\"Transform2\"", - "transformType": "FillMissingValues", - "nodeId": "Transform2_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:FillMissingValues-Transform2_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "missing_values_column": "\"dayofmonth\"", + "transformation_ctx": "\"Transform2\"", + "transformType": "FillMissingValues", + "nodeId": "Transform2_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:FillMissingValues-Transform2_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "paths": "[]", - "transformation_ctx": "\"Transform3\"", - "transformType": "SelectFields", - "nodeId": "Transform3_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:SelectFields-Transform3_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "paths": "[]", + "transformation_ctx": "\"Transform3\"", + "transformType": "SelectFields", + "nodeId": "Transform3_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:SelectFields-Transform3_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)" - ], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "connection_type": "s3", - "format": "json", - "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", - "transformation_ctx": "DataSink0" - }, - "tags": [] + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "connection_type": "s3", + "format": "json", + "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", + "transformation_ctx": "DataSink0" + }, + "tags": [] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(glue,test-job-1,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(glue,test-job-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(glue,test-job-2,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(glue,test-job-2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:baz:bob", - "changeType": "UPSERT", - "aspectName": "tagKey", - "aspect": { - "json": { - "name": "baz:bob" + }, + { + "entityType": "tag", + "entityUrn": "urn:li:tag:baz:bob", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "baz:bob" + } } - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:foo:bar", - "changeType": "UPSERT", - "aspectName": "tagKey", - "aspect": { - "json": { - "name": "foo:bar" + }, + { + "entityType": "tag", + "entityUrn": "urn:li:tag:foo:bar", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "foo:bar" + } } } -} -] \ No newline at end of file + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json new file mode 100644 index 00000000000000..800d36a13d2dc2 --- /dev/null +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json @@ -0,0 +1,289 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "flights-database-profiling", + "param1": "value1", + "param2": "value2", + "LocationUri": "s3://test-bucket/test-prefix", + "CreateTime": "June 09, 2021 at 14:14:19" + }, + "name": "flights-database-profiling", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database-profiling" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "avro-profiling", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database-profiling/avro-profiling", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "flights-database-profiling.avro-profiling", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, + "description": "test comment", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightdate", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].uniquecarrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].airlineid", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].carrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightnum", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].origin", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "fieldProfiles": [ + { + "fieldPath": "yr", + "uniqueCount": 1, + "uniqueProportion": 2.0, + "nullCount": 0, + "nullProportion": 11.0, + "min": "1", + "max": "10", + "mean": "1", + "median": "2", + "stdev": "3" + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index b43db47ae00711..45b9899eacaa77 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -13,7 +13,11 @@ from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.ingestion.sink.file import write_metadata_file -from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig +from datahub.ingestion.source.aws.glue import ( + GlueProfilingConfig, + GlueSource, + GlueSourceConfig, +) from datahub.ingestion.source.state.sql_common_state import ( BaseSQLAlchemyCheckpointState, ) @@ -38,6 +42,7 @@ get_databases_delta_response, get_databases_response, get_databases_response_for_lineage, + get_databases_response_profiling, get_databases_response_with_resource_link, get_dataflow_graph_response_1, get_dataflow_graph_response_2, @@ -54,9 +59,11 @@ get_tables_response_1, get_tables_response_2, get_tables_response_for_target_database, + get_tables_response_profiling_1, resource_link_database, tables_1, tables_2, + tables_profiling_1, target_database_tables, ) @@ -93,6 +100,42 @@ def glue_source( ) +def glue_source_with_profiling( + platform_instance: Optional[str] = None, + use_s3_bucket_tags: bool = False, + use_s3_object_tags: bool = False, + extract_delta_schema_from_parameters: bool = False, +) -> GlueSource: + profiling_config = GlueProfilingConfig( + enabled=True, + profile_table_level_only=False, + row_count="row_count", + column_count="column_count", + unique_count="unique_count", + unique_proportion="unique_proportion", + null_count="null_count", + null_proportion="null_proportion", + min="min", + max="max", + mean="mean", + median="median", + stdev="stdev", + ) + + return GlueSource( + ctx=PipelineContext(run_id="glue-source-test"), + config=GlueSourceConfig( + aws_region="us-west-2", + extract_transforms=False, + platform_instance=platform_instance, + use_s3_bucket_tags=use_s3_bucket_tags, + use_s3_object_tags=use_s3_object_tags, + extract_delta_schema_from_parameters=extract_delta_schema_from_parameters, + profiling=profiling_config, + ), + ) + + column_type_test_cases: Dict[str, Tuple[str, Type]] = { "char": ("char", StringTypeClass), "array": ("array", ArrayTypeClass), @@ -641,3 +684,41 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: output_path=tmp_path / mce_file, golden_path=test_resources_dir / mce_golden_file, ) + + +@freeze_time(FROZEN_TIME) +def test_glue_ingest_with_profiling( + tmp_path: Path, + pytestconfig: PytestConfig, +) -> None: + glue_source_instance = glue_source_with_profiling() + mce_file = "glue_mces.json" + mce_golden_file = "glue_mces_golden_profiling.json" + with Stubber(glue_source_instance.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_response_profiling, {}) + + glue_stubber.add_response( + "get_tables", + get_tables_response_profiling_1, + {"DatabaseName": "flights-database-profiling"}, + ) + + glue_stubber.add_response( + "get_table", + {"Table": tables_profiling_1[0]}, + {"DatabaseName": "flights-database-profiling", "Name": "avro-profiling"}, + ) + + mce_objects = [wu.metadata for wu in glue_source_instance.get_workunits()] + + glue_stubber.assert_no_pending_responses() + + write_metadata_file(tmp_path / mce_file, mce_objects) + + # Verify the output. + test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / mce_file, + golden_path=test_resources_dir / mce_golden_file, + ) diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index f44a384a02c4ab..46ab65234c22df 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -973,6 +973,112 @@ get_tables_lineage_response_1 = {"TableList": tables_lineage_1} +get_databases_response_profiling = { + "DatabaseList": [ + { + "Name": "flights-database-profiling", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 14, 19), + "CreateTableDefaultPermissions": [ + { + "Principal": { + "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS" + }, + "Permissions": ["ALL"], + } + ], + "CatalogId": "123412341234", + "LocationUri": "s3://test-bucket/test-prefix", + "Parameters": {"param1": "value1", "param2": "value2"}, + }, + ] +} + +tables_profiling_1 = [ + { + "Name": "avro-profiling", + "DatabaseName": "flights-database-profiling", + "Owner": "owner", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "UpdateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "LastAccessTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "Retention": 0, + "StorageDescriptor": { + "Columns": [ + { + "Name": "yr", + "Type": "int", + "Comment": "test comment", + "Parameters": { + "unique_proportion": "2", + "min": "1", + "median": "2", + "max": "10", + "mean": "1", + "null_proportion": "11", + "unique_count": "1", + "stdev": "3", + "null_count": "0", + }, + }, + {"Name": "flightdate", "Type": "string"}, + {"Name": "uniquecarrier", "Type": "string"}, + {"Name": "airlineid", "Type": "int"}, + {"Name": "carrier", "Type": "string"}, + {"Name": "flightnum", "Type": "string"}, + {"Name": "origin", "Type": "string"}, + ], + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": False, + "NumberOfBuckets": -1, + "SerdeInfo": { + "SerializationLibrary": "org.apache.hadoop.hive.serde2.avro.AvroSerDe", + "Parameters": { + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "serialization.format": "1", + }, + }, + "BucketColumns": [], + "SortColumns": [], + "Parameters": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + }, + "StoredAsSubDirectories": False, + }, + "PartitionKeys": [], + "TableType": "EXTERNAL_TABLE", + "Parameters": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + }, + "CreatedBy": "arn:aws:sts::123412341234:assumed-role/AWSGlueServiceRole-flights-crawler/AWS-Crawler", + "IsRegisteredWithLakeFormation": False, + "CatalogId": "123412341234", + } +] +get_tables_response_profiling_1 = {"TableList": tables_profiling_1} + + def mock_get_object_response(raw_body: str) -> Dict[str, Any]: """ Mock s3 client get_object() response object.